In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score as CV

In [2]:
import warnings
warnings.filterwarnings("ignore")

## Preprocessing

In [3]:
train = pd.read_csv('train.csv', sep=',')
test = pd.read_csv('test.csv', sep=',')
le = LabelEncoder()

In [4]:
train.index = list(train['id'])
train.index.name = 'ID'
del train['id']

test.index = list(test['id'])
test.index.name = 'ID'
del test['id']

In [5]:
le.fit(test.gender)
test['gender'] = le.transform(test.gender)

In [6]:
le.fit(train.gender)
train['gender'] = le.transform(train.gender)

In [7]:
le.fit(test.smoking_status.fillna('never smoked'))
test['smoking_status'] = le.transform(test.smoking_status.fillna('never smoked'))

In [8]:
le.fit(train.smoking_status.fillna('never smoked'))
train['smoking_status'] = le.transform(train.smoking_status.fillna('never smoked'))

###### "ever_married",  "work_type" and	"Residence_type" are more inderect features for stroke prediction. So it's better to use features, that have more unambiguous influence on health.

In [9]:
features = ["gender", "age", "hypertension", "heart_disease", "avg_glucose_level", "bmi", "smoking_status"]

In [10]:
list(train.stroke).count(0)

29471

In [11]:
list(train.stroke).count(1)

530

too less strokes

In [12]:
train = train.sort_values('stroke', ascending = False).head(1000)

In [13]:
Test = test[features]
Train = train[features]
y = train['stroke']

In [14]:
norm = normalize(SimpleImputer().fit_transform(Train))
T_rain = pd.DataFrame(norm)
T_rain.columns = Test.columns
T_rain.index = Train.index

In [15]:
norm = normalize(SimpleImputer().fit_transform(Test))
T_est = pd.DataFrame(norm)
T_est.columns = Test.columns
T_est.index = Test.index

## LogReg

In [16]:
from sklearn.linear_model import LogisticRegression

#### solver test

In [17]:
def solver_test(solve, X, y):
    LogReg = LogisticRegression(solver = solve, class_weight = 'balanced')
    return CV(LogReg, X, y, cv = 10).mean() 

In [18]:
for s in ["newton-cg", "sag", "lbfgs", "liblinear", "saga"]:
    print(s,': ', solver_test(s, T_rain, y))
    print()

newton-cg :  0.7129999999999999

sag :  0.7129999999999999

lbfgs :  0.7129999999999999

liblinear :  0.7140000000000001

saga :  0.7129999999999999



#### C test

In [19]:
def C_test(c, X, y):
    LogReg = LogisticRegression(C = c, solver='liblinear', class_weight = 'balanced')
    return CV(LogReg, X, y, cv = 10).mean() 

In [20]:
for c in [1.0, 0.5, 3, 42, 1.5]:
    print(c,': ', C_test(c, T_rain, y))
    print()

1.0 :  0.7140000000000001

0.5 :  0.69

3 :  0.731

42 :  0.736

1.5 :  0.7210000000000001



#### random_state test (Used when solver == ‘sag’, ‘saga’ or ‘liblinear’)

In [21]:
def random_test(state, X, y):
    LogReg = LogisticRegression(C= 42, solver = "liblinear", random_state = state, class_weight = 'balanced')
    return CV(LogReg, X, y, cv = 10).mean() 

In [22]:
for r in [7, 69, 3, 42, 28]:
    print(r,': ', random_test(r, T_rain, y))
    print()

7 :  0.736

69 :  0.736

3 :  0.736

42 :  0.736

28 :  0.736



In [23]:
LogReg = LogReg = LogisticRegression(C= 42, solver = "liblinear", random_state = 69, class_weight = 'balanced').fit(T_rain,y)
result = pd.DataFrame(LogReg.predict(T_est))
result.columns = ['stroke']
result.index = T_est.index
result.to_csv('LogReg.csv', sep=',', 
                  index_label = 'Id', header = True)

##### kaggle score about 0.77534

## Decision tree

In [24]:
from sklearn.tree import DecisionTreeClassifier

#### max_depth test

In [25]:
def depth_test(depth, X, y):
    tree = DecisionTreeClassifier(max_depth = depth, class_weight = 'balanced')
    return CV(tree, X, y, cv = 10).mean() 

In [26]:
for d in [20, 50, 100, 200, 500, 800, 1000]:
    print(d,': ', depth_test(d, T_rain, y))
    print()

20 :  0.68

50 :  0.683

100 :  0.686

200 :  0.6849999999999999

500 :  0.683

800 :  0.6859999999999999

1000 :  0.688



#### min_samples_split test

In [27]:
def samples_test(min_split, X, y):
    tree = DecisionTreeClassifier(max_depth = 100, min_samples_split=min_split, class_weight = 'balanced')
    return CV(tree, X, y, cv = 10).mean() 

In [28]:
for ms in [5, 10, 25, 50, 80]:
    print(ms,': ', samples_test(ms, T_rain, y))
    print()

5 :  0.683

10 :  0.6799999999999999

25 :  0.692

50 :  0.7060000000000001

80 :  0.7209999999999999



#### max_leaf_nodes

In [29]:
def nodes_test(node, X, y):
    tree = DecisionTreeClassifier(max_depth = 100, min_samples_split=80, max_leaf_nodes = node, class_weight = 'balanced')
    return CV(tree, X, y, cv = 10).mean() 

In [30]:
for n in [20, 50, 100, 200, 500, 800, 1000]:
    print(n,': ', nodes_test(n, T_rain, y))
    print()

20 :  0.7230000000000001

50 :  0.7209999999999999

100 :  0.7209999999999999

200 :  0.7209999999999999

500 :  0.7209999999999999

800 :  0.7209999999999999

1000 :  0.7209999999999999



In [31]:
tree = DecisionTreeClassifier(max_depth = 100, min_samples_split=100, max_leaf_nodes = 20, class_weight = 'balanced').fit(T_rain,y)

In [32]:
result = pd.DataFrame(tree.predict(T_est))
result.columns = ['stroke']
result.index = T_est.index
result.to_csv('tree.csv', sep=',', 
                  index_label = 'Id', header = True)

In [33]:
list(result['stroke']).count(1)

4210

##### kaggle score 0.76418

## Random forest

In [34]:
from sklearn.ensemble import RandomForestClassifier as RF

#### n_estimators test

In [35]:
def estimate_test(est, X, y):
    tree = RF(n_estimators =est, class_weight = "balanced")
    return CV(tree, X, y, cv = 10).mean() 

In [36]:
for e in [20, 50, 100, 200, 500, 800, 1000]:
    print(e,': ', estimate_test(e, T_rain, y))
    print()

20 :  0.734

50 :  0.757

100 :  0.747

200 :  0.746

500 :  0.746

800 :  0.743

1000 :  0.7500000000000001



#### min_weight_fraction_leaf test

In [37]:
def fraction_test(weight, X, y):
    tree = RF(n_estimators = 800, min_weight_fraction_leaf = weight, class_weight = "balanced")
    return CV(tree, X, y, cv = 10).mean() 

In [38]:
for w in [0, 0.1, 0.2, 0.25, 0.5]:
    print(w,': ', fraction_test(w, T_rain, y))
    print()

0 :  0.749

0.1 :  0.7470000000000001

0.2 :  0.7300000000000001

0.25 :  0.719

0.5 :  0.518



#### min_impurity_split test

In [39]:
def impurity_test(split, X, y):
    tree = RF(n_estimators = 800, min_impurity_split = split, class_weight = "balanced")
    return CV(tree, X, y, cv = 10).mean() 

In [40]:
for i in [0, 0.1, 0.2, 0.25, 0.5]:
    print(i,': ', impurity_test(i, T_rain, y))
    print()

0 :  0.75

0.1 :  0.75

0.2 :  0.7529999999999999

0.25 :  0.754

0.5 :  0.49399999999999994



In [41]:
tree = RF(n_estimators = 800, min_impurity_split = 0.25, class_weight = "balanced").fit(T_rain,y)

In [42]:
result = pd.DataFrame(tree.predict(T_est))
result.columns = ['stroke']
result.index = T_est.index
result.to_csv('RF.csv', sep=',', 
                  index_label = 'Id', header = True)

In [43]:
list(result['stroke']).count(1)

4096

##### Kaggle score 0.77820

# Люблю леса