In [None]:
# Disabled one-off split of TRAIN_CSV into train/validation files. The snippet is
# wrapped in a bare string literal, so none of it executes.
# NOTE(review): if it is ever re-enabled, it uses `np` without importing numpy and
# it writes the training split back over TRAIN_CSV itself.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
import os

from settings.constants import TRAIN_CSV, VAL_CSV 

df = pd.read_csv(TRAIN_CSV, header = 0, dtype={'Age': np.float64})

train_df, val_df = train_test_split(
     df,
     test_size=0.2,
     stratify=df["Survived"],  
     random_state=42
)

train_df.to_csv(TRAIN_CSV, index=False)
val_df.to_csv(VAL_CSV, index=False)
"""

In [None]:
import numpy as np
import re as re
import pandas as pd

from settings.constants import TRAIN_CSV, VAL_CSV

# Read both splits with identical parser options; the 'Age' column is forced to float64.
train, val = (
    pd.read_csv(csv_path, header=0, dtype={'Age': np.float64})
    for csv_path in (TRAIN_CSV, VAL_CSV)
)
# Both frames gathered in one list.
full_data = [train, val]

# Preview the first rows of the training frame.
train.head()

In [None]:
# Show the column labels of the training frame.
train.columns

In [None]:
# Mean of 'Survived' for each distinct 'Pclass' value.
train.groupby('Pclass', as_index=False)['Survived'].mean()


In [None]:
# Mean of 'Survived' for each distinct 'Sex' value.
train.groupby('Sex', as_index=False)['Survived'].mean()


In [None]:
# FamilySize = SibSp + Parch + 1, then the mean of 'Survived' per family size.
train['FamilySize'] = train['SibSp'].add(train['Parch']).add(1)
train.groupby('FamilySize', as_index=False)['Survived'].mean()


In [None]:
# IsAlone is 1 where FamilySize equals 1 and 0 everywhere else.
train['IsAlone'] = train['FamilySize'].eq(1).astype('int64')
# Mean of 'Survived' for each value of the flag.
train.groupby('IsAlone', as_index=False)['Survived'].mean()


In [None]:
# Fill missing 'Embarked' entries with the column's most frequent value.
embarked_mode = train['Embarked'].mode().iloc[0]
train['Embarked'] = train['Embarked'].fillna(embarked_mode)
# Mean of 'Survived' for each 'Embarked' value.
train.groupby('Embarked', as_index=False)['Survived'].mean()

In [None]:
# Replace missing fares with the median fare.
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
# Cut 'Fare' into four quantile-based intervals.
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
# Mean of 'Survived' per fare interval. observed=False is passed explicitly: the
# grouping key is categorical, and newer pandas versions warn when the (changing)
# default is relied upon. False keeps the behaviour this cell has always had.
train[['CategoricalFare', 'Survived']].groupby(
    ['CategoricalFare'], as_index=False, observed=False
).mean()

In [None]:
# Impute missing ages with random integers drawn from [mean - std, mean + std),
# then bin the ages into five equal-width intervals.
age_avg = train['Age'].mean()
age_std = train['Age'].std()
age_missing = train['Age'].isna()
age_null_count = int(age_missing.sum())

# randint expects integer bounds, so truncate explicitly; a fixed seed makes the
# imputed values identical on every run of the notebook.
age_rng = np.random.RandomState(42)
age_null_random_list = age_rng.randint(
    int(age_avg - age_std), int(age_avg + age_std), size=age_null_count
)
# Assign through .loc on the frame itself. The previous chained form
# (train['Age'][mask] = ...) may write to a temporary copy and triggers
# SettingWithCopyWarning.
if age_null_count:
    train.loc[age_missing, 'Age'] = age_null_random_list
train['Age'] = train['Age'].astype(int)

train['CategoricalAge'] = pd.cut(train['Age'], 5)

# observed=False is explicit because the grouping key is categorical.
train[['CategoricalAge', 'Survived']].groupby(
    ['CategoricalAge'], as_index=False, observed=False
).mean()

In [None]:
def get_title(name):
    """Return the title embedded in a name, or "" if there is none.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in "Braund, Mr. Owen".

    Args:
        name: Name string to search. Non-string values (such as NaN for a
            missing name) are treated as having no title.

    Returns:
        The matched title without its trailing period, or "" when nothing matches.
    """
    if not isinstance(name, str):
        return ""
    # Raw string: "\." inside a plain literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Derive a 'Title' for every row from its 'Name', then tabulate titles against 'Sex'.
train['Title'] = train['Name'].map(get_title)

pd.crosstab(index=train['Title'], columns=train['Sex'])

In [None]:
# Collapse the listed titles into the single label 'Rare'.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
train['Title'] = train['Title'].replace(rare_titles, 'Rare')

# Fold the remaining variants into 'Miss' / 'Mrs', one replacement at a time.
for variant, canonical in (('Mlle', 'Miss'), ('Ms', 'Miss'), ('Mme', 'Mrs')):
    train['Title'] = train['Title'].replace(variant, canonical)

# Mean of 'Survived' for each resulting title.
train.groupby('Title', as_index=False)['Survived'].mean()

In [None]:
# Disabled label-encoding of Sex, Title, Embarked, Fare and Age. The snippet is
# wrapped in a bare string literal, so none of it executes and `train` keeps its
# un-encoded columns.
"""
from sklearn.preprocessing import LabelEncoder

# encode labels
le = LabelEncoder()

le.fit(train['Sex'])
train['Sex'] = le.transform(train['Sex'])

le.fit(train['Title'])
train['Title'] = le.transform(train['Title'])

le.fit(train['Embarked'].values)
train['Embarked'] = le.transform(train['Embarked'].values)

le.fit(train['Fare'])
train['Fare'] = le.transform(train['Fare'])

le.fit(train['Age'])
train['Age'] = le.transform(train['Age'])
"""

### Models evaluation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

# NOTE(review): X and y are not assigned by any cell above this one. In the visible
# notebook they are first assigned in the "Saving model" cell further down, so on a
# fresh kernel this cell raises NameError unless that cell has been run first.

# Candidate models, all compared with the same resampling scheme.
classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()
]

log_cols = ["Classifier", "Accuracy"]

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
# The averaging step below divides by the splitter's own split count, so the two
# can no longer drift apart if n_splits is edited.
n_splits = sss.get_n_splits()

# Running sum of per-split accuracies, keyed by classifier class name.
acc_dict = {}

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for clf in classifiers:
        name = clf.__class__.__name__
        clf.fit(X_train, y_train)
        train_predictions = clf.predict(X_test)
        acc = accuracy_score(y_test, train_predictions)
        acc_dict[name] = acc_dict.get(name, 0.0) + acc

# Turn the sums into means, then build the results table from a list of rows.
for clf_name in acc_dict:
    acc_dict[clf_name] /= n_splits
log_entries = [[clf_name, mean_acc] for clf_name, mean_acc in acc_dict.items()]

log = pd.DataFrame(log_entries, columns=log_cols)

sns.set_color_codes("muted")
ax = sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")
# Label the axes object that was actually drawn on.
ax.set_xlabel('Accuracy')
ax.set_title('Classifier Accuracy')
log

### Using RandomForest

In [None]:
# Saving model: fit a RandomForest on the training split and pickle it to disk.

import pickle
import json
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from os import getcwd
from os.path import join

from utils.dataloader import DataLoader
from settings.constants import TRAIN_CSV


# Feature and target column names come from the project specification file.
with open('settings/specifications.json') as f:
    specifications = json.load(f)

raw_train = pd.read_csv(TRAIN_CSV)
x_columns = specifications['description']['X']
y_column = specifications['description']['y']

X_raw = raw_train[x_columns]

# Project DataLoader: fit on the raw features, then fetch the processed features.
loader = DataLoader()
loader.fit(X_raw)
X = loader.load_data()
# Select the target through the specification (as the API-test cell does) rather
# than a hard-coded attribute name.
# NOTE(review): assumes specifications['description']['y'] names the 'stroke' column — confirm.
y = raw_train[y_column]

model = RandomForestClassifier()
model.fit(X, y)
# Build the path with os.path.join instead of string concatenation.
with open(join(getcwd(), 'models', 'RandForest.pickle'), 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Test model: load the pickled RandomForest and score it on the validation split.

import pickle
import json
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from os import getcwd
from os.path import join

from utils.dataloader import DataLoader
from settings.constants import VAL_CSV


# Feature and target column names come from the project specification file.
with open('settings/specifications.json') as f:
    specifications = json.load(f)

x_columns = specifications['description']['X']
y_column = specifications['description']['y']

raw_val = pd.read_csv(VAL_CSV)
x_raw = raw_val[x_columns]

# NOTE(review): the loader is fitted on the validation features here, not on the
# training features — confirm DataLoader.fit learns nothing data-dependent.
loader = DataLoader()
loader.fit(x_raw)
X = loader.load_data()
# Select the target through the specification (as the API-test cell does).
# NOTE(review): assumes specifications['description']['y'] names the 'stroke' column — confirm.
y = raw_val[y_column]

# Open the file in a with-block so the handle is closed after loading.
# pickle.load can execute arbitrary code: only load pickles this project wrote.
with open(join(getcwd(), 'models', 'RandForest.pickle'), 'rb') as f:
    loaded_model = pickle.load(f)
loaded_model.score(X, y)

### API test

In [None]:
import json
import requests
import pandas as pd
from sklearn import metrics as sk_metrics
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from utils import DataLoader, Estimator
from settings.constants import TRAIN_CSV, VAL_CSV

# Column names and the metric name come from the project specification file.
with open('settings/specifications.json') as f:
    specifications = json.load(f)

info = specifications['description']
x_columns, y_column, metrics = info['X'], info['y'], info['metrics']

train_set = pd.read_csv(TRAIN_CSV, header=0)
val_set = pd.read_csv(VAL_CSV, header=0)

train_x, train_y = train_set[x_columns], train_set[y_column]
val_x, val_y = val_set[x_columns], val_set[y_column]

# Local preprocessing of the validation features, printed only for inspection.
loader = DataLoader()
loader.fit(val_x)
val_processed = loader.load_data()
print('data: ', val_processed[:10])

# The raw (unprocessed) validation features are what gets sent to the service.
req_data = {'data': json.dumps(val_x.to_dict())}

# To test localhost is used
response = requests.get('http://127.0.0.1:8000/predict', data=req_data)
# Fail loudly on an HTTP error instead of on a confusing JSON/KeyError below.
response.raise_for_status()
api_predict = response.json()['prediction']
print('predict: ', api_predict[:10])

# Look the metric up by name in sklearn.metrics rather than eval()-ing a string
# read from a file; eval would execute whatever expression the file contains.
metric_fn = getattr(sk_metrics, metrics)
api_score = metric_fn(val_y, api_predict)
print('accuracy: ', api_score)

In [None]:
# Show the dict form of the validation features, i.e. the structure that the
# API-test cell JSON-encodes into its request payload.
val_x.to_dict()

# Other dataset preprocessing

In [None]:
import numpy as np
import re as re
import pandas as pd
import matplotlib.pyplot as plt

# Load the stroke dataset from the local data directory.
# NOTE(review): the dtype override targets a column named 'Age', while the cells
# below read df['age'] (lowercase) — confirm the override has any effect here.
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header = 0, dtype={'Age': np.float64})
# Show the column labels.
df.columns

### BMI

In [None]:
# Bucket BMI into four ranges:
#   0 - underweight  (bmi <= 18.5)
#   1 - normal       (18.5 < bmi <= 25)
#   2 - overweight   (25 < bmi <= 30)
#   3 - obesity      (bmi > 30)
# Rows with a missing BMI are also placed in bucket 3.

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header=0, dtype={'Age': np.float64})
df['bmi_rr'] = -1
df.loc[df['bmi'].le(18.5), 'bmi_rr'] = 0
df.loc[df['bmi'].gt(18.5) & df['bmi'].le(25), 'bmi_rr'] = 1
df.loc[df['bmi'].gt(25) & df['bmi'].le(30), 'bmi_rr'] = 2
df.loc[df['bmi'].gt(30), 'bmi_rr'] = 3
# Rows still at the sentinel matched none of the comparisons above.
df.loc[df['bmi_rr'].eq(-1), 'bmi_rr'] = 3

# Mean 'stroke' per bucket, plotted against the bucket's position in the table.
y = df.groupby('bmi_rr', as_index=False)['stroke'].mean()
x = range(len(y))

plt.scatter(x, y['stroke'].tolist())

In [None]:
# Alternative BMI treatment: fill missing values with the overall median first.
# Author's note: this seems to be a bad idea, since it makes overweight people
# appear to have a higher chance of stroke than obese people.

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header=0, dtype={'Age': np.float64})
df['bmi'] = df['bmi'].fillna(df['bmi'].median())
df['bmi_rr'] = -1
df.loc[df['bmi'].le(18.5), 'bmi_rr'] = 0
df.loc[df['bmi'].gt(18.5) & df['bmi'].le(25), 'bmi_rr'] = 1
df.loc[df['bmi'].gt(25) & df['bmi'].le(30), 'bmi_rr'] = 2
df.loc[df['bmi'].gt(30), 'bmi_rr'] = 3

# Mean 'stroke' per bucket, plotted against the bucket's position in the table.
y = df.groupby('bmi_rr', as_index=False)['stroke'].mean()
x = range(len(y))

plt.scatter(x, y['stroke'].tolist())

In [None]:
# Another way: fill missing BMI values per (age, gender) group with that group's median.
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header = 0, dtype={'Age': np.float64})
df['bmi'] = df.groupby(['age', 'gender'])['bmi'].transform(lambda x: x.fillna(x.median()))
# A group whose BMI values are all missing has a NaN median and stays unfilled.
# Fall back to the overall median so no row is left in the -1 sentinel bucket.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# Buckets: 0 (<= 18.5), 1 (18.5-25], 2 (25-30], 3 (> 30).
df['bmi_rr'] = -1
df.loc[(18.5 >= df['bmi']), 'bmi_rr'] = 0
df.loc[(18.5 < df['bmi']) & (25 >= df['bmi']), 'bmi_rr'] = 1
df.loc[(25 < df['bmi']) & (30 >= df['bmi']), 'bmi_rr'] = 2
df.loc[(30 < df['bmi']), 'bmi_rr'] = 3

# Mean 'stroke' per bucket, plotted against the bucket's position in the table.
y = df[["bmi_rr", "stroke"]].groupby(['bmi_rr'], as_index = False).mean()
x = range(len(y.index))

plt.scatter(x, y.stroke.to_list())

In [None]:
# Count the rows with a missing BMI, per gender.
# Author's note: both genders tend to not fill BMI equally often

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header = 0, dtype={'Age': np.float64})
df[df['bmi'].isna()].groupby('gender').size()

### Glucose level

In [None]:
# Author's note: there seems to be a threshold value above which the stroke rate is higher.

# Number of quantile bins. A formula-based value (int(2 * 5000 ** 0.3) + 1) used to
# be computed here and was immediately overwritten by this constant, so only the
# constant is kept.
n = 8

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header = 0, dtype={'Age': np.float64})

# Quantile-based glucose intervals. The earlier -1.0 placeholder assignment was
# overwritten on the very next line and has been dropped.
df['avg_glucose_level_r'] = pd.qcut(df['avg_glucose_level'], int(n))

# observed=False is explicit because the grouping key is categorical and newer
# pandas versions warn when the default is relied upon.
y = df[["avg_glucose_level_r", "stroke"]].groupby(
    ['avg_glucose_level_r'], as_index=False, observed=False
).mean()
x = range(len(y.index))

plt.scatter(x, y.stroke.to_list())

In [None]:
# Display `y` as left by the last cell that assigned it (the glucose-interval
# table when the cells are run in order).
y

In [None]:
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header=0, dtype={'Age': np.float64})

# Binary glucose flag: 1 where avg_glucose_level >= 150, 0 where it is below 150.
# The -1 sentinel survives only where neither comparison holds.
df['avg_glucose_level_r'] = -1
df.loc[df['avg_glucose_level'].ge(150), 'avg_glucose_level_r'] = 1
df.loc[df['avg_glucose_level'].lt(150), 'avg_glucose_level_r'] = 0

# Mean 'stroke' per flag value, plotted against the row position in the table.
y = df.groupby('avg_glucose_level_r', as_index=False)['stroke'].mean()
x = range(len(y))

plt.scatter(x, y['stroke'].tolist())

### Age

In [None]:
# Author's note: there is a strong correlation between the age in decades and stroke chances.

import math

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header = 0, dtype={'Age': np.float64})

# Decade of life via floor division. The earlier -1 placeholder assignment was
# overwritten on the very next line and has been dropped.
df['age_r'] = df['age'] // 10
# Mean 'stroke' per decade, plotted against the decade's position in the table.
y = df[["age_r", "stroke"]].groupby(['age_r'], as_index = False).mean()
x = range(len(y.index))

plt.scatter(x, y.stroke.to_list())

### Hypertension 

In [None]:
# Author's note: there is a strong correlation between hypertension and stroke chances.

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header=0, dtype={'Age': np.float64})

# Mean 'stroke' per hypertension value, plotted against the row position in the table.
y = df.groupby('hypertension', as_index=False)['stroke'].mean()
x = range(len(y))

plt.scatter(x, y['stroke'].tolist())

### Heart disease 

In [None]:
# Author's note: there is a strong correlation between heart_disease and stroke chances.

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header=0, dtype={'Age': np.float64})

# Mean 'stroke' per heart_disease value, plotted against the row position in the table.
y = df.groupby('heart_disease', as_index=False)['stroke'].mean()
x = range(len(y))

plt.scatter(x, y['stroke'].tolist())

### Gender

In [None]:
# Author's note: no correlation between stroke and gender.

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header=0, dtype={'Age': np.float64})

# Mean 'stroke' per gender value, plotted against the row position in the table.
y = df.groupby('gender', as_index=False)['stroke'].mean()
x = range(len(y))

plt.scatter(x, y['stroke'].tolist())

### Marriage

In [None]:
# Mean 'stroke' per ever_married value.
# NOTE(review): the original header comment here read "No correlation between
# stroke and gender", which matches the gender cell word for word — re-check what
# conclusion was intended for marital status.

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header=0, dtype={'Age': np.float64})

y = df.groupby('ever_married', as_index=False)['stroke'].mean()
x = range(len(y))

plt.scatter(x, y['stroke'].tolist())

### Smoking

In [None]:
# Author's note: there is a correlation for smokers and former smokers.

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header=0, dtype={'Age': np.float64})

# Mean 'stroke' per smoking_status value, plotted against the row position in the table.
y = df.groupby('smoking_status', as_index=False)['stroke'].mean()
x = range(len(y))

plt.scatter(x, y['stroke'].tolist())

### Residence type

In [None]:
# Author's note: there is a modest correlation for residence type.

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header=0, dtype={'Age': np.float64})

# Mean 'stroke' per Residence_type value, plotted against the row position in the table.
y = df.groupby('Residence_type', as_index=False)['stroke'].mean()
x = range(len(y))

plt.scatter(x, y['stroke'].tolist())

### Residence + glucose 

In [None]:
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header = 0, dtype={'Age': np.float64})

# Encode the residence type as a number with Series.map. replace() with a
# str -> int dict relies on implicit object-to-int downcasting, which newer pandas
# versions deprecate; map builds a numeric column directly. Values missing from
# the dict become NaN.
df['Residence_type'] = df['Residence_type'].map({'Rural': 1, 'Urban': 2})

# Interaction feature: residence code times average glucose level.
df['res_times_gluc'] = df['Residence_type'] * df['avg_glucose_level']
# Mean of the interaction feature for each 'stroke' value.
df[['res_times_gluc', 'stroke']].groupby('stroke').mean()

### hypertension + heart disease 

In [None]:
# Inspect the smoking_status column of the current `df`.
df['smoking_status']

In [None]:
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header = 0, dtype={'Age': np.float64})

# Numeric codes for the two string columns, built with Series.map. replace() with
# a str -> int dict relies on implicit downcasting that newer pandas versions
# deprecate. Values missing from a dict become NaN.
df['ever_married'] = df['ever_married'].map({'Yes': 1, 'No': 2})
df['smoking_status'] = df['smoking_status'].map(
    {'never smoked': 1, 'Unknown': 2, 'formerly smoked': 3, 'smokes': 4}
)

# Decade of life. The earlier -1 placeholder assignment was overwritten
# immediately and has been dropped.
df['age_r'] = df['age'] // 10

# Sum of the hypertension and heart_disease columns.
df['hh'] = df['hypertension'] + df['heart_disease']

# Mean 'stroke' per hh value, plotted against the row position in the table. The
# duplicate, never-displayed group-by expression that preceded this line was removed.
y = df[["hh", "stroke"]].groupby(['hh'], as_index = False).mean()
x = range(len(y.index))

plt.scatter(x, y.stroke.to_list())

# Final feature engineering 

In [None]:
import numpy as np
import re as re
import pandas as pd
import matplotlib.pyplot as plt

# Reload the stroke dataset from disk, replacing whatever `df` the exploratory
# cells left behind, and show its column labels.
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header = 0, dtype={'Age': np.float64})
df.columns

In [None]:
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header=0, dtype={'Age': np.float64})

# Age in decades: scale by 0.1 and truncate to an integer.
df['age_in_decades'] = (0.1 * df['age']).astype(int)

# BMI: fill missing values with the median of the observed ones ...
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# ... then replace each value with its bucket code:
# 0 (<= 18.5), 1 (18.5-25], 2 (25-30], 3 (> 30).
df['bmi'] = pd.cut(
    df['bmi'],
    bins=[float('-inf'), 18.5, 25, 30, float('inf')],
    labels=[0, 1, 2, 3],
    right=True,
).astype(int)

# Glucose level: replace each value with one of eight quantile-based intervals.
df['avg_glucose_level'] = pd.qcut(df['avg_glucose_level'], q=8)

# heart_disease_total: sum of the hypertension and heart_disease columns.
df['heart_disease_total'] = df['hypertension'].add(df['heart_disease'])

# Drop the columns that are not kept as features.
drop_elements = ['id', 'age', 'Residence_type', 'hypertension', 'heart_disease', 'gender']
df = df.drop(columns=drop_elements)

df

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refitted on each column in turn, so after the loop
# it holds only the classes of the last column it was fitted on.
le = LabelEncoder()

for column_name in ('ever_married', 'work_type', 'smoking_status', 'avg_glucose_level'):
    df[column_name] = le.fit_transform(df[column_name])

df

### Some skidadle skidoodle

In [None]:
# Experimental combined "social factors" score built from three columns.
# NOTE(review): when the notebook is run top to bottom, the label-encoding cell
# above has already turned these three columns into integer codes, so the
# string-keyed replacements below would match nothing — confirm which `df` state
# this cell is meant to run against.
df['ever_married'] = df['ever_married'].replace({'Yes': 1, 'No': 2})
df['smoking_status'] = df['smoking_status'].replace({'never smoked': 1, 'Unknown': 2, 'formerly smoked':3, 'smokes': 4})
df['work_type'] = df['work_type'].replace({'Never_worked': 0, 'children': 1, 'Govt_job': 2,
                                                     'Private': 3, 'Self-employed': 4})

# Sum the three columns into a single score.
df['social_factors'] = df['ever_married'] + df['smoking_status'] + df['work_type']
# This expression is not the last line of the cell, so its result is not displayed.
df[['social_factors', 'stroke']].groupby('social_factors').mean()

# Mean 'stroke' per social_factors value; x is the row position in that table.
y = df[["social_factors", "stroke"]].groupby(['social_factors'], as_index = False).mean()
x = range(len(y.index))

# Rebinds y from the grouped frame to a plain list of the mean 'stroke' values.
y = y.stroke.to_list()

plt.scatter(x, y)

In [None]:
# Reload the untouched stroke dataset (no dtype override) and show its column labels.
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header = 0)
df.columns

In [None]:
# train_test_split is otherwise imported only inside the disabled (string-wrapped)
# snippet at the top of the notebook, so import it here to make this cell run on a
# fresh kernel. The path constants are re-imported for the same reason.
from sklearn.model_selection import train_test_split

from settings.constants import TRAIN_CSV, VAL_CSV

df = pd.read_csv('data/healthcare-dataset-stroke-data.csv', header = 0)

# 80/20 split with a fixed seed.
# NOTE(review): the split is stratified on the raw 'age' column rather than on the
# 'stroke' target — confirm this is intentional. Stratification fails if any age
# value occurs only once.
train_df, val_df = train_test_split(
     df,
     test_size=0.2,
     stratify=df[["age"]],
     random_state=42
)

# Overwrites the files behind TRAIN_CSV / VAL_CSV, the same paths the first
# cells of this notebook read from.
train_df.to_csv(TRAIN_CSV, index=False)
val_df.to_csv(VAL_CSV, index=False)


In [None]:
# For every column, print the distinct values in the training split and then in
# the validation split, followed by a blank separator line.
for col in train_df:
    for split_df in (train_df, val_df):
        print(split_df[col].unique())
    print()
    