<a href="https://colab.research.google.com/github/VVdovichev/ML_in_Business/blob/main/HW_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [126]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_recall_curve

In [127]:
df = pd.read_csv('train_case2.csv', sep=';')
df.drop(columns=['id'], axis=1, inplace=True)
df.head(30)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,19834,1,164,68.0,110,60,1,1,0,0,0,0


In [128]:
class FeatureSelectorNumerical(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[[self.column]]


class FeatureSelectorCategorical(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.column]


class FeatureDeletion(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.drop(columns=[self.column])


class FeatureDropDuplicates(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.drop_duplicates()


class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]

In [129]:
list_features_numerical = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
list_features_categorical = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

list_columns = ['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                'cholesterol', 'gluc', 'smoke', 'alco', 'active']
target = 'cardio'

In [130]:
X_train, X_test, y_train, y_test = train_test_split(df[list_columns], df[target], test_size=0.3, random_state=42, stratify=df[target])

In [131]:
list_pipelines_numerical = []

for feature in list_features_numerical:
    pipeline_new = Pipeline([
        (f'fsn_{feature}', FeatureSelectorNumerical(column=feature)),
        (f'ss_{feature}', StandardScaler())
    ])
    
    list_pipelines_numerical.append((feature, pipeline_new))


list_pipelines_categorical = []

for feature in list_features_categorical:
    pipeline_new = Pipeline([
        (f'fsc_{feature}', FeatureSelectorCategorical(column=feature)),
        (f'ohe_{feature}', OHEEncoder(key=feature))
    ])
    
    list_pipelines_categorical.append((feature, pipeline_new))

##LogRess

In [132]:
pipeline_logress = Pipeline([
    ('feature_preprocessing', FeatureUnion(list_pipelines_numerical + list_pipelines_categorical, n_jobs=-1)),
    ('classifier', LogisticRegression(random_state=42))
])

In [133]:
%%time
pipeline_logress.fit(X_train, y_train)
y_pred_logress = pipeline_logress.predict_proba(X_test)[:, 1]

CPU times: user 651 ms, sys: 327 ms, total: 979 ms
Wall time: 986 ms


##RandomForestClassifier

In [134]:
pipeline_random_forest = Pipeline([
    ('feature_preprocessing', FeatureUnion(list_pipelines_numerical + list_pipelines_categorical, n_jobs=-1)),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [135]:
%%time
pipeline_random_forest.fit(X_train, y_train)
y_pred_random_forest = pipeline_random_forest.predict(X_test)

CPU times: user 7.58 s, sys: 74.2 ms, total: 7.65 s
Wall time: 7.58 s


__2.__  при обучении моделей обязательно использовать кроссвалидацию

In [136]:
%%time
cross_val_logloss = cross_val_score(pipeline_logress, X_train, y_train, cv=5, scoring='roc_auc')

CPU times: user 2.83 s, sys: 1.87 s, total: 4.7 s
Wall time: 3.85 s


In [137]:
%%time
cross_val_random_forest = cross_val_score(pipeline_random_forest, X_train, y_train, cv=5, scoring='roc_auc')

CPU times: user 28.6 s, sys: 377 ms, total: 28.9 s
Wall time: 28.7 s


In [138]:
cross_val_logloss, cross_val_logloss.mean()

(array([0.77722582, 0.78173051, 0.78306227, 0.7768626 , 0.79783612]),
 0.7833434631441553)

In [139]:
cross_val_random_forest, cross_val_random_forest.mean()

(array([0.77211231, 0.77608996, 0.77453323, 0.77777905, 0.78476658]),
 0.7770562263017926)

__3.__ вывести сравнение полученных моделей по основным метрикам классификации: pr/rec/auc/f_score (можно в виде таблицы, где строки - модели, а столбцы - метрики)

In [140]:
b=1
precision, recall, thresholds = precision_recall_curve(y_test.values, y_pred_logress)
fscore = (1+b**2)*(precision * recall) / (b**2*precision + recall)
ix = np.argmax(fscore)
print('F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

F-Score=0.731, Precision=0.657, Recall=0.824


In [141]:
columns = ['f-score', 'precision', 'recall']

metrics = pd.DataFrame(data=[[fscore[ix], precision[ix], recall[ix]]], columns=columns,
                       index=['LogisticRegression'])

metrics

Unnamed: 0,f-score,precision,recall
LogisticRegression,0.731264,0.657414,0.823804


In [142]:
b=1
precision, recall, thresholds = precision_recall_curve(y_test.values, y_pred_random_forest)
fscore = (1+b**2)*(precision * recall) / (b**2*precision + recall)
ix = np.argmax(fscore)
print('F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

F-Score=0.708, Precision=0.718, Recall=0.699


In [143]:
metrics.loc['RandomForestClassifier', :] = [fscore[ix], precision[ix], recall[ix]]

metrics

Unnamed: 0,f-score,precision,recall
LogisticRegression,0.731264,0.657414,0.823804
RandomForestClassifier,0.708148,0.71757,0.698971


__4.__  сделать выводы о том, какая модель справилась с задачей лучше других

Без тюнинга моделей, считаю, обе модели отработали практически одинаково.