In [1]:
import os

import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from prep_transformer import PrepDataTransformer

In [2]:
# Kernel working directory; the relative data/model paths in this notebook resolve against it.
# Plain Python is used instead of the bare `pwd` automagic, which IPython only resolves in
# single-line cells and only while no variable named `pwd` exists.
os.getcwd()

'/mnt/d/DS_dev/WSL_DEV/try_docker/make_model'

In [3]:
# Source CSV, addressed relative to the notebook's working directory.
# NOTE(review): the earlier comment here listed 'data/heart_disease_uci.csv' — presumably the
# path to use when running from the repository root; confirm.
data_file = '../data/heart_disease_uci.csv'
df_raw = pd.read_csv(filepath_or_buffer=data_file)
# (rows, columns) of the untouched frame
df_raw.shape

(920, 16)

In [4]:
# Preview the first rows to check column names and value formats.
df_raw.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [5]:
def get_categories_transformer(cat_features=None, handle_unknown="ignore"):
    """Build the column transformer that one-hot encodes the categorical features.

    Parameters
    ----------
    cat_features : list of str, optional
        Columns to one-hot encode. Defaults to
        ``["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]``.
    handle_unknown : str, default="ignore"
        Forwarded to ``OneHotEncoder``. With ``"ignore"`` a category that was not
        seen during ``fit`` is encoded as all zeros at transform time instead of
        raising, so a single unusual row cannot crash a fitted pipeline.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer. Columns not listed in ``cat_features`` are passed
        through unchanged (``remainder="passthrough"``).
    """
    if cat_features is None:
        cat_features = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]

    one_hot = OneHotEncoder(handle_unknown=handle_unknown)

    return ColumnTransformer(
        [("one_hot", one_hot, list(cat_features))],
        remainder="passthrough",
        # Force dense output: by default the result may become a sparse matrix when
        # the encoded data is sparse enough, and MinMaxScaler (the next step in the
        # notebook's pipeline) does not accept sparse input.
        sparse_threshold=0,
    )



In [6]:
# Preprocessing steps used by the pipeline.
# PrepDataTransformer is project-local (prep_transformer module); its cleaning logic is not
# visible in this notebook.
prep_base_transformer = PrepDataTransformer()
# One-hot encoder for the categorical columns; the remaining columns are passed through.
prep_cat_transformer = get_categories_transformer()

In [7]:
# Full model: project preprocessing -> one-hot encoding -> min-max scaling -> classifier.
prepdata_pipeline = Pipeline(
    steps=[
        ("prep_base", prep_base_transformer),       # base data preparation
        ("prep_categories", prep_cat_transformer),  # categorical feature handling
        ("prep_normalize", MinMaxScaler()),         # min-max scaling ahead of the model
        ("logreg_model", LogisticRegression()),     # logistic regression, default settings
    ],
    verbose=True,  # report each step's timing during fit
)

In [8]:
# Modelling frame: the raw table without the `id` and `dataset` columns.
df_in = df_raw.drop(columns=['id', 'dataset'])
df_in.dtypes

age           int64
sex          object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalch      float64
exang        object
oldpeak     float64
slope        object
ca          float64
thal         object
num           int64
dtype: object

In [9]:
# Number of missing values in each column.
df_in.isna().sum()

age           0
sex           0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

### Таргеты

In [10]:
# Features are every column except the `num` label.
X = df_in.drop(columns="num")
y = df_in["num"]
# Binary target: 1 where num > 0, otherwise 0.
transformed_y = (y > 0) * 1
# Seeded 80/20 hold-out split so the numbers below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    transformed_y,
    test_size=0.2,
    random_state=42,
)


In [11]:
# Class counts of the binary target in the training split.
y_train.value_counts()

1    400
0    336
Name: num, dtype: int64

In [12]:
# Same class balance expressed as fractions of the training split.
y_train.value_counts(normalize=True)

1    0.543478
0    0.456522
Name: num, dtype: float64

### Модель logreg

In [13]:
# (rows, feature columns) of the training split.
X_train.shape

(736, 13)

In [14]:
# Fit every pipeline step on the training split. Pipeline.fit returns the pipeline itself,
# so `model` and `prepdata_pipeline` refer to the same fitted object.
model = prepdata_pipeline.fit(X_train, y_train)
model

[Pipeline] ......... (step 1 of 4) Processing prep_base, total=   0.0s
[Pipeline] ... (step 2 of 4) Processing prep_categories, total=   0.0s
[Pipeline] .... (step 3 of 4) Processing prep_normalize, total=   0.0s
[Pipeline] ...... (step 4 of 4) Processing logreg_model, total=   0.3s


In [15]:
# Take one raw record (positional row 121) as a Series for a single-sample prediction.
# .copy() keeps any later edits to `tmp` from touching df_raw.
tmp = df_raw.iloc[121].copy()
tmp

id                        122
age                        63
sex                    Female
dataset             Cleveland
cp               asymptomatic
trestbps                150.0
chol                    407.0
fbs                     False
restecg        lv hypertrophy
thalch                  154.0
exang                   False
oldpeak                   4.0
slope                    flat
ca                        3.0
thal        reversable defect
num                         4
Name: 121, dtype: object

In [16]:
# Uncomment to blank out individual fields of the record before predicting:
# tmp['chol'] = None
# tmp['slope'] = None
# Predicted probability of class 1 for the single record.
# NOTE(review): `tmp` is a raw Series that still carries `id`, `dataset` and the label `num`,
# unlike the training frame X — presumably PrepDataTransformer reshapes it and selects the
# feature columns; confirm.
model.predict_proba(tmp)[0][1]

0.9063614007217995

#### Метрики

In [17]:
from sklearn.metrics import f1_score, roc_auc_score, classification_report
def gini(y_true, y_score):
    """Return the Gini coefficient of a set of scores, computed as ``2 * ROC AUC - 1``.

    ``y_true`` and ``y_score`` are forwarded unchanged to ``roc_auc_score``.
    """
    auc = roc_auc_score(y_true, y_score)
    return 2 * auc - 1

In [18]:
# Column 1 of predict_proba is the probability of class 1; computed for train, then test.
proba_train, proba_test = (
    model.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

In [19]:
# F1 with a 0.4 probability cut-off: train first, then test.
print(f1_score(y_train, (proba_train > 0.4).astype(int)))
print(f1_score(y_test, (proba_test > 0.4).astype(int)))

0.8551401869158879
0.8584474885844747


In [20]:
# Gini (2 * ROC AUC - 1): train first, then test.
print(gini(y_true=y_train, y_score=proba_train))
print(gini(y_true=y_test, y_score=proba_test))

0.8036458333333334
0.799388379204893


#### Сохраним модельный пайплайн в файл

In [21]:
# `save_pickles` is an alias for joblib.dump.
from joblib import dump as save_pickles
# Persist the fitted pipeline one directory above the notebook's working directory.
# Only load the resulting file from trusted sources: unpickling can execute arbitrary code.
model_file = '../logreg_pipe.pkl'
save_pickles(prepdata_pipeline, model_file)

['../logreg_pipe.pkl']

In [22]:
# Write the hold-out features and labels into the current working directory,
# presumably so the saved pipeline can be checked outside this notebook — confirm.
x_test_file = 'x_test.pkl'
y_test_file = 'y_test.pkl'
save_pickles(X_test, x_test_file)
save_pickles(y_test, y_test_file)

['y_test.pkl']