# Step 1 - TRAIN

In [6]:
import pandas as pd
import dill
import pickle
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score

#working with text
from sklearn.feature_extraction.text import TfidfVectorizer

#normalizing data
from sklearn.preprocessing import StandardScaler

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score

#imputer
from sklearn.impute import SimpleImputer

import sklearn.datasets

import warnings
warnings.filterwarnings('ignore')

In [10]:
# Load the dataset; the file uses ';' as the field separator
df = pd.read_csv('case.csv', sep=';')
# Preview the first rows
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [11]:
# Dataset dimensions as (rows, columns)
df.shape

(70000, 13)

In [12]:
# Target overview: how many rows fall into each `cardio` class
df['cardio'].value_counts()

cardio
0    35021
1    34979
Name: count, dtype: int64

In [14]:
# Split the data into train / test / validation.
# The first call holds out the test set; the second carves the validation set out
# of what remained for training. No test_size is passed, so train_test_split's
# default proportion applies both times; random_state=0 makes the shuffling repeatable.
X_train, X_test, y_train, y_test = train_test_split(df.drop('cardio', axis=1), df['cardio'], random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=0)

In [16]:
# Save the test data so it can be replayed from the client side.
# NOTE(review): index=None is presumably meant to keep the row index out of the
# files (the documented spelling is index=False) — confirm against the written CSVs.
X_test.to_csv("X_test.csv", index=None)
y_test.to_csv("y_test.csv", index=None)

In [17]:
# Column dtypes and non-null counts of the training split
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39375 entries, 67161 to 26073
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           39375 non-null  int64  
 1   age          39375 non-null  int64  
 2   gender       39375 non-null  int64  
 3   height       39375 non-null  int64  
 4   weight       39375 non-null  float64
 5   ap_hi        39375 non-null  int64  
 6   ap_lo        39375 non-null  int64  
 7   cholesterol  39375 non-null  int64  
 8   gluc         39375 non-null  int64  
 9   smoke        39375 non-null  int64  
 10  alco         39375 non-null  int64  
 11  active       39375 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 3.9 MB


К полям:

- gender, cholesterol - кандидаты на OHE-кодирование (класс OHEEncoder определён ниже), но в текущем пайплайне они передаются как есть
- age, height, weight, ap_hi, ap_lo - StandardScaler
- gluc, smoke, alco, active - оставим пока как есть

In [18]:
# We will assemble a simple pipeline, but first we need a class that selects the required field
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that picks one column by key so later pipeline steps can
    operate on it.

    `transform` indexes with a single pair of brackets (`X[self.key]`); for a
    pandas DataFrame that yields a 1-D Series.
    """
    def __init__(self, key):
        # Key (column name) to select; stored unchanged.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; returns self so the step can be chained."""
        return self

    def transform(self, X):
        """Return whatever X stores under `self.key`."""
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that picks one column by key, intended for numeric columns.

    Unlike a plain `X[key]` lookup, `transform` indexes with a one-element list
    (`X[[self.key]]`); for a pandas DataFrame that yields a one-column
    DataFrame, i.e. the result stays 2-D.
    """
    def __init__(self, key):
        # Key (column name) to select; stored unchanged.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; returns self so the step can be chained."""
        return self

    def transform(self, X):
        """Return the `[self.key]` selection of X (list-style indexing)."""
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encode data with `pd.get_dummies`, keeping the output columns fixed
    to the set produced during `fit`.

    Parameters
    ----------
    key : str
        Prefix passed to `pd.get_dummies` for the generated dummy columns.

    Attributes
    ----------
    columns : list
        Dummy column names learned in `fit`. `transform` always returns exactly
        these columns, in this order. Empty until `fit` is called.
    """
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Record the dummy columns that the fitting data produces; returns self."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """
        Encode X and align the result to the columns learned in `fit`.

        A category seen in `fit` but absent from X (e.g. when scoring a single
        row) is added as an all-zero column. A category present only in X is
        dropped by the final column selection.
        """
        encoded = pd.get_dummies(X, prefix=self.key)
        # Iterate over the *fitted* columns: those are the ones that may be
        # missing from `encoded` and would make the selection below fail.
        for fitted_col in self.columns:
            if fitted_col not in encoded.columns:
                encoded[fitted_col] = 0
        return encoded[self.columns]

In [19]:
from sklearn.preprocessing import StandardScaler


# Columns that are standardized vs. columns that are passed through unchanged
continuos_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']
base_cols = ['gender', 'cholesterol','gluc', 'smoke', 'alco', 'active']

# One (name, pipeline) pair per column: select the column, then scale it
final_transformers = [
    (
        col_name,
        Pipeline([
            ('selector', NumberSelector(key=col_name)),
            ('standard', StandardScaler()),
        ]),
    )
    for col_name in continuos_cols
]

# Remaining columns: selection only, no further processing
final_transformers += [
    (col_name, Pipeline([('selector', NumberSelector(key=col_name))]))
    for col_name in base_cols
]

In [20]:
# Inspect the assembled list of (name, pipeline) pairs
final_transformers

[('age',
  Pipeline(steps=[('selector', NumberSelector(key='age')),
                  ('standard', StandardScaler())])),
 ('height',
  Pipeline(steps=[('selector', NumberSelector(key='height')),
                  ('standard', StandardScaler())])),
 ('weight',
  Pipeline(steps=[('selector', NumberSelector(key='weight')),
                  ('standard', StandardScaler())])),
 ('ap_hi',
  Pipeline(steps=[('selector', NumberSelector(key='ap_hi')),
                  ('standard', StandardScaler())])),
 ('ap_lo',
  Pipeline(steps=[('selector', NumberSelector(key='ap_lo')),
                  ('standard', StandardScaler())])),
 ('gender', Pipeline(steps=[('selector', NumberSelector(key='gender'))])),
 ('cholesterol',
  Pipeline(steps=[('selector', NumberSelector(key='cholesterol'))])),
 ('gluc', Pipeline(steps=[('selector', NumberSelector(key='gluc'))])),
 ('smoke', Pipeline(steps=[('selector', NumberSelector(key='smoke'))])),
 ('alco', Pipeline(steps=[('selector', NumberSelector(key='alco'))]))

In [21]:
# Combine the per-column pipelines into a single transformer
feats = FeatureUnion(final_transformers)
feats

In [22]:
# Full model: the feature union followed by logistic regression
# (random_state is fixed so repeated runs use the same seed)
classifier = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression(random_state=42)),
])

In [23]:
# Display the assembled pipeline
classifier

In [24]:
# Run 10-fold cross-validation on the training split, scored by ROC AUC
cv_scores = cross_val_score(classifier, X_train, y_train, scoring='roc_auc', cv=10)
# Summarize the per-fold scores by their mean and standard deviation
cv_score, cv_score_std = cv_scores.mean(), cv_scores.std()
print(f'CV score is {cv_score}+-{cv_score_std}')

CV score is 0.7838029106674276+-0.006145387747326793


In [25]:
# Fit the pipeline on the whole training split
classifier.fit(X_train, y_train)

# Predict on the validation split; keep column 1 of predict_proba as the score
preds = classifier.predict_proba(X_val)[:, 1]

In [26]:
# Compute precision / recall / F-score over every candidate threshold
precision, recall, thresholds = precision_recall_curve(y_val, preds)

# Define the F-score as 0 where precision + recall == 0. A plain division gives
# 0/0 = NaN there, and np.argmax would then return the NaN position.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall carry one more entry than thresholds (the final point has no
# threshold), so search only the indices that are valid for `thresholds`.
ix = np.argmax(fscore[:-1])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.390195, F-Score=0.737, Precision=0.655, Recall=0.842


Сохраним модель (пайплайн)

In [27]:
# Serialize the fitted pipeline to disk.
# NOTE(review): the pipeline contains NumberSelector, a class defined in this
# notebook; pickle stores classes by reference, so the loading process presumably
# needs that class importable under the same module path — confirm for deployment.
with open("model_pipeline.pkl", "wb") as f:
    pickle.dump(classifier, f)

Также сохраним список колонок для проверки входных данных

In [28]:
# Persist the training column names, in order, so incoming data can be checked against them
with open('columns.pkl', 'wb') as f:
    pickle.dump(X_train.columns.tolist(), f)

# Step 2 - PREDICT

### Проверка работоспособности и качества пайплайна

Здесь мы еще не запускаем никакое API, а загружаем модель (classifier) напрямую и проверяем на отложенной (тестовой) выборке

In [29]:
# Reload the pipeline from disk, replacing the in-memory `classifier`.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('model_pipeline.pkl', 'rb') as f:
    classifier = pickle.load(f)

In [30]:
# Display the reloaded pipeline
classifier

In [31]:
# Score the held-out test split; keep column 1 of predict_proba as the score
preds = classifier.predict_proba(X_test)[:, 1]

# Peek at the first ten scores
preds[:10]

array([0.21999102, 0.25563081, 0.40698111, 0.17984251, 0.40174914,
       0.4693773 , 0.31894948, 0.83071607, 0.50040492, 0.71844099])

In [32]:
# Compute precision / recall / F-score over every candidate threshold.
# NOTE(review): this picks the best threshold on the test split itself, so the
# reported F-score is optimistic; consider reporting test metrics at the
# threshold chosen on the validation split instead.
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# Define the F-score as 0 where precision + recall == 0. A plain division gives
# 0/0 = NaN there, and np.argmax would then return the NaN position.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# precision/recall carry one more entry than thresholds (the final point has no
# threshold), so search only the indices that are valid for `thresholds`.
ix = np.argmax(fscore[:-1])
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix],
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.387443, F-Score=0.729, Precision=0.644, Recall=0.840


# Step 3 - FLASK

In [33]:
from flask import Flask, request, jsonify

### **Создаем сервис для обработки запросов к модели**

In [34]:
# Load the trained model.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('model_pipeline.pkl', 'rb') as f:
    classifier = pickle.load(f)

In [35]:
# Load the list of column names the service expects in each request
with open('columns.pkl', 'rb') as f:
    columns = pickle.load(f)

# Display the loaded names
columns

['id',
 'age',
 'gender',
 'height',
 'weight',
 'ap_hi',
 'ap_lo',
 'cholesterol',
 'gluc',
 'smoke',
 'alco',
 'active']

In [36]:
# Request handlers and Flask startup
app = Flask(__name__)

@app.route("/", methods=["GET"])
def general():
    """Landing endpoint: answers GET / with a fixed greeting string."""
    return "Welcome to prediction process!"

@app.route('/predict', methods=['POST'])
def predict():
    """
    Score one observation sent as a JSON object.

    Request body: a JSON object keyed by the names in `columns`. A key that is
    absent, or whose value is falsy, is filled with 0.

    Returns
    -------
    On success, a JSON response `{"success": true, "predictions": <float>}`
    where the float is column 1 of `classifier.predict_proba` for the row.
    When the body is not a JSON object, `{"success": false, "error": <message>}`
    with HTTP status 400.
    """
    data = {"success": False}

    # Read the JSON sent by the client; silent=True yields None instead of
    # raising when the body is missing or is not valid JSON.
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        data["error"] = "Request body must be a JSON object"
        return jsonify(data), 400

    # One value per expected column. .get() makes an absent key fall back to 0
    # instead of raising KeyError; falsy values are replaced by 0 as well.
    row = {column: request_json.get(column) or 0 for column in columns}

    # Single-row frame for the classifier, columns in the training order
    features = pd.DataFrame([row], columns=columns)

    preds = classifier.predict_proba(features)

    # Cast to a plain float so the value serializes to JSON whatever NumPy
    # scalar type predict_proba produced.
    data["predictions"] = float(preds[0, 1])
    data["success"] = True

    print(data)

    return jsonify(data)


# Start Flask's built-in server only when this code runs as the main module.
# app.run() does not return until the server is stopped.
if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [28/Sep/2023 13:49:02] "POST //predict HTTP/1.1" 200 -


{'success': True, 'predictions': 0.2556308068181836}


127.0.0.1 - - [28/Sep/2023 13:49:05] "POST //predict HTTP/1.1" 200 -


{'success': True, 'predictions': 0.40698110911061375}


127.0.0.1 - - [28/Sep/2023 13:49:08] "POST //predict HTTP/1.1" 200 -


{'success': True, 'predictions': 0.1798425122400956}
