In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, OneHotEncoder, PolynomialFeatures, SplineTransformer, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict

from sklearn.feature_selection import SelectKBest, RFECV, SelectFromModel
from sklearn.feature_selection import f_classif, f_regression
from sklearn.decomposition import PCA
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.neural_network import MLPClassifier

from statistics import median, mean


from imblearn.over_sampling import SMOTE

In [2]:
train_df = pd.read_csv('./smoking_train.csv')
test_df = pd.read_csv('./smoking_test.csv')

target = 'smoking'  
train_df = train_df.rename(columns={target: 'target'})

In [3]:
train_df.head()

Unnamed: 0,ID,gender,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),...,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,oral,dental caries,tartar,target
0,2,M,55.0,170,60.0,80.0,0.8,0.8,1.0,1.0,...,15.8,1.0,1.0,21.0,16.0,22.0,Y,0,N,yes
1,3,M,40.0,165,70.0,88.0,1.5,1.5,1.0,1.0,...,14.7,1.0,1.0,19.0,26.0,18.0,Y,0,Y,no
2,4,F,40.0,155,60.0,86.0,1.0,1.0,1.0,1.0,...,12.5,1.0,0.6,16.0,14.0,22.0,Y,0,N,no
3,5,M,30.0,180,75.0,85.0,1.2,1.2,1.0,1.0,...,16.2,1.0,1.2,18.0,27.0,33.0,Y,0,Y,no
4,6,M,40.0,160,60.0,85.5,1.0,1.0,1.0,1.0,...,17.0,1.0,0.7,21.0,27.0,39.0,Y,1,Y,yes


### Отчистка и обработка данных

In [4]:
# посмотрим на корреляции между фичами

train_df.corr().style.background_gradient(cmap='coolwarm').set_precision(2)

  train_df.corr().style.background_gradient(cmap='coolwarm').set_precision(2)


Unnamed: 0,ID,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries
ID,1.0,-0.0,0.01,0.0,0.0,0.01,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,0.01,-0.0,0.0,-0.01,-0.01,-0.0,0.0
age,-0.0,1.0,-0.48,-0.33,-0.03,-0.2,-0.18,0.2,0.21,0.13,0.05,0.18,0.06,0.02,0.01,0.04,-0.26,0.03,-0.11,0.03,-0.06,0.01,-0.12
height(cm),0.01,-0.48,1.0,0.68,0.38,0.15,0.15,-0.08,-0.08,0.08,0.11,0.01,-0.08,0.16,-0.21,-0.05,0.54,0.01,0.38,0.04,0.12,0.14,0.08
weight(kg),0.0,-0.33,0.68,1.0,0.82,0.11,0.11,-0.05,-0.06,0.26,0.27,0.13,0.02,0.32,-0.36,0.04,0.49,0.04,0.32,0.12,0.24,0.2,0.07
waist(cm),0.0,-0.03,0.38,0.82,1.0,0.02,0.04,0.02,0.02,0.32,0.29,0.21,0.06,0.36,-0.37,0.07,0.39,0.05,0.23,0.14,0.24,0.24,0.04
eyesight(left),0.01,-0.2,0.15,0.11,0.02,1.0,0.34,-0.05,-0.05,-0.02,0.0,-0.04,-0.01,0.02,-0.01,-0.01,0.09,0.01,0.07,-0.01,0.02,0.0,0.01
eyesight(right),0.0,-0.18,0.15,0.11,0.04,0.34,1.0,-0.04,-0.04,-0.01,0.01,-0.04,-0.01,0.02,-0.02,-0.01,0.09,-0.01,0.05,-0.01,0.02,0.01,0.02
hearing(left),0.0,0.2,-0.08,-0.05,0.02,-0.05,-0.04,1.0,0.51,0.05,0.01,0.04,-0.02,0.01,-0.02,-0.01,-0.03,0.01,0.01,0.02,0.01,0.01,-0.02
hearing(right),-0.0,0.21,-0.08,-0.06,0.02,-0.05,-0.04,0.51,1.0,0.05,-0.0,0.04,-0.02,0.0,-0.02,-0.02,-0.03,0.02,0.01,0.01,-0.0,0.01,-0.02
systolic,-0.0,0.13,0.08,0.26,0.32,-0.02,-0.01,0.05,0.05,1.0,0.76,0.17,0.06,0.2,-0.09,0.02,0.18,0.04,0.07,0.08,0.09,0.16,0.03


In [5]:
train_df.isna().sum(0)

ID                       0
gender                   0
age                    440
height(cm)               0
weight(kg)              44
waist(cm)                0
eyesight(left)           0
eyesight(right)          0
hearing(left)            0
hearing(right)           0
systolic                 0
relaxation               0
fasting blood sugar      0
Cholesterol              0
triglyceride             0
HDL                      0
LDL                      0
hemoglobin               0
Urine protein            0
serum creatinine         0
AST                      0
ALT                      0
Gtp                      0
oral                     0
dental caries            0
tartar                   0
target                   0
dtype: int64

In [23]:

median_age = median(train_df['age'].dropna())
median_age

40.0

In [24]:
median_weight = median(train_df['weight(kg)'].dropna())
median_weight

65.0

In [8]:

train_df['age'] = train_df['age'].fillna(median_age)
test_df['age'] = test_df['age'].fillna(median_age)

train_df['weight(kg)'] = train_df['weight(kg)'].fillna(median_weight)
test_df['weight(kg)'] = train_df['weight(kg)'].fillna(median_weight)

In [9]:
train_df.nunique() 

ID                     44554
gender                     2
age                       15
height(cm)                13
weight(kg)                23
waist(cm)                548
eyesight(left)            19
eyesight(right)           17
hearing(left)              2
hearing(right)             2
systolic                 127
relaxation                94
fasting blood sugar      270
Cholesterol              278
triglyceride             388
HDL                      124
LDL                      277
hemoglobin               142
Urine protein              6
serum creatinine          37
AST                      207
ALT                      240
Gtp                      464
oral                       1
dental caries              2
tartar                     2
target                     2
dtype: int64

In [10]:
pd.concat([train_df,test_df]).nunique() - train_df.nunique()

ID                     11138
gender                     0
age                        0
height(cm)                 0
weight(kg)                 0
waist(cm)                 18
eyesight(left)             0
eyesight(right)            0
hearing(left)              0
hearing(right)             0
systolic                   3
relaxation                 1
fasting blood sugar        6
Cholesterol                8
triglyceride               2
HDL                        2
LDL                       12
hemoglobin                 3
Urine protein              0
serum creatinine           1
AST                       12
ALT                        5
Gtp                       24
oral                       0
dental caries              0
tartar                     0
target                     0
dtype: int64

In [11]:
class ColumnDropperTransformer():
    def __init__(self,columns):
        self.columns=columns

    def transform(self,X,y=None):
        return X.drop(self.columns,axis=1)

    def fit(self, X, y=None):
        return self 

In [12]:
cat_cols = [x for x in train_df.nunique()[train_df.nunique() < 25].index.to_list() if x != 'target']
num_cols = [column for column in train_df.columns if column not in cat_cols + ['target']]

cat_steps = [
    ('onehot', OneHotEncoder(handle_unknown='ignore',)),
]
num_steps = [
    ('dropper', ColumnDropperTransformer(['ID'])),
    ('scaler1', StandardScaler()),
    ('pwr', PowerTransformer()),
    ('poly', PolynomialFeatures(3, include_bias=False)),
    ('scaler', StandardScaler()),
]

cat_transformer = Pipeline(cat_steps)
num_transformer = Pipeline(num_steps)
preprocessor = ColumnTransformer(
    [
        ('num',  num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ]
)


model = LogisticRegression(max_iter= 2000, random_state=42)


pipe = Pipeline([
    ('prep', preprocessor),
    ('model', model)
])

In [13]:
enc = LabelEncoder()

X = train_df.drop('target', axis=1)
y = enc.fit_transform(train_df['target'])

In [22]:
X.head()

cv_oupt = cross_validate(pipe, X, y, cv=5, scoring='f1', return_estimator=True)
scores = cv_oupt['test_score']
models = cv_oupt['estimator']

In [15]:
scores.mean(), scores.std() 

(0.6995487383388944, 0.03910482112434293)

In [16]:
preds = []
for model in models:
    pred = model.predict_proba(test_df)
    preds.append(pred)

In [17]:
preds = np.array(preds)

In [18]:
subm_preds = enc.inverse_transform(preds.mean(axis=0).argmax(axis=1))

In [19]:
subm_df = pd.DataFrame({'ID': test_df.ID, 'smoking': subm_preds})

In [20]:
subm_df

Unnamed: 0,ID,smoking
0,48715,yes
1,49650,yes
2,18013,no
3,24282,no
4,9215,no
...,...,...
11133,20787,yes
11134,5000,no
11135,36617,no
11136,38229,yes


In [21]:
subm_df.to_csv('subm.csv', index=False)