In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import os
import sklearn


# Get DataSet

### Preprocessing

In [5]:
# Directory holding the raw CSV files. It can be overridden through the
# RAW_DATA_DIR environment variable so the notebook is not tied to a single
# machine; the original absolute path stays as the fallback, so behavior is
# unchanged when the variable is not set.
raw_path = os.environ.get(
    'RAW_DATA_DIR',
    '/Users/alexgiving/Documents/PythonProjects/class_ml_lab1/data/raw/',
)
train = pd.read_csv(os.path.join(raw_path, 'train.csv'))
test = pd.read_csv(os.path.join(raw_path, 'test.csv'))

In [6]:
# Label columns: removed from the feature frame and kept separately as `target`.
TARGET_COLS = [
    'Артериальная гипертензия',
    'ОНМК',
    'Стенокардия, ИБС, инфаркт миокарда',
    'Сердечная недостаточность',
    'Прочие заболевания сердца',
]

# Single-column names used by the helper functions.
ID_COL = 'ID'
EDU_COL = 'Образование'
SEX_COL = 'Пол'

# Columns that cast_types converts to the pandas `category` dtype.
CAT_COLS = [
    'Пол',
    'Семья',
    'Этнос',
    'Национальность',
    'Религия',
    'Образование',
    'Профессия',
    'Статус Курения',
    'Частота пасс кур',
    'Алкоголь',
    'Время засыпания',
    'Время пробуждения',
]

# Flag-like columns; cast_types casts the numeric ones among them to int8.
# NOTE(review): 'Туберкулез легких ' carries a trailing space, reproduced here
# exactly as in the original list — confirm it matches the CSV header.
OHE_COLS = [
    'Пол',
    'Вы работаете?',
    'Выход на пенсию',
    'Прекращение работы по болезни',
    'Сахарный диабет',
    'Гепатит',
    'Онкология',
    'Хроническое заболевание легких',
    'Бронжиальная астма',
    'Туберкулез легких ',
    'ВИЧ/СПИД',
    'Регулярный прим лекарственных средств',
    'Травмы за год',
    'Переломы',
    'Пассивное курение',
    'Сон после обеда',
    'Спорт, клубы',
    'Религия, клубы',
]

# Columns that cast_types converts to float32.
REAL_COLS = [
    'Возраст курения',
    'Сигарет в день',
    'Возраст алког',
]

In [7]:
def set_idx(df: pd.DataFrame, idx_col: str) -> pd.DataFrame:
    """Return a new frame that uses ``idx_col`` as its index.

    Args:
        df: Frame containing the column ``idx_col``.
        idx_col: Name of the column to promote to the index.

    Returns:
        The re-indexed frame; the input frame is not modified.
    """
    return df.set_index(idx_col)

In [8]:
# Index both raw frames by the identifier column.
train, test = (frame.set_index(ID_COL) for frame in (train, test))

In [9]:
# Separate the label columns from the features.
target = train[TARGET_COLS]
train = train.drop(columns=TARGET_COLS)

In [10]:
# Summary statistics of the feature frame (displayed as the cell output).
train.describe()

Unnamed: 0,Вы работаете?,Выход на пенсию,Прекращение работы по болезни,Сахарный диабет,Гепатит,Онкология,Хроническое заболевание легких,Бронжиальная астма,Туберкулез легких,ВИЧ/СПИД,Регулярный прим лекарственных средств,Травмы за год,Переломы,Возраст курения,Сигарет в день,Пассивное курение,Возраст алког,Сон после обеда,"Спорт, клубы","Религия, клубы"
count,955.0,955.0,955.0,955.0,955.0,955.0,955.0,955.0,955.0,955.0,955.0,955.0,955.0,412.0,409.0,955.0,788.0,955.0,955.0,955.0
mean,0.52356,0.335079,0.048168,0.105759,0.120419,0.045026,0.075393,0.042932,0.020942,0.002094,0.642932,0.053403,0.353927,20.300971,14.220049,0.234555,20.038071,0.226178,0.068063,0.023037
std,0.499706,0.472265,0.214232,0.30769,0.325621,0.20747,0.264162,0.20281,0.143267,0.045739,0.479386,0.224954,0.478437,8.039846,10.15471,0.423942,5.177517,0.418575,0.251986,0.150098
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0,6.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,6.0,0.0,17.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,18.0,10.0,0.0,19.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,21.0,20.0,0.0,20.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,58.0,60.0,1.0,63.0,1.0,1.0,1.0


In [11]:
def drop_unnecesary_id(df: pd.DataFrame) -> pd.DataFrame:
    """Remove the ``ID_y`` column when the frame has one.

    Args:
        df: Frame that may contain an ``ID_y`` column.

    Returns:
        A new frame without ``ID_y`` when the column exists; otherwise the
        very same ``df`` object, untouched.
    """
    has_extra_id = 'ID_y' in df.columns
    return df.drop(columns=['ID_y']) if has_extra_id else df

In [12]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Run the preprocessing helpers on ``df`` in a fixed order.

    The steps are: index by ``ID_COL``, drop the stray ``ID_y`` column,
    fill missing values of the sex column, then cast column dtypes.

    Args:
        df: Raw frame that still has ``ID_COL`` as a regular column.

    Returns:
        The frame produced by the last step (``cast_types``).
    """
    return (
        df
        .pipe(set_idx, ID_COL)
        .pipe(drop_unnecesary_id)
        .pipe(fill_sex)
        .pipe(cast_types)
    )

In [16]:
# Drop the stray ``ID_y`` column from the training features, if present.
train = train.pipe(drop_unnecesary_id)

In [28]:
def add_ord_edu(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ordinal education column derived from the label's first character.

    The first character of every ``EDU_COL`` value (e.g. ``'5'`` in
    ``'5 - ВУЗ'``) is parsed as a number and stored in ``f'{EDU_COL}_ord'``.

    Args:
        df: Frame containing ``EDU_COL``. It is modified in place.

    Returns:
        The same ``df`` object with the extra column. The column is ``int8``
        when every row has a value, and nullable ``Int8`` (missing rows stay
        ``<NA>``) when some education values are missing.

    Raises:
        ValueError: If a non-missing label does not start with a digit.
    """
    # to_numeric lets missing labels pass through as NaN instead of failing,
    # while a non-numeric first character still raises ValueError.
    level = pd.to_numeric(df[EDU_COL].str.slice(0, 1))

    if level.isna().any():
        # A plain int8 column cannot hold missing values; use the nullable dtype.
        level = level.astype('Int8')
    else:
        level = level.astype(np.int8)

    df[f'{EDU_COL}_ord'] = level
    return df

In [29]:
# Add the ordinal education feature to both the training and the test frame.
train, test = add_ord_edu(train), add_ord_edu(test)

In [63]:
def fill_sex(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values of ``SEX_COL`` with its most frequent value.

    Args:
        df: Frame containing ``SEX_COL``. It is modified in place.

    Returns:
        The same ``df`` object. When the column has no observed value at all
        (empty frame or every row missing) the frame is returned unchanged,
        because there is nothing to fill with.
    """
    # value_counts sorts by frequency in descending order, so the first entry
    # is the most common value.
    counts = df[SEX_COL].value_counts()

    # No observed values: an empty result, or (for categorical data) only
    # zero-count categories. Indexing [0] would fail or pick an arbitrary one.
    if counts.empty or counts.iloc[0] == 0:
        return df

    df[SEX_COL] = df[SEX_COL].fillna(counts.index[0])
    return df

In [70]:
def cast_types(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the known column groups of ``df`` to compact dtypes.

    * ``CAT_COLS``  -> ``category``
    * numeric columns among ``OHE_COLS`` -> ``int8``
    * ``REAL_COLS`` -> ``float32``

    Args:
        df: Frame containing all columns listed in ``CAT_COLS``, ``OHE_COLS``
            and ``REAL_COLS``. It is modified in place.

    Returns:
        The same ``df`` object with the converted columns.
    """
    df[CAT_COLS] = df[CAT_COLS].astype('category')

    # Pick the numeric flag columns from the frame being processed, not from
    # the notebook-level `train` variable: the function must work on any frame
    # (e.g. the test set) and must not depend on hidden kernel state.
    ohe_int_cols = df[OHE_COLS].select_dtypes('number').columns
    df[ohe_int_cols] = df[ohe_int_cols].astype(np.int8)

    df[REAL_COLS] = df[REAL_COLS].astype(np.float32)
    return df
    

# MODELING

In [5]:
import os
import pandas as pd
import numpy as np

In [6]:
from sklearn.svm import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.compose import *
from sklearn.pipeline import *
from sklearn.metrics import *
from sklearn.impute import *
from sklearn.multioutput import *

In [3]:
import src.config as cfg

ModuleNotFoundError: No module named 'src'

In [8]:
# Seed passed as `random_state` to train_test_split further down.
RS = 77

In [9]:
# Load the preprocessed features and labels from their pickle files.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
processed_data_path = 'data/processed/'
train, target = (
    pd.read_pickle(os.path.join(processed_data_path, file_name))
    for file_name in ('train.pkl', 'target.pkl')
)

In [10]:
# Convert the categorical feature columns to plain `object` dtype.
# NOTE(review): presumably done so the constant 'NA' imputation in the
# categorical pipeline can introduce a value outside the categories — confirm.
# NOTE(review): depends on `cfg`; the saved output of its import cell shows a
# ModuleNotFoundError, so this cell fails on a fresh kernel as exported.
train[cfg.CAT_COLS] = train[cfg.CAT_COLS].astype('object')

In [11]:
# int8_cols = train.head(0).select_dtypes('int8').columns
# train[int8_cols] = train[int8_cols].astype(np.int32)
# target = target.astype(np.int32)

In [12]:
from functools import partial

In [13]:
# fbeta_score with `beta` pre-bound to 2.0.
# NOTE(review): no other visible cell references this callable; the
# cross-validation cell passes scoring='f1_samples' instead — confirm intent.
scoring = partial(fbeta_score, beta=2.0)

In [14]:
# Keep 80% of the rows for training and hold out the rest for validation;
# the split is seeded with RS.
train_data, val_data, train_target, val_target = train_test_split(
    train,
    target,
    train_size=0.8,
    random_state=RS,
)

In [15]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0_level_0,Пол,Семья,Этнос,Национальность,Религия,Образование,Профессия,Вы работаете?,Выход на пенсию,Прекращение работы по болезни,...,Сигарет в день,Пассивное курение,Частота пасс кур,Алкоголь,Возраст алког,Время засыпания,Время пробуждения,Сон после обеда,"Спорт, клубы","Религия, клубы"
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54-103-054-01,Ж,в браке в настоящее время,европейская,Русские,Христианство,5 - ВУЗ,дипломированные специалисты,1,0,0,...,,0,,употребляю в настоящее время,26.0,22:00:00,06:00:00,0,1,0
54-102-299-01,Ж,вдовец / вдова,европейская,Русские,Христианство,5 - ВУЗ,дипломированные специалисты,0,1,0,...,,0,,никогда не употреблял,,22:30:00,08:30:00,0,1,0
54-102-138-01,Ж,никогда не был(а) в браке,европейская,Русские,Христианство,5 - ВУЗ,низкоквалифицированные работники,1,0,0,...,,0,,никогда не употреблял,,22:00:00,06:30:00,0,0,0
54-601-033-01,Ж,в браке в настоящее время,европейская,Русские,Христианство,4 - профессиональное училище,операторы и монтажники установок и машинного о...,0,1,0,...,,0,,употребляю в настоящее время,20.0,22:00:00,07:00:00,0,0,0
54-602-054-01,Ж,в разводе,европейская,Русские,Христианство,4 - профессиональное училище,ведение домашнего хозяйства,0,1,0,...,10.0,0,,употребляю в настоящее время,18.0,00:00:00,07:00:00,1,0,0


In [16]:
# Pipeline for REAL_COLS: impute missing values with SimpleImputer's defaults,
# then standardize the result.
real_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler()),
    ]
)

In [17]:
def _dense_one_hot_encoder():
    """Build a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old name, so the new keyword is tried first and the old
    one is used only when the installed version rejects it.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# Pipeline for CAT_COLS: replace missing values with the literal 'NA', then
# one-hot encode into a dense array.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('ohe', _dense_one_hot_encoder())
])

In [18]:
# Route each column group to its transformer: numeric columns through
# real_pipe, categorical columns through cat_pipe, and the OHE_COLS group
# unchanged ('passthrough').
preprocess_pipe = ColumnTransformer(
    transformers=[
        ('real_cols', real_pipe, cfg.REAL_COLS),
        ('cat_cols', cat_pipe, cfg.CAT_COLS),
        ('ohe_cols', 'passthrough', cfg.OHE_COLS),
    ]
)

In [19]:
# Linear SVM classifier. It is seeded with RS so repeated runs of the notebook
# give the same fitted model (LinearSVC uses the seed when shuffling data in
# its dual solver).
model = LinearSVC(random_state=RS)

In [20]:
# Full single-target pipeline: column preprocessing followed by the classifier.
model_pipe = Pipeline(
    steps=[
        ('preprocess', preprocess_pipe),
        ('model', model),
    ]
)

In [21]:
# Wrap the pipeline so it is applied to each target column, using up to
# four parallel jobs.
multiout_model_pipe = MultiOutputClassifier(
    estimator=model_pipe,
    n_jobs=4,
)

In [24]:
# 3-fold cross-validation of the multi-output pipeline on the training split,
# scored with the 'f1_samples' scorer. The folds are evaluated sequentially
# (n_jobs=1) at this level.
# NOTE(review): the saved output shows F-score warnings from `_warn_prf` —
# presumably samples with no true and no predicted labels; confirm.
scores = cross_val_score(
    estimator=multiout_model_pipe,
    X=train_data,
    y=train_target,
    scoring='f1_samples',
    cv=3,
    n_jobs=1
)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [25]:
# Display the per-fold scores returned by cross_val_score.
scores

array([0.24287582, 0.21764706, 0.23425197])