## Постановка задачи
Загрузите данные, приведите их к числовым, заполните пропуски, нормализуйте данные и оптимизируйте память.

Разделите выборку на обучающую/проверочную в соотношении 80/20.

Постройте 2 модели - kNN по 100 соседей и множественную логистическую регрессию - каждую по наиболее оптимальным наборам параметров (для каждой модели), используйте для этого перекрестную проверку GridSearchCV.

Проведите предсказание и проверьте качество через каппа-метрику.

Данные:
* https://video.ittensive.com/machine-learning/prudential/train.csv.gz

Соревнование: https://www.kaggle.com/c/prudential-life-insurance-assessment/

© ITtensive, 2020

### Подключение библиотек

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

### Загрузка данных

In [2]:
data = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/train.csv.gz")
print (data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB
None


### Предобработка данных

In [3]:
data["Product_Info_2_1"] = data["Product_Info_2"].str.slice(0, 1)
data["Product_Info_2_2"] = pd.to_numeric(data["Product_Info_2"].str.slice(1, 2))
data.drop(labels=["Product_Info_2"], axis=1, inplace=True)
for l in data["Product_Info_2_1"].unique():
    data["Product_Info_2_1" + l] = data["Product_Info_2_1"].isin([l]).astype("int8")
data.drop(labels=["Product_Info_2_1"], axis=1, inplace=True)
data.fillna(value=-1, inplace=True)

### Оптимизация памяти

In [4]:
def reduce_mem_usage (df):
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == "float":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo("f2").min and c_max < np.finfo("f2").max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo("f4").min and c_max < np.finfo("f4").max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo("i1").min and c_max < np.iinfo("i1").max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo("i2").min and c_max < np.iinfo("i2").max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo("i4").min and c_max < np.iinfo("i4").max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo("i8").min and c_max < np.iinfo("i8").max:
                df[col] = df[col].astype(np.int64)
        else:
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024**2
    print('Потребление памяти меньше на', round(start_mem - end_mem, 2), 'Мб (минус', round(100 * (start_mem - end_mem) / start_mem, 1), '%)')
    return df

In [5]:
data = reduce_mem_usage(data)
print (data.info())

Потребление памяти меньше на 49.49 Мб (минус 84.9 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 133 entries, Id to Product_Info_2_1B
dtypes: float16(18), int16(1), int32(1), int8(113)
memory usage: 8.8 MB
None


### Общий набор столбцов для расчета

In [6]:
columns_groups = ["Insurance_History", "InsurеdInfo", "Medical_Keyword",
                  "Family_Hist", "Medical_History", "Product_Info"]
columns = ["Wt", "Ht", "Ins_Age", "BMI"]
for cg in columns_groups:
    columns.extend(data.columns[data.columns.str.startswith(cg)])
print (columns)

['Wt', 'Ht', 'Ins_Age', 'BMI', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'Medical_Keyword_1', 'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10', 'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13', 'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16', 'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22', 'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28', 'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keyword_31', 'Medical_Keyword_32', 'Medical_Keyword_33', 'Medical_Keyword_34', 'Medical_Keyword_35', 'Medical_Keyword_36', 'M

### Предобработка данных

In [7]:
scaler = preprocessing.StandardScaler()
data_transformed = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data,
                                                     columns=columns)))
columns_transformed = data_transformed.columns
data_transformed["Response"] = data["Response"]

### Разделение данных
Преобразуем выборки в отдельные наборы данных

In [8]:
data_train, data_test = train_test_split(data_transformed,
                                         test_size=0.2)
data_train = pd.DataFrame(data_train)
data_test = pd.DataFrame(data_test)
print (data_train.head())

              0         1         2         3         4         5         6  \
46738 -1.218699 -1.444237 -1.073002 -0.728909 -1.634368 -0.169414  0.862391   
26427 -1.453140 -0.707398 -0.164865 -1.466045  0.611857 -0.169414 -1.159587   
33182 -0.866351 -0.957397 -0.391435 -0.515160  0.611857 -0.169414  0.862391   
56864 -0.278190 -0.220558  0.743891 -0.177556  0.611857 -0.169414 -1.159587   
34433 -0.371418  0.516280  1.045984 -0.720919 -1.634368 -0.169414  0.862391   

              7         8         9  ...       109       110       111  \
46738 -1.013721  0.864260 -0.928723  ... -0.083689  0.441621 -0.149284   
26427  1.101046 -1.156735  1.130555  ... -0.083689 -2.264385 -0.149284   
33182  0.043662  0.867354 -0.928723  ... -0.083689 -2.264385 -0.149284   
56864  1.101046 -1.156735  1.130555  ... -0.083689  0.441621 -0.149284   
34433 -1.013721  0.864260 -0.928723  ... -0.083689  0.441621 -0.149284   

            112       113       114       115       116       117  Response  
46

### KNN

In [9]:
knn_model = KNeighborsClassifier(n_neighbors=100, n_jobs=-1)
knn_grid = GridSearchCV(knn_model, {}, cv=5, scoring=make_scorer(cohen_kappa_score), n_jobs=-1)

In [10]:
knn_grid.fit(X=data_train.drop(columns=["Response"]), y=data_train["Response"])

GridSearchCV(cv=5, estimator=KNeighborsClassifier(n_jobs=-1, n_neighbors=100),
             n_jobs=-1, param_grid={}, scoring=make_scorer(cohen_kappa_score))

In [26]:
y_test = knn_grid.predict(data_test.drop(columns=["Response"]))

In [27]:
round(cohen_kappa_score(data_test["Response"],
                y_test, weights="quadratic"), 3)

0.305

### Логистическая регрессия

In [21]:
lr_model = LogisticRegression(solver='liblinear', max_iter=1000)
paramert_grid = {'C': [0.001, 0.1, 10]}
lr_grid = GridSearchCV(lr_model, param_grid=paramert_grid, cv=5, n_jobs=-1, scoring=make_scorer(cohen_kappa_score))

In [22]:
%%time
lr_grid.fit(X=data_train.drop(columns=["Response"]), y=data_train["Response"])

CPU times: user 1min 4s, sys: 510 ms, total: 1min 5s
Wall time: 6min 16s


GridSearchCV(cv=5,
             estimator=LogisticRegression(max_iter=1000, solver='liblinear'),
             n_jobs=-1, param_grid={'C': [0.001, 0.1, 10]},
             scoring=make_scorer(cohen_kappa_score))

In [23]:
lr_grid.best_params_

{'C': 10}

In [24]:
round(cohen_kappa_score(data_test["Response"],
                lr_grid.predict(data_test.drop(columns=["Response"])), weights="quadratic"), 3)

0.483

Кластеризация дает 0.192, kNN(100) - 0.3, простая лог. регрессия - 0.512