## Постановка задачи
Загрузим данные и разделим выборку на обучающую/проверочную в соотношении 80/20.

Применим метод ближайших соседей (kNN) для классификации скоринга. Будем использовать только биометрические данные.

Проверим качество предсказания через каппа-метрику и матрицу неточностей.

Данные:
* https://video.ittensive.com/machine-learning/prudential/train.csv.gz

Соревнование: https://www.kaggle.com/c/prudential-life-insurance-assessment/

© ITtensive, 2020

### Подключение библиотек

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

### Загрузка данных

In [2]:
data = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/train.csv.gz")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB


### Разделение данных

In [8]:
column_data = ['Ins_Age', 'Ht', 'Wt', 'BMI','Response']
for i in range(1,10):
    column_data.append(f'Insurance_History_{i}')

for i in range(1,6):
    column_data.append(f'Family_Hist_{i}')

for i in range(1,42):
    column_data.append(f'Medical_History_{i}')

for i in range(1,8):
    column_data.append(f'InsurеdInfo_{i}')

print(column_data)

['Ins_Age', 'Ht', 'Wt', 'BMI', 'Response', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_6', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'Family_Hist_1', 'Family_Hist_2', 'Family_Hist_3', 'Family_Hist_4', 'Family_Hist_5', 'Medical_History_1', 'Medical_History_2', 'Medical_History_3', 'Medical_History_4', 'Medical_History_5', 'Medical_History_6', 'Medical_History_7', 'Medical_History_8', 'Medical_History_9', 'Medical_History_10', 'Medical_History_11', 'Medical_History_12', 'Medical_History_13', 'Medical_History_14', 'Medical_History_15', 'Medical_History_16', 'Medical_History_17', 'Medical_History_18', 'Medical_History_19', 'Medical_History_20', 'Medical_History_21', 'Medical_History_22', 'Medical_History_23', 'Medical_History_24', 'Medical_History_25', 'Medical_History_26', 'Medical_History_27', 'Medical_History_28', 'Medical_History_29', 'Medical_History_30', 'Medical_Histo

In [9]:
data = pd.DataFrame(data=data, columns=column_data)

Unnamed: 0,Ins_Age,Ht,Wt,BMI,Response,Insurance_History_1,Insurance_History_2,Insurance_History_3,Insurance_History_4,Insurance_History_5,...,Medical_History_39,Medical_History_40,Medical_History_41,InsurеdInfo_1,InsurеdInfo_2,InsurеdInfo_3,InsurеdInfo_4,InsurеdInfo_5,InsurеdInfo_6,InsurеdInfo_7
39868,0.298507,0.781818,0.278243,0.379321,8,2,1,3,1,0.000167,...,1,3,1,,,,,,,
38496,0.119403,0.618182,0.198745,0.391944,8,2,1,3,2,0.000313,...,3,3,1,,,,,,,
22601,0.208955,0.618182,0.288703,0.571939,5,2,1,1,3,,...,3,3,1,,,,,,,
49786,0.313433,0.727273,0.271967,0.418682,7,2,1,3,2,0.0016,...,3,3,1,,,,,,,
3145,0.447761,0.8,0.508368,0.707001,2,1,1,3,1,0.001,...,3,3,1,,,,,,,


Заполняем пропуски -1

In [11]:
data.fillna(value=-1, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Data columns (total 67 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Ins_Age              59381 non-null  float64
 1   Ht                   59381 non-null  float64
 2   Wt                   59381 non-null  float64
 3   BMI                  59381 non-null  float64
 4   Response             59381 non-null  int64  
 5   Insurance_History_1  59381 non-null  int64  
 6   Insurance_History_2  59381 non-null  int64  
 7   Insurance_History_3  59381 non-null  int64  
 8   Insurance_History_4  59381 non-null  int64  
 9   Insurance_History_5  59381 non-null  float64
 10  Insurance_History_6  59381 non-null  float64
 11  Insurance_History_7  59381 non-null  int64  
 12  Insurance_History_8  59381 non-null  int64  
 13  Insurance_History_9  59381 non-null  int64  
 14  Family_Hist_1        59381 non-null  int64  
 15  Family_Hist_2        59381 non-null 

In [12]:
data_train, data_test = train_test_split(data, test_size=0.2)
data_train.head()

Unnamed: 0,Ins_Age,Ht,Wt,BMI,Response,Insurance_History_1,Insurance_History_2,Insurance_History_3,Insurance_History_4,Insurance_History_5,...,Medical_History_39,Medical_History_40,Medical_History_41,InsurеdInfo_1,InsurеdInfo_2,InsurеdInfo_3,InsurеdInfo_4,InsurеdInfo_5,InsurеdInfo_6,InsurеdInfo_7
48300,0.552239,0.709091,0.257322,0.411314,7,2,1,3,1,0.000667,...,3,3,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
56994,0.522388,0.781818,0.414226,0.587636,6,1,1,3,1,0.001333,...,3,3,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
31310,0.567164,0.690909,0.267782,0.447648,8,2,1,3,1,0.001667,...,3,3,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
58363,0.626866,0.727273,0.263598,0.404724,8,1,3,3,1,0.001667,...,3,3,3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
15244,0.373134,0.545455,0.1841,0.43893,8,2,1,1,3,-1.0,...,3,3,3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


### Расчет модели kNN (k ближайших соседей)
Примените метод ближайших соседей (kNN) для классификации скоринга, используйте k=100. Используйте биометрические данные, все столбцы Insurance_History, Family_Hist, Medical_History и InsurеdInfo. Заполните отсутствующие значения -1.

Проведите предсказание и проверьте качество через каппа-метрику.

In [13]:
#  Создает объект классификатора KNN
knn100 = KNeighborsClassifier(n_neighbors=100)

In [15]:
y = data_train["Response"]
x = data_train.drop("Response", axis=1)

knn100.fit(x, y)

In [17]:
y_test = data_test['Response']
x_test = data_test.drop("Response", axis=1)

y_predict = knn100.predict(x_test)

y_predict

array([8, 8, 8, ..., 8, 2, 8])

### Оценка модели

In [18]:
print ("kNN, 100:",
      cohen_kappa_score(y_predict, data_test["Response"],
                       weights="quadratic"))

kNN, 100: 0.14712198263052279
