In [22]:
import sys
sys.path.append('./')
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
from knn import KNN
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

In [23]:
def print_error_validation(accuracy, precission, roc_auc):
    print('Ошибки на кросс валидации:')
    print(f'Accuracy  = {accuracy}')
    print(f'Precision = {precision}')
    print(f'Roc auc   = {roc_auc}')
    
def retrain(model, x_train, x_test, y_train, y_test):
    
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_train = np.column_stack((x_train, y_train))
    
    model.fit(x_train, y_train)
    
    y_pred_test = model.predict(x_train, x_test)
    y_pred_train = model.predict(x_train, x_train)
    
    acc = accuracy_score(y_pred_train, y_train), accuracy_score(y_pred_test, y_test)
    
    prec = precision_score(y_pred_train, y_train, pos_label='positive', average='micro'), precision_score(y_pred_test, y_test, pos_label='positive', average='micro')
    
    roc = roc_auc_score(pd.get_dummies(y_train), pd.get_dummies(y_pred_train)), roc_auc_score(pd.get_dummies(y_test), pd.get_dummies(y_pred_test))
    
    print('Ошибки на выборках')
    print('            Train                  Test')
    print(f'Accuracy  = {acc[0]}  |  {acc[1]}')
    print(f'Precision = {prec[0]}  |  {prec[1]}')
    print(f'Roc auc   = {roc[0]}  |  {roc[1]}')
    
def sklearn_retrain(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    
    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    
    acc = accuracy_score(y_pred_train, y_train), accuracy_score(y_pred_test, y_test)
    
    prec = precision_score(y_pred_train, y_train, pos_label='positive', average='micro'), precision_score(y_pred_test, y_test, pos_label='positive', average='micro')
    
    roc = roc_auc_score(pd.get_dummies(y_train), pd.get_dummies(y_pred_train)), roc_auc_score(pd.get_dummies(y_test), pd.get_dummies(y_pred_test))
    
    print('Ошибки на выборках')
    print('            Train                  Test')
    print(f'Accuracy  = {acc[0]}  |  {acc[1]}')
    print(f'Precision = {prec[0]}  |  {prec[1]}')
    print(f'Roc auc   = {roc[0]}  |  {roc[1]}')
    
def search_n_neighbors(x_train, y_train, x_test, y_test, n_neighbors_list):
    best_n = None
    best_roc = -10
    for n_n in n_neighbors_list:
        model = KNN(n_n, 5)
        accuracy, precision, roc_auc = cross_val_score(model, x_train.values, y_train.values, x_test.values, y_test.values)
        if roc_auc > best_roc:
            best_n = n_n

In [24]:
def cross_val_score(model, x_train, y_train, x_test, y_test):
    acc_scores = []
    prec_scores = []
    roc_scores = []

    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_train = np.column_stack((x_train, y_train))
    
    sc = model.fit(x_train, y_train)
    
    print(f'CV scores: {sc}')
    mn = sum(sc) / float(len(sc))
    print(f'Train data accuracy: {mn}')
    
    y_pred = model.predict(x_train, x_test)
    
    acc_scores.append(accuracy_score(y_test, y_pred))
    prec_scores.append(precision_score(y_test, y_pred, pos_label='positive', average='micro'))
    roc_scores.append(roc_auc_score(pd.get_dummies(y_test), 
                                    pd.get_dummies(y_pred)))
    
    score = np.mean(acc_scores), np.mean(prec_scores), np.mean(roc_scores)
    return score

In [25]:
mobile_data = pd.read_csv('newRain.csv')
X, Y = mobile_data.drop(['RainTomorrow'], axis=1), mobile_data['RainTomorrow']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [26]:
mobile_data.drop(['Unnamed: 0'], axis='columns', inplace=True)
mobile_data

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Cloud9am,Cloud3pm,Temp9am,RainToday,RISK_MM,RainTomorrow,year,quarter,month,day
0,17.5,32.3,1.0,0.0,41.0,11.0,7.0,20.0,82.0,33.0,7.0,8.0,17.8,0,0.2,0,2008,4,12,5
1,15.9,21.7,2.2,6.0,31.0,8.0,15.0,13.0,89.0,91.0,8.0,8.0,15.9,1,15.6,1,2008,4,12,12
2,15.9,18.6,15.6,0.0,61.0,4.0,28.0,28.0,76.0,93.0,8.0,8.0,17.4,1,3.6,1,2008,4,12,13
3,14.1,20.9,0.0,8.0,22.0,14.0,11.0,9.0,69.0,82.0,8.0,1.0,17.2,0,16.8,1,2008,4,12,17
4,13.5,22.9,16.8,0.0,63.0,1.0,6.0,20.0,80.0,65.0,8.0,1.0,18.0,1,10.6,1,2008,4,12,18
5,12.5,24.2,1.2,1.0,50.0,7.0,11.0,22.0,78.0,70.0,8.0,8.0,12.5,1,0.8,0,2008,4,12,30
6,24.4,34.0,0.6,11.0,98.0,4.0,26.0,48.0,60.0,39.0,4.0,1.0,26.1,0,6.4,1,2009,1,1,22
7,18.4,31.2,0.4,12.0,37.0,2.0,7.0,11.0,66.0,31.0,7.0,8.0,18.7,0,0.0,0,2009,1,2,9
8,16.1,21.6,0.0,12.0,46.0,3.0,11.0,15.0,58.0,69.0,2.0,8.0,17.0,0,3.0,1,2009,1,2,12
9,12.9,29.6,0.0,1.0,46.0,1.0,7.0,24.0,54.0,14.0,6.0,7.0,18.7,0,0.0,0,2009,1,2,24


In [27]:
n_neighbors = search_n_neighbors(x_train, y_train, x_test, y_test, np.arange(3,12,2))
print(f'Best n: {n_neighbors}')

CV scores: [0.8085714285714286, 0.7771428571428571, 0.8242857142857143, 0.8114285714285714, 0.8228571428571428]
Train data accuracy: 0.8088571428571429
CV scores: [0.7985714285714286, 0.8285714285714286, 0.8271428571428572, 0.8057142857142857, 0.8085714285714286]
Train data accuracy: 0.8137142857142857
CV scores: [0.8157142857142857, 0.7914285714285715, 0.8171428571428572, 0.8042857142857143, 0.8042857142857143]
Train data accuracy: 0.8065714285714286
CV scores: [0.8214285714285714, 0.8114285714285714, 0.79, 0.8185714285714286, 0.8157142857142857]
Train data accuracy: 0.8114285714285714
CV scores: [0.7914285714285715, 0.8085714285714286, 0.8042857142857143, 0.8071428571428572, 0.8071428571428572]
Train data accuracy: 0.8037142857142857
Best n: None


In [28]:
model = KNN(5)
accuracy, precision, roc_auc = cross_val_score(model, x_train.values, y_train.values, x_test.values, y_test.values)
print_error_validation(accuracy, precision, roc_auc)

CV scores: [0.8085714285714286, 0.8071428571428572, 0.7928571428571428, 0.8257142857142857, 0.82]
Train data accuracy: 0.8108571428571428
Ошибки на кросс валидации:
Accuracy  = 0.808
Precision = 0.808
Roc auc   = 0.6723575840768703


In [29]:
retrain(model, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                  Test
Accuracy  = 0.8697142857142857  |  0.808
Precision = 0.8697142857142857  |  0.808
Roc auc   = 0.7682552566057189  |  0.6723575840768703


In [30]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)

neigh.fit(x_train, y_train)
y_pred = neigh.predict(x_test)

print(f'Ошибка на выборках:\nAccuracy = {accuracy_score(y_test, y_pred)}')

Ошибка на выборках:
Accuracy = 0.8093333333333333


In [31]:
sklearn_retrain(neigh, x_train.values, x_test.values, y_train.values, y_test.values)

Ошибки на выборках
            Train                  Test
Accuracy  = 0.8697142857142857  |  0.8093333333333333
Precision = 0.8697142857142857  |  0.8093333333333333
Roc auc   = 0.7682552566057189  |  0.6678391672386181
