In [4]:
import sys
sys.path.append('./')
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd

from randForest import RandomForest

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score

In [12]:
def cross_val_score(model, X,y):
    acc_scores = []
    prec_scores = []
    roc_scores = []
    
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        acc_score = accuracy_score(y_test,y_pred)
        prec_score = precision_score(y_test,y_pred)
        roc_score = roc_auc_score(y_test,y_pred)
        acc_scores.append(acc_score)
        prec_scores.append(prec_score)
        roc_scores.append(roc_score)

    score = np.mean(acc_scores), np.mean(prec_scores), np.mean(roc_scores)
    return score
    
def check_retrain(model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    acc = accuracy_score(y_pred_train,y_train), accuracy_score(y_pred_test,y_test)
    prec = precision_score(y_pred_train,y_train), precision_score(y_pred_test,y_test)
    roc= roc_auc_score(y_pred_train,y_train),roc_auc_score(y_pred_test,y_test)
    print('Ошибки на выборках')
    print('            Train                  Test')
    print(f'Accuracy  = {acc[0]}  |  {acc[1]}')
    print(f'Precision = {prec[0]}  |  {prec[1]}')
    print(f'Roc auc   = {roc[0]}  |  {roc[1]}')
    
def print_error_validation(accuracy,precission,roc_auc):
    print('Ошибки на валидации')
    print(f'accuracy = {accuracy}')
    print(f'precision = {precision}')
    print(f'roc auc = {roc_auc}')

def search_parametrs(X,y,n_estimators_list,max_depth_list, min_size_list):
    best_max_depth = None
    best_min_size = None
    best_n_estimators = None
    best_roc = -10
    for n_est in n_estimators_list:
        for max_depth in max_depth_list:
            for min_size in min_size_list:
                model = RandomForest(n_estimators=n_est,max_depth=max_depth,min_size=min_size)
                accuracy,precision,roc_auc = cross_val_score(model,X,y)
                if roc_auc > best_roc:
                    best_n_estimators, best_max_depth, best_min_size = n_est, max_depth,min_size
    return best_n_estimators, best_max_depth, best_min_size

In [9]:
mobile_data = pd.read_csv('newRain1.csv')
mobile_data.drop(['Unnamed: 0'], axis='columns', inplace=True)
X, Y = mobile_data.drop(['RainTomorrow'], axis=1).values, mobile_data['RainTomorrow'].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [10]:
n_estimators, max_depth,min_size = search_parametrs(X,Y,n_estimators_list=np.arange(10,60,10),
                                                    max_depth_list=np.arange(6,10,1), min_size_list=np.arange(4,8,1))

In [11]:
n_estimators, max_depth, min_size

(50, 9, 7)

In [13]:
clf =  RandomForest(n_estimators=n_estimators,max_depth=max_depth,min_size=min_size)
accuracy,precision,roc_auc = cross_val_score(clf,X,Y)
print_error_validation(accuracy,precision,roc_auc)

Ошибки на валидации
accuracy = 0.61
precision = 0.6873669467787116
roc auc = 0.5906870906870907


In [14]:
check_retrain(clf,x_train,x_test, y_train, y_test)

Ошибки на выборках
            Train                  Test
Accuracy  = 0.6285714285714286  |  0.6666666666666666
Precision = 0.9069767441860465  |  0.9444444444444444
Roc auc   = 0.5974499089253188  |  0.701923076923077


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
scoring = ['precision_macro', 'roc_auc','accuracy']
scoring_test = ['test_accuracy','test_precision_macro','test_roc_auc']

In [17]:
clf = RandomForestClassifier()
parameters = {'n_estimators':np.arange(10,60,10),'max_depth':np.arange(6,10,1), 'min_samples_leaf':np.arange(4,8,1)}
clf_cv = GridSearchCV(clf, parameters)
clf_cv.fit(X,Y)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
           

In [18]:
clf_cv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=40,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
n_estimators, max_depth, min_size = 40, 6, 4

In [22]:
clf = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth, min_samples_leaf=min_size)
scores = cross_validate(clf,X,Y,cv=5,scoring=scoring)

for score in scoring_test:
    print(f'{score} = {np.mean(scores[score])}')

test_accuracy = 1.0
test_precision_macro = 1.0
test_roc_auc = 1.0


In [24]:
check_retrain(clf,x_train,x_test, y_train, y_test)


Ошибки на выборках
            Train                  Test
Accuracy  = 1.0  |  1.0
Precision = 1.0  |  1.0
Roc auc   = 1.0  |  1.0
