In [1]:
import pandas as pd
import numpy as np
#PCA, splits and metrics
from sklearn import decomposition
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn import  metrics
#models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# data
train = pd.read_csv('../input/train.csv')
test  = pd.read_csv('../input/test.csv')
target = train["label"]
train = train.drop("label",1)

In [3]:
#decompose train data
pca = decomposition.PCA(n_components=50)
pca.fit(train)
transform_train = pca.transform(train)
transform_test = pca.transform(test)

In [4]:
class Quick_grid():
    
    def __init__(self, X, y, model, params):
        self.grid = GridSearchCV(model, params, verbose=1, cv=3, n_jobs=3)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.5, 
                                                                                random_state=5)
        self.trained_params = []
        self.train_pred = []
        self.output_pred = []
        
    def Train(self):
        self.grid.fit(self.X_train, self.y_train)
        self.trained_params = self.grid.best_estimator_
        print("Model trained")
        print(self.grid.best_estimator_)
        
    def Train_Preds(self):
        pred = self.grid.predict(self.X_test)
        print(metrics.classification_report(self.y_test, pred))
        self.train_pred = pred
    
    def Output_Preds(self, X):
        pred = self.grid.predict(X)
        print(metrics.classification_report(self.y_test, pred))
        self.output_pred = pred

In [5]:
class Stacker():
    
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.train_1 = pd.DataFrame()
        self.pred = []
        
    def Order_1(self, models):
        print("Training order 1...")
        for model in models:
            model.fit(self.X, self.y)
            self.train_1[str("model")] = model.predict(self.X)
            print("One model trained...")
        print("All models trained.")
    
    def Stack_pred(self, model, params):
        X_train, X_test, y_train, y_test = train_test_split(self.train_1, self.y, test_size=0.3, random_state=5)
        grid = GridSearchCV(model, params, verbose=3, cv=3, n_jobs=3)
        grid.fit(X_train, y_train)
        pred = grid.predict(X_test)
        self.pred = pred
        print(metrics.classification_report(y_test, pred))

In [6]:
params = {"n_neighbors": np.arange(1, 10, 2),
          "metric": ["euclidean"]}

first = Quick_grid(transform_train, target, model = KNeighborsClassifier(), params = params)

In [7]:
first.Train()

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Model trained
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')


[Parallel(n_jobs=3)]: Done  15 out of  15 | elapsed:  2.4min finished


In [8]:
first.Train_Preds()

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2040
           1       0.97      0.99      0.98      2320
           2       0.97      0.96      0.97      2034
           3       0.96      0.96      0.96      2160
           4       0.97      0.96      0.97      2047
           5       0.97      0.96      0.96      1909
           6       0.97      0.99      0.98      2069
           7       0.96      0.98      0.97      2241
           8       0.98      0.93      0.96      1992
           9       0.95      0.95      0.95      2188

   micro avg       0.97      0.97      0.97     21000
   macro avg       0.97      0.97      0.97     21000
weighted avg       0.97      0.97      0.97     21000



In [9]:
#Random Forest

param_trees = {'n_estimators': [100],
               'max_features': [4, 5, 7, 10],
               'max_depth': [5, 10, 15, None], 
               'min_samples_leaf':[5, 10],
               'random_state': [0]}

second = Quick_grid(transform_train, target, model = RandomForestClassifier(), params = param_trees)

In [10]:
second.Train()

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done  96 out of  96 | elapsed:  5.5min finished


Model trained
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)


In [11]:
second.Train_Preds()

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2040
           1       0.97      0.98      0.98      2320
           2       0.93      0.92      0.93      2034
           3       0.90      0.92      0.91      2160
           4       0.94      0.93      0.94      2047
           5       0.93      0.91      0.92      1909
           6       0.95      0.97      0.96      2069
           7       0.95      0.95      0.95      2241
           8       0.91      0.89      0.90      1992
           9       0.92      0.90      0.91      2188

   micro avg       0.94      0.94      0.94     21000
   macro avg       0.94      0.93      0.93     21000
weighted avg       0.94      0.94      0.94     21000



In [12]:
#Logistic Regression

param_lr = {'C': [0.001, 0.005, 0.01],
           'solver': ['lbfgs'],
           'multi_class': ['warn'],
           'max_iter': [500]}

third = Quick_grid(transform_train, target, model = LogisticRegression(), params = param_lr)

In [13]:
third.Train()

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   9 out of   9 | elapsed:   35.9s finished


Model trained
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)


In [14]:
third.Train_Preds()

              precision    recall  f1-score   support

           0       0.94      0.97      0.96      2040
           1       0.94      0.98      0.96      2320
           2       0.89      0.86      0.88      2034
           3       0.87      0.87      0.87      2160
           4       0.90      0.91      0.91      2047
           5       0.85      0.80      0.82      1909
           6       0.92      0.95      0.93      2069
           7       0.92      0.93      0.93      2241
           8       0.86      0.84      0.85      1992
           9       0.88      0.86      0.87      2188

   micro avg       0.90      0.90      0.90     21000
   macro avg       0.90      0.90      0.90     21000
weighted avg       0.90      0.90      0.90     21000



In [15]:
stack = Stacker(transform_train, target)

In [16]:
stack_models = [first.trained_params ,second.trained_params, third.trained_params]


stack.Order_1(stack_models)

Training order 1...
One model trained...
One model trained...




One model trained...
All models trained.


In [17]:
param_stack = {'n_estimators': [100],
               'max_features': [None],
               'max_depth': [5, 10, 15, None], 
               'min_samples_leaf':[5, 10],
               'random_state': [0]}

stack.Stack_pred(model = RandomForestClassifier(), params = param_stack)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  24 out of  24 | elapsed:    8.9s finished


              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1238
           1       0.94      0.97      0.96      1386
           2       0.89      0.87      0.88      1223
           3       0.87      0.88      0.87      1341
           4       0.90      0.91      0.90      1208
           5       0.85      0.80      0.83      1124
           6       0.93      0.95      0.94      1246
           7       0.93      0.93      0.93      1349
           8       0.87      0.86      0.86      1190
           9       0.88      0.86      0.87      1295

   micro avg       0.90      0.90      0.90     12600
   macro avg       0.90      0.90      0.90     12600
weighted avg       0.90      0.90      0.90     12600

