In [399]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder 

from sklearn.model_selection import train_test_split
import scipy.stats as st
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import ShuffleSplit, KFold, GroupKFold, StratifiedKFold
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import LabelEncoder

import torch
from torch import nn
import torch.nn.functional as F

pd.options.mode.chained_assignment = None  # default='warn'
import warnings
warnings.filterwarnings('ignore')

In [425]:
class Stacking:
    def __init__(self, data, target):
        self.df = data.copy()
        self.df_ans_train = pd.DataFrame()
        self.df_ans_test = pd.DataFrame()
        self.df_best_cls = {}
        self.scores = {}
        self.n_folds = 5
        self.scores_propor = 0
        
        sklearn_nb = naive_bayes.GaussianNB()
        sklearn_knn = KNeighborsClassifier(50)
        sklearn_lс = LogisticRegression()
        sklearn_tree = DecisionTreeClassifier()
        sklearn_svc = SVC(kernel='poly', C=1, max_iter=100, probability=True)

        self.clf_all = [sklearn_nb, sklearn_knn, sklearn_lс, sklearn_tree, sklearn_svc]
        
        target = 'G3'
        
        cols = self.df.columns
        labelencoder = LabelEncoder()

        num_cols = self.df._get_numeric_data().columns
        for i in list(set(cols) - set(num_cols)):
            if i != 'sex' and i != 'age' and i != 'address':
                self.df[i] = labelencoder.fit_transform(self.df[i].values)
        
        self.y = self.df[target]
        self.X = self.df.drop(target, axis = 1)
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, shuffle=True, random_state=21)
        
        self.x_train = self.x_train.reset_index()
        self.x_train = self.x_train.drop(['index'], axis=1)
        self.y_train = self.y_train.reset_index()
        self.y_train = self.y_train.drop(['index'], axis=1)
        
        self.x_test = self.x_test.reset_index()
        self.x_test = self.x_test.drop(['index'], axis=1)
        self.y_test = self.y_test.reset_index()
        self.y_test = self.y_test.drop(['index'], axis=1)
        
        self.x_test = self.repair(self.x_test)
        
    def repair(self, x):  
        df_wrong, df_right = self.divide_df(df = x)
        #print(df_wrong.shape, df_right.shape)
        self.repair_df(KNeighborsClassifier(), df_wrong, df_right, ['sex', 'age', 'address'])
        x = pd.concat([df_wrong, df_right]).sort_index()
        return x
        
    def divide_df(self, df):
        dct = {'M': 1, 'F': 0, 'D': 2, 'C': 3, 'B': 4, 'A': 5}
        df['sex'] = df['sex'].map(dct)

        df_wrong = df[(df['age'] > 22) | (df['age'] < 15) | ((df['sex'] != 1) & (df['sex'] != 0)) | (df.isnull().any(1))]
        df_wrong['address'] = df_wrong['address'].astype('category').cat.codes

        df_right = df[~((df['age']>22) | (df['age'] < 15) | ((df['sex'] != 1) & (df['sex'] != 0)) | (df.isnull().any(1)))]
        df_right['address'] = df_right['address'].astype('category').cat.codes
        return df_wrong, df_right

    def divide_df_wrong(self, df_wrong, feat):
        if feat == 'sex':
            df_wrong_c = df_wrong[(df_wrong['sex'] != 0) & (df_wrong['sex'] != 1)]
        elif feat == 'age':
            df_wrong_c = df_wrong[(df_wrong['age'] > 22) | (df_wrong['age'] < 15) | (df_wrong['age'].isna())]
        elif feat == 'address':
            df_wrong_c = df_wrong[(df_wrong['address'] != 0) & (df_wrong['address'] != 1)]
        return df_wrong_c

    def repair_df(self, algorythm, df_wrong, df_right, feats):
        for feat in feats:
            X = df_right.drop([feat],axis = 1)
            y = df_right[feat]
            algorythm.fit(X, y)
            X_pred = self.divide_df_wrong(df_wrong, feat)
            if X_pred.shape[0] != 0:
                X_pred[feat] = algorythm.predict(X_pred[X.columns])
                df_wrong[feat].update(X_pred[feat])

    def stacking_alg(self):
        for clf in self.clf_all:
            clf_name = str(clf)[:-2]
            meta_feat, best_cls = self.cross_val_predict_st(clf)

            self.df_ans_train[clf_name] = meta_feat

            
            best_cls.fit(self.repair(self.x_train.copy()), self.y_train)
            self.df_best_cls[clf_name] = best_cls

            self.df_ans_test[clf_name] = self.df_best_cls[clf_name].predict(self.x_test)
            self.scores[clf_name] = accuracy_score(self.df_ans_test[clf_name], self.y_test)
            print(f'{str(clf)} score = {self.scores[clf_name]}','\n')
            
        #lr = LogisticRegression(solver='lbfgs', multi_class='ovr')
        #lr.fit(self.df_ans_train, self.y_train)
        #print(f'overall score = {accuracy_score(lr.predict(self.df_ans_test), self.y_test)}')
        
        model = nn.Sequential()
        model.add_module('l1', nn.Linear(self.df_ans_train.shape[1], len(self.y.unique())))
        # note: layer names must be unique
        model.add_module('l2', nn.Softmax())

        opt = torch.optim.Adam(model.parameters(), lr=1e-3)

        history = []
        for i in range(10000):
            ix = (self.df_ans_train.sample(frac = 0.85).index)
            x_batch = torch.tensor(self.df_ans_train.loc[ix].values, dtype=torch.float32)
            y_batch = torch.tensor(self.y_train.squeeze().loc[ix].values, dtype=torch.float32)

            y_predicted = model(x_batch)[:, 0]
            
            loss = F.cross_entropy(y_predicted, y_batch)

            loss.backward()
            opt.step()
            opt.zero_grad()

            history.append(loss.data.numpy())

            if i % 1000 == 0:
                print("step #%i | mean loss = %.3f" % (i, np.mean(history[-10:])))
        stacking_pred_proba = model(torch.tensor(self.df_ans_test.values, dtype=torch.float32))
        stacking_pred = np.array(stacking_pred_proba.detach().numpy(), dtype=np.int)
        print(stacking_pred_proba, stacking_pred_proba.shape, stacking_pred_proba.max())
        print(f'overall score = {accuracy_score(stacking_pred, self.y_test)}')
        
    def cross_val_predict_st(self, clf):
        kf = KFold(n_splits = self.n_folds)
        meta_feat = np.array([], dtype = 'int64')
        
        best_score = 1000000
        best_cls = clf
        
        for train_index, feach_index in kf.split(self.x_train):
            x_train, x_feach = self.x_train.loc[train_index], self.x_train.loc[feach_index]
            y_train, y_feach = self.y_train.loc[train_index], self.y_train.loc[feach_index]
            
            x_train = self.repair(x_train)
            x_feach = self.repair(x_feach)
            
            clf.fit(x_train,y_train)
            predict = clf.predict(x_feach)
            
            if accuracy_score(predict, y_feach) < best_score:
                best_cls = clf
            
            meta_feat = np.concatenate((meta_feat, predict))
        return meta_feat, best_cls
    
    def scor_prop(self):
        w_sum = sum(self.scores.values())
        self.scores_propor = {cls: w / w_sum  for cls, w in self.scores.items()}
        
    def bar_plot_sc_prop(self):
        self.scor_prop()
        sns.barplot(y=list(self.scores_propor.keys()), x = list(self.scores_propor.values()), palette="rocket")
        plt.xticks(rotation=0)
        plt.title('proportion of algorithms', fontsize=16);
        
    def correlation_of_clf(self):
        f = plt.figure(figsize=(10, 9))
        plt.matshow(self.df_ans_train.corr(), fignum=f.number)
        plt.xticks(range(self.df_ans_train.select_dtypes(['number']).shape[1]), self.df_ans_train.select_dtypes(['number']).columns, fontsize=14, rotation=90)
        plt.yticks(range(self.df_ans_train.select_dtypes(['number']).shape[1]), self.df_ans_train.select_dtypes(['number']).columns, fontsize=14)
        cb = plt.colorbar()
        cb.ax.tick_params(labelsize=14)
        plt.title('Correlation Matrix', fontsize=16);

        

In [426]:
data = pd.read_csv('train_features_with_answers.csv')

In [427]:
data_stack = Stacking(data, target = 'G3')

In [428]:
data_stack.stacking_alg() 

GaussianNB() score = 0.04395604395604396 

KNeighborsClassifier(n_neighbors=50) score = 0.12087912087912088 

LogisticRegression() score = 0.12087912087912088 

DecisionTreeClassifier() score = 0.06593406593406594 

SVC(C=1, kernel='poly', max_iter=100, probability=True) score = 0.17582417582417584 

step #0 | mean loss = 20972.564
step #1000 | mean loss = 21100.416
step #2000 | mean loss = 20984.031
step #3000 | mean loss = 21121.057
step #4000 | mean loss = 21099.844
step #5000 | mean loss = 21058.564
step #6000 | mean loss = 21052.830
step #7000 | mean loss = 21057.416
step #8000 | mean loss = 21043.656
step #9000 | mean loss = 21089.525
tensor([[1.0410e-21, 6.5797e-04, 8.8835e-03,  ..., 5.1099e-09, 5.2787e-01,
         9.8174e-17],
        [4.8663e-19, 9.8745e-04, 1.1941e-06,  ..., 3.6207e-07, 5.2630e-03,
         1.2987e-14],
        [2.2991e-21, 1.6283e-03, 5.7807e-03,  ..., 8.5858e-09, 6.1898e-01,
         1.8790e-16],
        ...,
        [2.1345e-18, 2.6299e-03, 1.2729e-02,  .

ValueError: Classification metrics can't handle a mix of multilabel-indicator and multiclass targets

In [None]:
data_stack.y_train