In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from mlxtend.classifier import StackingCVClassifier

In [2]:
#import data after preprocessing
train_df = pd.read_csv("./project_data_files/book_rating_train.csv")
test_df = pd.read_csv("./project_data_files/book_rating_test.csv")
X_train = load_npz('./x_train_countvec_17029_mi.npz')
X_test = load_npz('./x_test_17029_mi.npz')
y_train = train_df['rating_label']

In [3]:
# split train and testing dataset
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print(y_val.shape)

(4613,)


In [4]:
def output_predictions(prediction):
    #output result
    output_df = pd.DataFrame({'rating_label': prediction})
    output_df.index += 1
    output_df.index.name = 'id'
    output_df.to_csv('./predictions.csv')
    return

In [5]:
def inverse_distance_weight(distances):
    return 1 / distances

In [6]:
from sklearn.metrics import accuracy_score

np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    
    
clf1 = KNeighborsClassifier(n_neighbors=66,metric='manhattan', weights=inverse_distance_weight)
clf2 = RandomForestClassifier(random_state=67)
clf3 = GaussianNB()
clf4 = LogisticRegression()
    
params = {
    'max_depth': 50,
    'learning_rate': 0.5,
    'objective': 'multi:softmax',
    'eval_metric': 'error',
    'num_class': 3
}

classifiers = [clf1, clf2, clf3, clf4]
titles = ['KNN',
          'Random Foresr',
          'Gaussian NB',
          'Logistic Regression'
]



meta_classifier_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)

meta_classifier_dt = DecisionTreeClassifier()
stacker_dt = StackingClassifier(classifiers, meta_classifier_dt)

meta_classifier_xgb = xgb.XGBClassifier(params)
stacker_xgb = StackingClassifier(classifiers, meta_classifier_xgb)

meta_classifier_rf = RandomForestClassifier(n_estimators=1000, random_state=67)
stacker_rf = StackingClassifier(classifiers, meta_classifier_rf)




In [None]:
sclf = StackingCVClassifier(classifiers, meta_classifier = clf4, random_state = 42)


In [44]:
stacker_dt.fit(X_train, y_train-3)
print('Stacker with lgr acc :', stacker_dt.score(X_val, y_val-3))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return 1 / distances
  proba_k /= normalizer


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
stacker_dt.fit(X_train, y_train-3)
print('Stacker with dt acc :', stacker_xgb.score(X_val, y_val-3))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return 1 / distances
  proba_k /= normalizer


Stacker with xgb acc : 0.5913722089746369
