In [14]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from mlxtend.classifier import StackingCVClassifier
from sklearn import svm
from sklearn.dummy import DummyClassifier

In [15]:
#import data after preprocessing
train_df = pd.read_csv("./project_data_files/book_rating_train.csv")
test_df = pd.read_csv("./project_data_files/book_rating_test.csv")
X_train = load_npz('./x_train_5000.npz')
X_test = load_npz('./x_test_5000.npz')
y_train = train_df['rating_label']

In [10]:
# split train and testing dataset
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print(y_val.shape)

(4613,)


In [4]:
def output_predictions(prediction):
    #output result
    output_df = pd.DataFrame({'rating_label': prediction})
    output_df.index += 1
    output_df.index.name = 'id'
    output_df.to_csv('./predictions.csv')
    return

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
import warnings


warnings.simplefilter(action='ignore', category=UserWarning)

np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    
    
    

param = {'learning_rate': 0.09, 
         'objective':'multi:softmax', 
         'subsample' : 0.8, 
         'colsample_bytree': 0.6, 
         'gamma': 0.5,
         'num_class': 3, 
         'max_depth': 10, 
         'min_child_weight': 1,
    
}

clf1 = KNeighborsClassifier(n_neighbors=80, metric='manhattan', weights='distance')
clf2 = RandomForestClassifier()
clf4 = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1, 5, 10,100], solver = 'lbfgs', multi_class='multinomial', n_jobs=-1, max_iter=900, class_weight={0:0.1, 1:0.9})
clf5 = xgb.XGBClassifier(param, n_estimators = 232)

classifiers = [BaggingClassifier(clf1, random_state=42), 
               BaggingClassifier(clf2, random_state=42),
               BaggingClassifier(clf4, random_state=42),
               BaggingClassifier(clf5, random_state=42)]
titles = ['KNN',
          'Random Forest',
          'Logistic Regression',
          'XGbooster'
]



meta_classifier_lr = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1, 5, 10,100], solver = 'lbfgs', multi_class='multinomial', n_jobs=-1, max_iter=900, class_weight={3:0.1, 4:0.8, 5:0.1})
stacker_lr = StackingClassifier(classifiers, meta_classifier_lr)

meta_classifier_knn = KNeighborsClassifier(n_neighbors=80, weights='distance')
stacker_knn = StackingClassifier(classifiers, metaclassifier=meta_classifier_knn)

meta_classifier_xgb = xgb.XGBClassifier(param, n_estimators = 232)
stacker_xgb = StackingClassifier(classifiers, meta_classifier_xgb)

meta_classifier_rf = RandomForestClassifier(n_estimators=1000, random_state=67)
stacker_rf = StackingClassifier(classifiers, meta_classifier_rf)

meta_classifier_svm = svm.SVC(kernel='rbf', gamma=0.7, C=1)
stacker_svm = StackingClassifier(classifiers, meta_classifier_svm)




In [12]:
#sclf = StackingCVClassifier(classifiers, meta_classifier = clf4, random_state = 42)


In [13]:
#stacker_dt.fit(X_train, y_train-3)
#print('Stacker with lgr acc :', stacker_dt.score(X_val, y_val-3))

In [17]:
dense_X = X_train.toarray()
dense_X_val = X_val.toarray()
dense_X_test = X_test.toarray()

In [10]:
stacker_knn.fit(dense_X, y_train-3)
print('Stacker with knn acc :', stacker_knn.score(dense_X_val, y_val-3))

Stacker with dt acc : 0.7066984608714503


In [13]:
stacker_xgb.fit(dense_X, y_train-3)
print('Stacker with xgb acc :', stacker_xgb.score(dense_X_val, y_val-3))

Stacker with xgb acc : 0.7106004769130717


In [19]:
stacker_lr.fit(dense_X, y_train-3)
print('Stacker with lr acc :', stacker_lr.score(dense_X_val, y_val-3))

In [13]:
prediction = stacker_lr.predict(dense_X_test)+3
output_predictions(prediction)
print (prediction)

[4. 4. 4. ... 4. 4. 4.]


In [None]:
stacker_rf.fit(dense_X, y_train-3)
print('Stacker with rf acc :', stacker_rf.score(dense_X_val, y_val-3))

Stacker with rf acc : 0.7184045089963148


In [15]:
stacker_svm.fit(dense_X, y_train-3)
print('Stacker with svm acc :', stacker_svm.score(dense_X_val, y_val-3))

Stacker with svm acc : 0.7158031649685671
