In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from mlxtend.classifier import StackingCVClassifier
from sklearn import svm
from sklearn.dummy import DummyClassifier
import feature_select
import feture_process
import pickle
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
import scipy

In [2]:
#import data after preprocessing
train_df = pd.read_csv("./project_data_files/book_rating_train.csv")
test_df = pd.read_csv("./project_data_files/book_rating_test.csv")
X_train = load_npz('./x_train_1000.npz')
X_test = load_npz('./x_test_1000.npz')
y_train = train_df['rating_label']

In [3]:
train_df = pd.read_csv("./project_data_files/book_rating_train.csv")
test_df = pd.read_csv("./project_data_files/book_rating_test.csv")
# process Prublisher and Language
train_df["Language"], test_df["Language"] = feture_process.docclass_preprocess(train_df["Language"],test_df["Language"],10)
train_df["Publisher"], test_df["Publisher"] = feture_process.docclass_preprocess(train_df["Publisher"],test_df["Publisher"],200)
# use oneHotCode for Publisher (sklearn)
publisher_train_hot = feture_process.process_OneHotEncoder(train_df,"Publisher")
publisher_test_hot = feture_process.process_OneHotEncoder(test_df,"Publisher")
train_df = pd.concat([train_df, publisher_train_hot], axis=1)
test_df = pd.concat([test_df, publisher_test_hot], axis=1)
# use oneHotCode for Language (sklearn)
language_train_hot = feture_process.process_OneHotEncoder(train_df,"Language")
language_test_hot = feture_process.process_OneHotEncoder(test_df,"Language")
train_df = pd.concat([train_df, language_train_hot], axis=1)
test_df = pd.concat([test_df, language_test_hot], axis=1)
# train_name_countvectorizer
train_name_countvectorizer = pickle.load(open("./project_data_files/book_text_features_countvec/train_name_countvectorizer.pkl", "rb"))
train_name_dic = train_name_countvectorizer.vocabulary_

# train_authors_countvectorizer
train_authors_countvectorizer = pickle.load(open("./project_data_files/book_text_features_countvec/train_authors_countvectorizer.pkl", "rb"))
train_authors_dic = train_authors_countvectorizer.vocabulary_

# train_desc_countvectorizer
train_desc_countvectorizer = pickle.load(open("./project_data_files/book_text_features_countvec/train_desc_countvectorizer.pkl", "rb"))
train_desc__dic = train_desc_countvectorizer.vocabulary_

# process vector features
train_name_features = train_name_countvectorizer.transform(train_df['Name'])
train_authors_features = train_authors_countvectorizer.transform(train_df['Authors'])
train_desc_features = train_desc_countvectorizer.transform(train_df['Description'])
other_features_df = train_df.drop(columns=['Name', 'Authors', 'Description', 'Publisher', 'Language', 'rating_label'])
# new sparse features
sparse_features = hstack([train_name_features, train_authors_features, train_desc_features])
# new train features
dense_features = csr_matrix(other_features_df.values)
train_features = hstack([sparse_features, dense_features])


# process test features
test_name_features = scipy.sparse.load_npz('./project_data_files/book_text_features_countvec/test_name_vec.npz')
test_authors_features = scipy.sparse.load_npz('./project_data_files/book_text_features_countvec/test_authors_vec.npz')
test_desc_features = scipy.sparse.load_npz('./project_data_files/book_text_features_countvec/test_desc_vec.npz')
test_other_features_df = test_df.drop(columns=['Name', 'Authors', 'Description', 'Publisher', 'Language'])

test_sparse_features = hstack([test_name_features, test_authors_features, test_desc_features])
new_dense_features = csr_matrix(test_other_features_df.values)
test_features = hstack([test_sparse_features, new_dense_features])


In [22]:
selected_features,selected_features_test = feature_select.MI(train_df,train_features,test_features,12000)

In [4]:
selected_features,selected_features_test = train_features,test_features

In [5]:
# split train and testing dataset
X_train, X_val, y_train, y_val = train_test_split(selected_features, train_df["rating_label"], test_size=0.2, random_state=42)
print(X_val.shape)

(4613, 126900)


In [6]:
def output_predictions(prediction):
    #output result
    output_df = pd.DataFrame({'rating_label': prediction})
    output_df.index += 1
    output_df.index.name = 'id'
    output_df.to_csv('./predictions.csv')
    return

In [7]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
import warnings


warnings.simplefilter(action='ignore', category=UserWarning)

np.random.seed(1)

class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)
    
    
    

param = {'learning_rate': 0.09, 
         'objective':'multi:softmax', 
         'subsample' : 0.8, 
         'colsample_bytree': 0.6, 
         'gamma': 0.5,
         'num_class': 3, 
         'max_depth': 10, 
         'min_child_weight': 1,
    
}

clf1 = KNeighborsClassifier(n_neighbors=80, metric='manhattan', weights='distance')
clf2 = RandomForestClassifier()
clf3 = DummyClassifier(strategy='most_frequent')
clf4 = LogisticRegression(solver = 'lbfgs', multi_class='multinomial', max_iter=400, class_weight={0:0.1, 1:0.9, 2:0})
clf5 = xgb.XGBClassifier(param, n_estimators = 232)
clf6 = xgb.XGBClassifier(learning_rate = 0.05)
clf7 = RandomForestClassifier(n_estimators=200, criterion='entropy')
clf8 = KNeighborsClassifier(n_neighbors=75, metric='manhattan', weights='distance')
clf9 = LogisticRegression(solver = 'saga', multi_class='multinomial', max_iter=400, class_weight={0:0.1, 1:0.9, 2:0})
clf10 = BernoulliNB()
clf11 = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, 
                    batch_size='auto', learning_rate='constant', learning_rate_init=0.001, 
                    shuffle=True, random_state=None)

classifiers = [clf1, clf2, clf4, clf5, clf6, clf7, clf8, clf9, clf11]




meta_classifier_lr = LogisticRegression(solver = 'lbfgs', multi_class='multinomial', n_jobs=-1, max_iter=400, class_weight={0:0.1, 1:0.9, 2:0})
stacker_lr = StackingClassifier(classifiers, metaclassifier=BaggingClassifier(meta_classifier_lr))

meta_classifier_knn = KNeighborsClassifier(n_neighbors=80, weights='distance')
stacker_knn = StackingClassifier(classifiers, BaggingClassifier(base_estimator = meta_classifier_knn))

meta_classifier_xgb = xgb.XGBClassifier(param, n_estimators = 232)
stacker_xgb = StackingClassifier(classifiers, metaclassifier=BaggingClassifier(meta_classifier_xgb))

meta_classifier_rf = RandomForestClassifier(n_estimators=1000, random_state=67)
stacker_rf = StackingClassifier(classifiers, BaggingClassifier(meta_classifier_rf, n_estimators=30, max_features=0.9, bootstrap=False))

meta_classifier_svm = svm.SVC(kernel='rbf', gamma=0.7, C=1)
stacker_svm = StackingClassifier(classifiers, meta_classifier_svm)

meta_classifier_mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.0001, 
                    batch_size='auto', learning_rate='constant', learning_rate_init=0.001, 
                     shuffle=True, random_state=None)
stacker_mlp = StackingClassifier(classifiers, meta_classifier_mlp)




In [7]:
#sclf = StackingCVClassifier(classifiers, meta_classifier = clf4, random_state = 42)


In [13]:
#stacker_dt.fit(X_train, y_train-3)
#print('Stacker with lgr acc :', stacker_dt.score(X_val, y_val-3))

In [8]:
dense_X = X_train.toarray()
dense_X_val = X_val.toarray()
dense_X_test = X_test.toarray()

In [9]:
stacker_mlp.fit(dense_X, y_train-3)

In [10]:
stacker_mlp.score(dense_X_val, y_val-3)

0.7309776717970952

In [11]:
prediction = stacker_mlp.predict(selected_features_test)+3
output_predictions(prediction)

In [26]:
stacker_knn.fit(dense_X, y_train-3)
print('Stacker with knn acc :', stacker_knn.score(dense_X_val, y_val-3))

Stacker with knn acc : 0.7212226316930413


In [31]:
stacker_xgb.fit(dense_X, y_train-3)
print('Stacker with xgb acc :', stacker_xgb.score(dense_X_val, y_val-3))

Stacker with xgb acc : 0.7205722956861045


In [32]:
stacker_lr.fit(dense_X, y_train-3)
print('Stacker with lr acc :', stacker_lr.score(dense_X_val, y_val-3))

Stacker with lr acc : 0.7138521569477564


In [12]:
prediction = stacker_xgb.predict(dense_X_test)+3
output_predictions(prediction)
print (prediction)

  proba /= len(self.estimators_)


[4. 4. 4. ... 4. 4. 4.]


In [29]:
stacker_rf.fit(dense_X, y_train-3)
print('Stacker with rf acc :', stacker_rf.score(dense_X_val, y_val-3))

Stacker with rf acc : 0.7236071970518101


In [15]:
stacker_svm.fit(dense_X, y_train-3)
print('Stacker with svm acc :', stacker_svm.score(dense_X_val, y_val-3))

Stacker with svm acc : 0.7158031649685671
