In [38]:
# import library
import sklearn
import numpy
import pandas as pd
import pickle
from scipy.sparse import hstack
from scipy.sparse import save_npz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

In [55]:
# import data
train_df = pd.read_csv("./project_data_files/book_rating_train.csv")
test_df = pd.read_csv("./project_data_files/book_rating_test.csv")


def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so this
    must only be pointed at trusted files.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Each pickle holds a vectorizer object; its ``vocabulary_`` attribute is kept
# alongside it and its ``transform`` method is applied to the raw text column.

# train_name_countvectorizer
train_name_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_name_countvectorizer.pkl")
train_name_dic = train_name_countvectorizer.vocabulary_

# train_authors_countvectorizer
train_authors_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_authors_countvectorizer.pkl")
train_authors_dic = train_authors_countvectorizer.vocabulary_

# train_desc_countvectorizer
train_desc_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_desc_countvectorizer.pkl")
# (the double underscore in this name is kept so existing references keep working)
train_desc__dic = train_desc_countvectorizer.vocabulary_

# process vector features
train_name_features = train_name_countvectorizer.transform(train_df['Name'])
train_authors_features = train_authors_countvectorizer.transform(train_df['Authors'])
train_desc_features = train_desc_countvectorizer.transform(train_df['Description'])


In [56]:
# Test set: reuse the vectorizers loaded for the training data so that the
# test matrices are produced by the same transform as the training matrices.
test_name_features = train_name_countvectorizer.transform(test_df["Name"])
test_authors_features = train_authors_countvectorizer.transform(test_df["Authors"])
test_desc_features = train_desc_countvectorizer.transform(test_df["Description"])

# Keep only the columns that are not handled separately (text columns,
# Publisher and Language are dropped).
other_features_df_test = test_df.drop(
    ["Name", "Authors", "Description", "Publisher", "Language"],
    axis="columns",
)

In [57]:
def docclass_preprocess(train, test, threshold):
    """Collapse rare, unseen and missing categories into one ``'others'`` class.

    A category is kept only if it occurs at least ``threshold`` times in
    ``train``. Every other value becomes ``'others'`` in both outputs: rare
    training categories, categories that appear only in ``test``, and missing
    values. This reduces the number of distinct classes (and therefore the
    width of a later one-hot encoding).

    Parameters
    ----------
    train, test : pandas.Series
        Categorical columns of the training and test data.
    threshold : int
        Minimum number of occurrences in ``train`` for a category to be kept.

    Returns
    -------
    (pandas.Series, pandas.Series)
        New training and test series; the inputs are not modified.
    """
    # value_counts() ignores missing values, so NaN can never be a kept class.
    counts = train.value_counts()
    kept_classes = counts.index[counts >= threshold]

    # One vectorised pass per series instead of one replace() call per class.
    train_out = train.where(train.isin(kept_classes), 'others')
    test_out = test.where(test.isin(kept_classes), 'others')
    return train_out, test_out

In [58]:
#preprocess two features
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': docclass_preprocess maps unseen/missing test values
# to 'others', but 'others' only exists in the training column when training
# itself had rare or missing values. Without this option, transform() raises on
# a test category the encoder never saw; with it, that row gets all-zero columns.
ohe = OneHotEncoder(handle_unknown='ignore')

# Collapse rare classes first (minimum counts: 50 for Language, 100 for Publisher).
train_df['Language'], test_df['Language'] = docclass_preprocess(train_df['Language'], test_df['Language'], 50)
train_df['Publisher'], test_df['Publisher'] = docclass_preprocess(train_df['Publisher'], test_df['Publisher'], 100)
train_df_spec = train_df[['Language', 'Publisher']]
test_df_spec = test_df[['Language', 'Publisher']]

# Fit the encoder on the training columns only, then reuse it for the test columns.
train_num_spec = ohe.fit_transform(train_df_spec)
test_num_spec = ohe.transform(test_df_spec)
print(train_num_spec.shape)



(23063, 25)


In [59]:
# Columns left over once the separately handled ones are removed: the text
# columns, Publisher and Language, plus (training only) the rating_label target.
other_features_df_train = train_df.drop(
    ['Name', 'Authors', 'Publisher', 'Language', 'Description', 'rating_label'],
    axis='columns',
)
other_features_df_test = test_df.drop(
    ['Name', 'Authors', 'Publisher', 'Language', 'Description'],
    axis='columns',
)

In [49]:
def x2_feature_selection(X_train, X_test, y_train, feature_num):
    """Select the ``feature_num`` best columns according to the chi-squared score.

    The selector is fitted on the training data and labels only; the same
    column subset is then applied to the test data.

    Returns a ``(reduced X_train, reduced X_test)`` tuple.
    """
    selector = SelectKBest(score_func=chi2, k=feature_num)
    selector.fit(X_train, y_train)
    reduced_train = selector.transform(X_train)
    reduced_test = selector.transform(X_test)
    return reduced_train, reduced_test

In [50]:
# mutual information selection
def mi_feature_selection(X_train, X_test, y_train, k, random_state=None):
    """Select the ``k`` columns with the highest mutual information with the labels.

    The selector is fitted on the training data and labels only; the same
    column subset is then applied to the test data.

    Parameters
    ----------
    X_train, X_test : feature matrices with the same columns.
    y_train : class labels for ``X_train``.
    k : number of columns to keep.
    random_state : optional seed forwarded to ``mutual_info_classif`` so the
        scoring can be made reproducible. The default (``None``) keeps the
        previous, unseeded behaviour.

    Returns
    -------
    (reduced X_train, reduced X_test)
    """
    from functools import partial

    # SelectKBest calls score_func(X, y); bind the seed up front.
    score_func = partial(mutual_info_classif, random_state=random_state)
    selector = SelectKBest(score_func, k=k)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)
    return X_train, X_test

In [51]:
def rfe_selection(X_train, X_test, y_train, k):
    """Reduce the features to ``k`` columns by recursive feature elimination.

    A logistic regression model ranks the features. RFE needs an estimator
    that exposes ``coef_`` or ``feature_importances_``; the previously used
    ``GaussianNB`` provides neither, so fitting failed.

    The eliminator is fitted on the training data and labels only; the same
    column subset is then applied to the test data.

    Returns a ``(reduced X_train, reduced X_test)`` tuple.
    """
    # max_iter is raised above the default to give the solver more room to
    # converge on wide feature matrices.
    rfe = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=k)
    X_train = rfe.fit_transform(X_train, y_train)
    X_test = rfe.transform(X_test)
    return X_train, X_test

In [15]:
# Feature selection on each count-vector matrix separately
y_train = train_df["rating_label"]

# Disabled alternative: chi-squared selection via x2_feature_selection with
# k=500 (name), k=200 (authors) and k=10000 (description).

# Mutual-information selection: keep 50 name, 20 author and 100 description columns
(train_name_countvec_mi, test_name_countvec_mi) = mi_feature_selection(
    train_name_features, test_name_features, y_train, 50
)
(train_author_countvec_mi, test_author_countvec_mi) = mi_feature_selection(
    train_authors_features, test_authors_features, y_train, 20
)
(train_desc_countvec_mi, test_desc_countvec_mi) = mi_feature_selection(
    train_desc_features, test_desc_features, y_train, 100
)

# Stack the selected columns side by side (name | authors | description)
sparse_features_train_countvec = hstack(
    (train_name_countvec_mi, train_author_countvec_mi, train_desc_countvec_mi)
)
sparse_features_test_countvec = hstack(
    (test_name_countvec_mi, test_author_countvec_mi, test_desc_countvec_mi)
)



In [62]:
# Wrap the remaining (non-text) columns in CSR matrices so they can be
# stacked next to the sparse text features.
dense_features_train = csr_matrix(other_features_df_train.to_numpy())
dense_features_test = csr_matrix(other_features_df_test.to_numpy())

In [63]:
# Stack every feature group side by side, then let chi-squared selection pick
# the 5000 best columns across all of them.
y_train = train_df["rating_label"]

sparse_features_train_countvec = hstack(
    (
        train_name_features,
        train_authors_features,
        train_desc_features,
        train_num_spec,
        dense_features_train,
    )
)
sparse_features_test_countvec = hstack(
    (
        test_name_features,
        test_authors_features,
        test_desc_features,
        test_num_spec,
        dense_features_test,
    )
)

train_features, X_test = x2_feature_selection(
    sparse_features_train_countvec,
    sparse_features_test_countvec,
    y_train,
    5000,
)

In [64]:
#feature engineering
# Apply sklearn's Normalizer (default settings) to the selected features.
# The output gets its own names. It used to be bound to train_name_features /
# test_name_features, which overwrote the count-vector name matrices and made
# the cell that stacks them impossible to re-run.
# NOTE(review): nothing in this notebook reads the normalized matrices; the
# files written at the end contain the un-normalized train_features / X_test.
# Confirm whether the normalized versions were meant to be saved instead.
norm = Normalizer()
train_features_norm = norm.fit_transform(train_features)
X_test_norm = norm.transform(X_test)

Doc2Vec transformation

In [65]:
# Load the doc2vec feature files for the training set (headerless CSVs)
train_name_features = pd.read_csv(
    r"./project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv",
    sep=",",
    header=None,
    index_col=False,
)
train_authors_features = pd.read_csv(
    r"./project_data_files/book_text_features_doc2vec/train_authors_doc2vec20.csv",
    sep=",",
    header=None,
    index_col=False,
)
train_desc_features = pd.read_csv(
    r"./project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv",
    sep=",",
    header=None,
    index_col=False,
)

# Training columns that are neither text, Publisher/Language nor the label
other_features_df = train_df.drop(
    ["Name", "Authors", "Description", "Publisher", "Language", "rating_label"],
    axis="columns",
)

# Name | authors | description doc2vec columns side by side, then as a CSR matrix
train_features_docvec = pd.concat(
    [train_name_features, train_authors_features, train_desc_features],
    axis=1,
)
dense_features_train_docvec = csr_matrix(train_features_docvec.to_numpy())

In [9]:
# Load the doc2vec feature files for the test set (headerless CSVs)
test_name_features = pd.read_csv(
    r"./project_data_files/book_text_features_doc2vec/test_name_doc2vec100.csv",
    sep=",",
    header=None,
    index_col=False,
)
test_authors_features = pd.read_csv(
    r"./project_data_files/book_text_features_doc2vec/test_authors_doc2vec20.csv",
    sep=",",
    header=None,
    index_col=False,
)
test_desc_features = pd.read_csv(
    r"./project_data_files/book_text_features_doc2vec/test_desc_doc2vec100.csv",
    sep=",",
    header=None,
    index_col=False,
)

# Name | authors | description doc2vec columns side by side, then as a CSR matrix
test_features_docvec = pd.concat(
    [test_name_features, test_authors_features, test_desc_features],
    axis=1,
)
dense_features_test_docvec = csr_matrix(test_features_docvec.to_numpy())

Combine the doc2vec and count-vector features

In [66]:
# Final training features.
# Disabled alternatives that were tried for train_features / X_test:
#   - hstack([dense_features_train, train_num_spec]) / hstack([dense_features_test, test_num_spec])
#   - train_author_countvec_mi / test_author_countvec_mi

# Sanity check: shape and container type of the matrix that will be saved
print(train_features.shape, type(train_features), sep="\n")

(23063, 5000)
<class 'scipy.sparse._csr.csr_matrix'>


In [67]:
# Persist the preprocessed feature matrices in scipy's .npz sparse format
save_npz(file="x_train_5000.npz", matrix=train_features)
save_npz(file="x_test_5000.npz", matrix=X_test)