In [126]:
# import library
import sklearn
import numpy
import pandas as pd
import pickle
from scipy.sparse import hstack
from scipy.sparse import save_npz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix
from sklearn.feature_selection import SelectKBest, chi2

In [127]:
# Load the raw train/test tables.
train_df = pd.read_csv("./project_data_files/book_rating_train.csv")
test_df = pd.read_csv("./project_data_files/book_rating_test.csv")


def _load_pickle(path):
    """Unpickle the object stored at `path` and close the file handle afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at trusted project files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Pickled vectorizer for the 'Name' column and its vocabulary mapping
train_name_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_name_countvectorizer.pkl")
train_name_dic = train_name_countvectorizer.vocabulary_

# Pickled vectorizer for the 'Authors' column and its vocabulary mapping
train_authors_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_authors_countvectorizer.pkl")
train_authors_dic = train_authors_countvectorizer.vocabulary_

# Pickled vectorizer for the 'Description' column and its vocabulary mapping
train_desc_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_desc_countvectorizer.pkl")
train_desc__dic = train_desc_countvectorizer.vocabulary_

# Transform each training text column with its loaded vectorizer
train_name_features = train_name_countvectorizer.transform(train_df['Name'])
train_authors_features = train_authors_countvectorizer.transform(train_df['Authors'])
train_desc_features = train_desc_countvectorizer.transform(train_df['Description'])


In [128]:
# Encode the test-set text columns with the vectorizers loaded for the training data
test_name_features, test_authors_features, test_desc_features = (
    vectorizer.transform(test_df[text_column])
    for vectorizer, text_column in (
        (train_name_countvectorizer, 'Name'),
        (train_authors_countvectorizer, 'Authors'),
        (train_desc_countvectorizer, 'Description'),
    )
)
# Remaining test columns once the text and categorical fields are removed
other_features_df_test = test_df.drop(
    ['Name', 'Authors', 'Description', 'Publisher', 'Language'],
    axis='columns',
)

In [129]:
def docclass_preprocess(train, test, threshold):
    """Collapse rare, unseen and missing categories into a single 'others' class.

    This reduces the number of distinct classes (and therefore the width of
    any later one-hot encoding).

    Parameters
    ----------
    train : pandas.Series
        Categorical column from the training split.
    test : pandas.Series
        The same column from the test split.
    threshold : int
        Minimum number of occurrences in `train` a class needs to be kept.

    Returns
    -------
    (pandas.Series, pandas.Series)
        New train and test Series; the inputs are not modified. A value is
        kept only if it occurs at least `threshold` times in `train`; every
        other value (rare in train, absent from train, or missing) becomes
        'others'.
    """
    # value_counts() skips missing values, so NaN can never count as frequent.
    counts = train.value_counts()
    frequent = counts[counts >= threshold].index
    # One vectorized pass per split instead of one replace() call per class.
    train = train.where(train.isin(frequent), 'others')
    test = test.where(test.isin(frequent), 'others')
    return train, test

In [130]:
# One-hot encode 'Language' and 'Publisher' after collapsing rare classes
from sklearn.preprocessing import OneHotEncoder

# Minimum number of training rows a class needs to keep its own one-hot column
LANGUAGE_MIN_COUNT = 50
PUBLISHER_MIN_COUNT = 100

# handle_unknown='ignore': docclass_preprocess can emit 'others' in the test
# split even when the training split has no rare or missing values; such rows
# are then encoded as all zeros instead of making transform() raise.
ohe = OneHotEncoder(handle_unknown='ignore')
train_df['Language'], test_df['Language'] = docclass_preprocess(train_df['Language'], test_df['Language'], LANGUAGE_MIN_COUNT)
train_df['Publisher'], test_df['Publisher'] = docclass_preprocess(train_df['Publisher'], test_df['Publisher'], PUBLISHER_MIN_COUNT)
train_df_spec = train_df[['Language', 'Publisher']]
test_df_spec = test_df[['Language', 'Publisher']]
# Fit the encoder on the training split only, then reuse it for the test split
train_num_spec = ohe.fit_transform(train_df_spec)
test_num_spec = ohe.transform(test_df_spec)
print(train_num_spec.shape)



(23063, 25)


In [131]:
# Remaining training columns once the text fields, categorical fields and the label are removed
other_features_df_train = train_df.drop(
    ['Name', 'Authors', 'Publisher', 'Language', 'Description', 'rating_label'],
    axis='columns',
)

In [132]:
def x2_feature_selection(X_train, X_test, y_train, feature_num):
    """Keep the `feature_num` columns ranked highest by the chi-squared score.

    The selector is fitted on the training matrix and its labels only; the
    same column subset is then applied to the test matrix.

    Returns a tuple (reduced X_train, reduced X_test).
    """
    selector = SelectKBest(score_func=chi2, k=feature_num).fit(X_train, y_train)
    return selector.transform(X_train), selector.transform(X_test)

In [133]:
# Chi-squared feature selection on each count-vectorized text column
y_train = train_df['rating_label']

# Remaining (non-text, non-categorical) columns wrapped as sparse matrices
dense_features_train = csr_matrix(other_features_df_train.values)
dense_features_test = csr_matrix(other_features_df_test.values)

# Keep 100 name columns, 20 author columns and 100 description columns
train_name_countvec_x2, test_name_countvec_x2 = x2_feature_selection(
    X_train=train_name_features,
    X_test=test_name_features,
    y_train=y_train,
    feature_num=100,
)
train_author_countvec_x2, test_author_countvec_x2 = x2_feature_selection(
    X_train=train_authors_features,
    X_test=test_authors_features,
    y_train=y_train,
    feature_num=20,
)
train_desc_countvec_x2, test_desc_countvec_x2 = x2_feature_selection(
    X_train=train_desc_features,
    X_test=test_desc_features,
    y_train=y_train,
    feature_num=100,
)

# Selected text features side by side, in name / authors / description order
sparse_features_train_countvec = hstack([
    train_name_countvec_x2,
    train_author_countvec_x2,
    train_desc_countvec_x2,
])
sparse_features_test_countvec = hstack([
    test_name_countvec_x2,
    test_author_countvec_x2,
    test_desc_countvec_x2,
])

Doc2Vec transformation

In [134]:
# Read the doc2vec CSVs for the training split (no header row, comma-separated).
# Note: this rebinds train_*_features from the count-vectorizer matrices to DataFrames.
train_name_features, train_authors_features, train_desc_features = (
    pd.read_csv(csv_path, index_col=False, sep=',', header=None)
    for csv_path in (
        "./project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv",
        "./project_data_files/book_text_features_doc2vec/train_authors_doc2vec20.csv",
        "./project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv",
    )
)
# Training columns left after dropping the text fields, categorical fields and the label
other_features_df = train_df.drop(
    ['Name', 'Authors', 'Description', 'Publisher', 'Language', 'rating_label'],
    axis='columns',
)
# Name, authors and description embeddings side by side, wrapped as a sparse matrix
train_features_docvec = pd.concat(
    [train_name_features, train_authors_features, train_desc_features],
    axis='columns',
)
dense_features_train_docvec = csr_matrix(train_features_docvec.values)

In [135]:
# Read the doc2vec CSVs for the test split (no header row, comma-separated).
# Note: this rebinds test_*_features from the count-vectorizer matrices to DataFrames.
test_name_features, test_authors_features, test_desc_features = (
    pd.read_csv(csv_path, index_col=False, sep=',', header=None)
    for csv_path in (
        "./project_data_files/book_text_features_doc2vec/test_name_doc2vec100.csv",
        "./project_data_files/book_text_features_doc2vec/test_authors_doc2vec20.csv",
        "./project_data_files/book_text_features_doc2vec/test_desc_doc2vec100.csv",
    )
)
# Name, authors and description embeddings side by side, wrapped as a sparse matrix
test_features_docvec = pd.concat(
    [test_name_features, test_authors_features, test_desc_features],
    axis='columns',
)
dense_features_test_docvec = csr_matrix(test_features_docvec.values)

Combine the doc2vec and count-vectorizer features

In [136]:
# Stack every feature group column-wise, in the same order for both splits:
# selected count-vectorizer columns, remaining columns, doc2vec embeddings, one-hot categoricals
train_features = hstack([
    sparse_features_train_countvec,
    dense_features_train,
    dense_features_train_docvec,
    train_num_spec,
])
X_test = hstack([
    sparse_features_test_countvec,
    dense_features_test,
    dense_features_test_docvec,
    test_num_spec,
])
# Report the shape and container type of the combined training matrix
print(train_features.shape, type(train_features), sep='\n')

(23063, 469)
<class 'scipy.sparse._csr.csr_matrix'>


In [137]:
# Write the combined train and test feature matrices to .npz files in the working directory
save_npz(file='x_train_combination_469.npz', matrix=train_features)
save_npz(file='x_test_469.npz', matrix=X_test)