In [2]:
# import library
import pickle
import warnings

import numpy
import pandas as pd
import sklearn
from scipy.sparse import csr_matrix, hstack
from sklearn import svm
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split

In [3]:
# import data
train_df = pd.read_csv("./project_data_files/book_rating_train.csv")
test_df = pd.read_csv("./project_data_files/book_rating_test.csv")


def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file handle afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code -- only load files
    # that come from a trusted source.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# The pickled vectorizers are used as-is below (transform is called without
# fitting them here) -- presumably they were fitted on the training text; confirm.

# train_name_countvectorizer
train_name_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_name_countvectorizer.pkl")
train_name_dic = train_name_countvectorizer.vocabulary_

# train_authors_countvectorizer
train_authors_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_authors_countvectorizer.pkl")
train_authors_dic = train_authors_countvectorizer.vocabulary_

# train_desc_countvectorizer
train_desc_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_desc_countvectorizer.pkl")
train_desc_dic = train_desc_countvectorizer.vocabulary_
# Misspelled legacy name, kept so any cell that still references it keeps working.
train_desc__dic = train_desc_dic

# process vector features
train_name_features = train_name_countvectorizer.transform(train_df['Name'])
train_authors_features = train_authors_countvectorizer.transform(train_df['Authors'])
train_desc_features = train_desc_countvectorizer.transform(train_df['Description'])
# Everything that is neither a text column, a dropped categorical column, nor the target.
other_features_df_train = train_df.drop(columns=['Name', 'Authors', 'Description', 'Publisher', 'Language', 'rating_label'])


In [4]:
def x2_feature_selection(X_train, X_test, y_train, feature_num):
    """Keep the `feature_num` columns with the highest chi-squared score.

    The selector is fitted on `X_train` / `y_train` only; the same column
    subset is then taken from `X_test`.

    Parameters
    ----------
    X_train : feature matrix used to fit the selector.
    X_test : feature matrix reduced with the already-fitted selector.
    y_train : target labels aligned with the rows of `X_train`.
    feature_num : number of columns to keep (passed to SelectKBest as `k`).

    Returns
    -------
    tuple
        `(X_train_selected, X_test_selected)`.
    """
    # NOTE(review): per the scikit-learn docs chi2 expects non-negative
    # feature values -- confirm before using this on anything but counts.
    selector = SelectKBest(score_func=chi2, k=feature_num)
    selector.fit(X_train, y_train)
    return selector.transform(X_train), selector.transform(X_test)

In [5]:
# Vectorise the test-set text columns with the same vectorizer objects that
# were applied to the training columns, so both share one vocabulary.
test_name_features = train_name_countvectorizer.transform(test_df["Name"])
test_authors_features = train_authors_countvectorizer.transform(test_df["Authors"])
test_desc_features = train_desc_countvectorizer.transform(test_df["Description"])

# Remaining columns of the test frame (no 'rating_label' is dropped here).
other_features_df_test = test_df.drop(
    ["Name", "Authors", "Description", "Publisher", "Language"],
    axis=1,
)


In [6]:
# chi-squared feature selection on the count-vectorised text columns
# NOTE(review): y_train here is the full label column and the selectors are
# fitted before the train/validation split in a later cell, so validation rows
# take part in choosing the features -- confirm this is acceptable.
y_train = train_df['rating_label']

# keep 100 name columns, 20 author columns and 100 description columns
train_name_countvec_x2, test_name_countvec_x2 = x2_feature_selection(
    X_train=train_name_features, X_test=test_name_features, y_train=y_train, feature_num=100
)
train_author_countvec_x2, test_author_countvec_x2 = x2_feature_selection(
    X_train=train_authors_features, X_test=test_authors_features, y_train=y_train, feature_num=20
)
train_desc_countvec_x2, test_desc_countvec_x2 = x2_feature_selection(
    X_train=train_desc_features, X_test=test_desc_features, y_train=y_train, feature_num=100
)

# selected columns side by side: name | authors | description
sparse_features_train_countvec = hstack(
    (train_name_countvec_x2, train_author_countvec_x2, train_desc_countvec_x2)
)
sparse_features_test_countvec = hstack(
    (test_name_countvec_x2, test_author_countvec_x2, test_desc_countvec_x2)
)

# remaining (non-text) columns wrapped as CSR so they can be hstack-ed later
dense_features_train = csr_matrix(other_features_df_train.to_numpy())
dense_features_test = csr_matrix(other_features_df_test.to_numpy())


The next several cells load the doc2vec features.

In [7]:
# Load the doc2vec features for the training text columns.
# They get their own *_doc2vec names so the count-vectorizer matrices stored in
# train_name_features / train_authors_features / train_desc_features are not
# overwritten; the chi2 selection cell can then be re-run at any time.
train_name_doc2vec = pd.read_csv(r"./project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv", index_col = False, delimiter = ',', header=None)
train_authors_doc2vec = pd.read_csv(r"./project_data_files/book_text_features_doc2vec/train_authors_doc2vec20.csv", index_col = False, delimiter = ',', header=None)
train_desc_doc2vec = pd.read_csv(r"./project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv", index_col = False, delimiter = ',', header=None)

# Same column selection as other_features_df_train; kept under this name for
# any cell that still refers to it.
other_features_df = train_df.drop(columns=['Name', 'Authors', 'Description', 'Publisher', 'Language', 'rating_label'])

# pd.concat(axis=1) aligns on the index and pads with NaN, so a row-count
# mismatch would otherwise go unnoticed until much later.
assert all(
    len(part) == len(train_df)
    for part in (train_name_doc2vec, train_authors_doc2vec, train_desc_doc2vec)
), "doc2vec feature files do not have one row per training example"

train_features_docvec = pd.concat([train_name_doc2vec, train_authors_doc2vec, train_desc_doc2vec], axis=1)
dense_features_train_docvec = csr_matrix(train_features_docvec.values)

In [8]:
# Load the doc2vec features for the test text columns.
# They get their own *_doc2vec names so the count-vectorizer matrices stored in
# test_name_features / test_authors_features / test_desc_features are not
# overwritten; the chi2 selection cell can then be re-run at any time.
test_name_doc2vec = pd.read_csv(r"./project_data_files/book_text_features_doc2vec/test_name_doc2vec100.csv", index_col = False, delimiter = ',', header=None)
test_authors_doc2vec = pd.read_csv(r"./project_data_files/book_text_features_doc2vec/test_authors_doc2vec20.csv", index_col = False, delimiter = ',', header=None)
test_desc_doc2vec = pd.read_csv(r"./project_data_files/book_text_features_doc2vec/test_desc_doc2vec100.csv", index_col = False, delimiter = ',', header=None)

# pd.concat(axis=1) aligns on the index and pads with NaN, so a row-count
# mismatch would otherwise go unnoticed until much later.
assert all(
    len(part) == len(test_df)
    for part in (test_name_doc2vec, test_authors_doc2vec, test_desc_doc2vec)
), "doc2vec feature files do not have one row per test example"

test_features_docvec = pd.concat([test_name_doc2vec, test_authors_doc2vec, test_desc_doc2vec], axis=1)
dense_features_test_docvec = csr_matrix(test_features_docvec.values)

In [10]:
# Final design matrices, columns in the order:
# selected count-vector columns | remaining tabular columns | doc2vec columns
train_features = hstack(
    (sparse_features_train_countvec, dense_features_train, dense_features_train_docvec)
)
X_test = hstack(
    (sparse_features_test_countvec, dense_features_test, dense_features_test_docvec)
)
print(sparse_features_train_countvec.shape)

(23063, 220)


In [13]:
# Hold out 20% of the labelled rows for validation; the seed is fixed so the
# split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train_df["rating_label"],
    test_size=0.2,
    random_state=42,
)
print(X_train.shape)
print(X_train)

(18450, 444)
  (0, 206)	1.0
  (0, 220)	1993.0
  (0, 221)	6.0
  (0, 222)	1.0
  (0, 223)	398.0
  (0, 224)	-0.06392902880907059
  (0, 225)	0.12211732566356658
  (0, 226)	-0.009757831692695616
  (0, 227)	0.07373512536287308
  (0, 228)	-0.006081403698772192
  (0, 229)	-0.01131432130932808
  (0, 230)	0.00944425631314516
  (0, 231)	0.14090394973754883
  (0, 232)	-0.03458491712808609
  (0, 233)	-0.031706225126981735
  (0, 234)	-0.18284228444099423
  (0, 235)	0.003114692401140928
  (0, 236)	-0.11357323080301283
  (0, 237)	-0.08756700903177261
  (0, 238)	0.12100431323051453
  (0, 239)	-0.04909089207649231
  (0, 240)	0.0918167605996132
  (0, 241)	-0.07013675570487976
  (0, 242)	-0.15027473866939545
  (0, 243)	-0.22050502896308896
  :	:
  (18449, 419)	1.019339084625244
  (18449, 420)	-0.10736381262540816
  (18449, 421)	-0.6075002551078796
  (18449, 422)	-0.4448202550411224
  (18449, 423)	-0.6948765516281128
  (18449, 424)	-1.7883001565933228
  (18449, 425)	-0.03296919912099838
  (18449, 426)	0.244

Define a helper function that writes the predictions to a CSV file.

In [14]:
def output_predictions(prediction, output_path='./predictions.csv'):
    """Write predicted rating labels to a CSV file.

    The file has an ``id`` column numbered from 1 and a ``rating_label``
    column holding `prediction`.

    Parameters
    ----------
    prediction : array-like
        One predicted label per row.
    output_path : str or path-like, default './predictions.csv'
        Destination file. An existing file at this path is overwritten, so
        pass a distinct path per model to keep several sets of predictions.

    Returns
    -------
    None
    """
    output_df = pd.DataFrame({'rating_label': prediction})
    # ids start at 1 instead of pandas' default 0
    output_df.index = pd.RangeIndex(start=1, stop=len(output_df) + 1, name='id')
    output_df.to_csv(output_path)

Then fit two models to evaluate the feature selection.

In [None]:
# Baseline: logistic regression with scikit-learn's default settings.
lgr = LogisticRegression().fit(X_train, y_train)

# Accuracy on the held-out validation rows.
print("Accuracy:", accuracy_score(y_val, lgr.predict(X_val)))

# Predict the test set and write the predictions to ./predictions.csv.
prediction_logistic = lgr.predict(X_test)
output_predictions(prediction=prediction_logistic)

Accuracy: 0.7132018209408194


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Define the parameter grid to search over.
# max_iter is the solver's iteration cap; the candidates below check whether a
# larger cap changes the cross-validated score.
param_grid = {'max_iter': [100, 500, 1000, 2000, 3000, 5000]}

# Create a logistic regression classifier
lgr = LogisticRegression()

# Use GridSearchCV (3-fold) to find the best max_iter value
grid_search = GridSearchCV(lgr, param_grid, cv=3)

# Silence UserWarnings only while the search runs, so warnings raised by
# later cells are still shown.
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=UserWarning)
    grid_search.fit(X_train, y_train)

# Print the best parameters and accuracy score
print("Best max_iter value:", grid_search.best_params_['max_iter'])
print("Accuracy:", grid_search.score(X_val, y_val))

Best max_iter value: 5000
Accuracy: 0.7127682636028615


In [None]:
# Predict the test set with the grid-searched logistic regression.
# output_predictions writes to ./predictions.csv, replacing the file written
# by the baseline model.
prediction_logistic_grid = grid_search.predict(X_test)
output_predictions(prediction=prediction_logistic_grid)

In [None]:
# Second model family: support-vector classifier with an RBF kernel.
C = 1.0
SVM_classifier = svm.SVC(C=C, gamma=0.7, kernel='rbf').fit(X_train, y_train)

# Accuracy on the held-out validation rows.
print(accuracy_score(y_val, SVM_classifier.predict(X_val)))

0.7112508129200087


In [None]:
# Predict the test set with the RBF-kernel SVM; ./predictions.csv is replaced.
prediction_svm = SVM_classifier.predict(X_test)
output_predictions(prediction=prediction_svm)

In [18]:
# SVM with a linear kernel (same C as the RBF run).
C = 1.0
svm_linear = svm.SVC(C=C, kernel='linear').fit(X_train, y_train)

# Accuracy on the held-out validation rows.
print(svm_linear.score(X=X_val, y=y_val))

0.7140689356167353


In [17]:
# Predict the test set with the linear-kernel SVM fitted in the preceding cell
# (`svm_linear`); no model named `svm_poly` is defined in this notebook.
# ./predictions.csv is replaced with these predictions.
prediction_svm_linear = svm_linear.predict(X_test)
output_predictions(prediction_svm_linear)