In [1]:
# import library
import pickle
import warnings

import numpy
import pandas as pd
import sklearn
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn import svm
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# import data
train_df = pd.read_csv("./project_data_files/book_rating_train.csv")
test_df = pd.read_csv("./project_data_files/book_rating_test.csv")


def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes or fails, instead of being left open.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising, so
    # it must only be pointed at trusted files. Unpickling estimators with a
    # different scikit-learn version than the one that saved them is also not
    # guaranteed to work (see scikit-learn's model persistence notes).
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# The pickles hold already-fitted text vectorizers: each one exposes
# `vocabulary_` and is used below through `transform` only (never re-fitted).

# train_name_countvectorizer
train_name_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_name_countvectorizer.pkl")
train_name_dic = train_name_countvectorizer.vocabulary_

# train_authors_countvectorizer
train_authors_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_authors_countvectorizer.pkl")
train_authors_dic = train_authors_countvectorizer.vocabulary_

# train_desc_countvectorizer
train_desc_countvectorizer = _load_pickle("./project_data_files/book_text_features_countvec/train_desc_countvectorizer.pkl")
# (the double underscore in this name is kept so existing references still resolve)
train_desc__dic = train_desc_countvectorizer.vocabulary_

# process vector features: one sparse matrix per text column
train_name_features = train_name_countvectorizer.transform(train_df['Name'])
train_authors_features = train_authors_countvectorizer.transform(train_df['Authors'])
train_desc_features = train_desc_countvectorizer.transform(train_df['Description'])
# every remaining column except the text columns, Publisher, Language and the target
other_features_df = train_df.drop(columns=['Name', 'Authors', 'Description', 'Publisher', 'Language', 'rating_label'])
# combined text features (name | authors | description)
sparse_features = hstack([train_name_features, train_authors_features, train_desc_features])
# remaining columns converted to sparse format so they can be stacked with the text features
dense_features = csr_matrix(other_features_df.values)
# full training feature matrix
train_features = hstack([sparse_features, dense_features])

# split the labelled data into training and validation sets (80% / 20%, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(train_features, train_df["rating_label"], test_size=0.2, random_state=42)
print(X_train.shape)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


(18450, 126888)


In [9]:
# process for test set: the vectorizers loaded for the training data are reused
# so the test text columns are encoded with the same vocabularies
test_name_features = train_name_countvectorizer.transform(test_df['Name'])
test_authors_features = train_authors_countvectorizer.transform(test_df['Authors'])
test_desc_features = train_desc_countvectorizer.transform(test_df['Description'])
# no 'rating_label' is dropped here -- presumably the test file does not carry
# one; the width check at the end of this cell fails loudly if that is wrong
# NOTE: the three names below are the same ones the training cell assigned, so
# after this cell they refer to the test-set versions.
other_features_df = test_df.drop(columns=['Name', 'Authors', 'Description', 'Publisher', 'Language'])

# combined test text features (name | authors | description)
sparse_features = hstack([test_name_features, test_authors_features, test_desc_features])
# remaining test columns in sparse format, then the full test feature matrix
dense_features = csr_matrix(other_features_df.values)
X_test = hstack([sparse_features, dense_features])

# The models are fitted on splits of `train_features`, so the test matrix must
# have exactly the same number of columns to be usable for prediction.
assert X_test.shape[1] == train_features.shape[1], (
    f"test matrix has {X_test.shape[1]} columns but the training matrix has "
    f"{train_features.shape[1]}"
)

In [19]:
def output_predictions(prediction, path='./predictions.csv'):
    """Write predicted rating labels to a CSV file with 1-based ids.

    The file has two columns: ``id`` (1, 2, 3, ... in the order of
    ``prediction``) and ``rating_label``.

    Parameters
    ----------
    prediction : array-like
        Predicted labels, one entry per row to export.
    path : str, optional
        Destination CSV file. Defaults to ``'./predictions.csv'``; pass a
        different path per model so one export does not overwrite another.

    Returns
    -------
    None
    """
    output_df = pd.DataFrame({'rating_label': prediction})
    # shift the index by one so the exported ids start at 1 instead of 0
    output_df.index += 1
    output_df.index.name = 'id'
    output_df.to_csv(path)

In [14]:
# Keep the 1000 columns with the highest chi-squared score against the label.
# The selector is fitted on the training split only; both splits are then
# projected onto the selected columns.
x2 = SelectKBest(score_func=chi2, k=1000)
x2.fit(X_train, y_train)

X_train_x2, X_val_x2 = (x2.transform(split) for split in (X_train, X_val))
print(X_train_x2.shape)

(18450, 1000)


In [15]:
# Baseline: default logistic regression fitted on the full feature matrix,
# evaluated by accuracy on the validation split.
lgr = LogisticRegression().fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_val, lgr.predict(X_val)))

Accuracy: 0.7106004769130717


In [16]:
# Same default logistic regression, this time fitted on the chi-squared-selected
# columns and evaluated by accuracy on the matching validation matrix.
lgr = LogisticRegression().fit(X_train_x2, y_train)
print("Accuracy:", accuracy_score(y_val, lgr.predict(X_val_x2)))

Accuracy: 0.7123347062649036


In [20]:
# Tune the iteration cap of logistic regression on the chi-squared-selected
# features. `GridSearchCV` and `warnings` are imported in the import cell at the
# top of the notebook.

# Define the parameter grid to search over
param_grid = {'max_iter': [100, 500, 1000, 5000, 10000]}

# Create a logistic regression classifier
lgr = LogisticRegression()

# Use GridSearchCV (3-fold cross-validation) to find the best max_iter value
grid_search = GridSearchCV(lgr, param_grid, cv=3)

# Silence UserWarning (and its subclasses) only while the search is fitting.
# Using catch_warnings restores the previous filters afterwards, so warnings
# raised by later cells are not hidden as well.
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=UserWarning)
    grid_search.fit(X_train_x2, y_train)

# Print the best parameters and the accuracy on the held-out validation split
print("Best max_iter value:", grid_search.best_params_['max_iter'])
print("Accuracy:", grid_search.score(X_val_x2, y_val))

Best max_iter value: 10000
Accuracy: 0.7106004769130717


In [22]:
# Project the test matrix onto the columns chosen by the fitted selector, then
# predict with the grid-searched logistic regression and export the labels.
X_test_x2 = x2.transform(X_test)
prediction_logistic = grid_search.predict(X_test_x2)
output_predictions(prediction=prediction_logistic)

In [23]:
# try an SVM model: RBF kernel with a fixed gamma, fitted on the selected
# features and scored on the validation split
C = 1.0
SVM_classifier = svm.SVC(C=C, gamma=0.7, kernel='rbf').fit(X_train_x2, y_train)
print(SVM_classifier.score(X_val_x2, y_val))

0.708649468892261


In [None]:
#try polynomial kernel for svm
#svm_poly = svm.SVC(kernel='poly', degree=3, gamma='auto', C=C)
#svm_poly.fit(X_train_x2, y_train)
#print(svm_poly.score(X_val_x2, y_val))

In [26]:
# Predict test labels with the RBF SVM and export them.
# NOTE: this call writes ./predictions.csv, replacing the file produced from
# the logistic-regression predictions earlier in the notebook.
prediction_svm = SVM_classifier.predict(X_test_x2)
output_predictions(prediction=prediction_svm)