Packages:

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

Pre-Processing:

In [12]:
book_training_file = "project_data_files/book_rating_train.csv"
book_testing_file = "project_data_files/book_rating_test.csv"

train_names_file = "project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv"
train_authors_file = "project_data_files/book_text_features_doc2vec/train_authors_doc2vec20.csv"
train_desc_file = "project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv"

test_names_file = "project_data_files/book_text_features_doc2vec/test_name_doc2vec100.csv"
test_authors_file = "project_data_files/book_text_features_doc2vec/test_authors_doc2vec20.csv"
test_desc_file = "project_data_files/book_text_features_doc2vec/test_desc_doc2vec100.csv"

train_data = pd.read_csv(book_training_file)
test_data = pd.read_csv(book_testing_file)

word_training_files = [train_names_file, train_authors_file, train_desc_file]
word_testing_files = [test_names_file, test_authors_file, test_desc_file]

word_train_data = [pd.read_csv(filename, index_col = False, delimiter = ',', header=None) for filename in word_training_files]
word_test_data = [pd.read_csv(filename, index_col = False, delimiter = ',', header=None) for filename in word_testing_files]

Evaluation:

In [13]:
def evaluate(true_labels, predicted_labels):
    confusion = confusion_matrix(true_labels, predicted_labels)
    f1_m = f1_score(true_labels, predicted_labels, average="micro")
    f1_w = f1_score(true_labels, predicted_labels, average="weighted")
    accuracy = accuracy_score(true_labels, predicted_labels)
    r2 = r2_score(true_labels, predicted_labels)

    print("Confusion Matrix :\n", confusion)
    print("Accuracy : ", accuracy)
    print("R2 Score : ", r2)
    print("Micro F1 Score : ", f1_m)
    print("Weighted F1 Score : ", f1_w)

Baseline: 0R

In [14]:
def zero_R(labels):
    ratings, rating_counts = np.unique(labels, return_counts=True)
    num_labels = len(labels)

    probs = [(rating_counts[i] / num_labels, ratings[i]) for i in range(len(ratings))]
    predicted_label = max(probs)[1]
    predicted_labels = [predicted_label] * len(labels)
    return predicted_labels

Comparing:

In [15]:
# 0 -R Baseline
evaluate(train_data['rating_label'], zero_R(train_data['rating_label']))

Confusion Matrix :
 [[    0  5864     0]
 [    0 16208     0]
 [    0   991     0]]
Accuracy :  0.7027706716385552
R2 Score :  -0.1767472937401695
Micro F1 Score :  0.7027706716385552
Weighted F1 Score :  0.5800976316323855


Selected Features and Train-Test Split:

In [16]:
# selected_features = train_data.columns[:-1]
# label = train_data.columns[-1]

# feat_train, feat_valid, label_train, label_valid = train_test_split(train_data[selected_features], train_data[label], test_size=0.2, random_state=1169800)

# text_features = ["Name", "Authors", "Description"]

# vec = CountVectorizer()
# feat_train_transformed_m = [vec.fit_transform(feat_train[f]) for f in text_features]
# feat_valid_fitted_m = [vec.transform(feat_valid[f]) for f in text_features]

In [17]:
# SVM = [SVC()] * len(text_features)
# for feat in range(len(text_features)):
#     SVM[feat].fit(feat_train_transformed_m[feat].toarray(), label_train)

In [18]:
# predicted = []
# for SVM_feat in range(len(SVM)):
#     predicted.append(SVM_feat.predict(feat_valid_fitted_m[SVM_feat]).toarray())

In [19]:
# evaluated = []
# for i in range(len(predicted)):
#     evaluated.append(evaluate(label_valid, predicted[i]))

In [93]:
selected_features = train_data.columns[:-1]
label = train_data.columns[-1]

text_features = ["Name", "Authors", "Description"]
drop = ["Publisher", "Language"]
all_data = train_data[selected_features]
for f in text_features:
    all_data = all_data.drop(f, axis=1)
for f in drop:
    all_data = all_data.drop(f, axis=1)
for i in range(len(word_testing_files)):
    new_column_names = {x:text_features[i] + str(x) for x in word_train_data[i].columns}
    all_data = all_data.join(word_train_data[i].rename(columns=new_column_names))

In [94]:
feat_train, feat_valid, label_train, label_valid = train_test_split(all_data, train_data[label], test_size=0.2, random_state=1169800)

Fit SVM:

In [95]:
SVM = SVC()
SVM.fit(feat_train, label_train)

Random Forest Classifier:

In [96]:
RFC = RandomForestClassifier()

Predict SVM:

In [102]:
Models = [SVM]
predicted_labels = []

for model in Models:
    predicted_labels.append(model.predict(feat_valid))

In [103]:
evaluate(label_valid, predicted_labels[0])

Confusion Matrix :
 [[   0 1169    0]
 [   0 3235    0]
 [   0  209    0]]
Accuracy :  0.7012789941469759
R2 Score :  -0.16956406066183694
Micro F1 Score :  0.7012789941469759
Weighted F1 Score :  0.5781441248892627
