Packages:

In [110]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

Pre-Processing:

In [111]:
# CSV locations for the rating tables.
book_training_file = "project_data_files/book_rating_train.csv"
book_testing_file = "project_data_files/book_rating_test.csv"

# CSV locations for the per-column text vectors (name, authors, description) of the training split.
train_names_file = "project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv"
train_authors_file = "project_data_files/book_text_features_doc2vec/train_authors_doc2vec20.csv"
train_desc_file = "project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv"

# Same three vector files for the test split.
test_names_file = "project_data_files/book_text_features_doc2vec/test_name_doc2vec100.csv"
test_authors_file = "project_data_files/book_text_features_doc2vec/test_authors_doc2vec20.csv"
test_desc_file = "project_data_files/book_text_features_doc2vec/test_desc_doc2vec100.csv"

# Keep both lists in name / authors / description order: later cells pair them
# positionally with the text column names.
word_training_files = [train_names_file, train_authors_file, train_desc_file]
word_testing_files = [test_names_file, test_authors_file, test_desc_file]

train_data = pd.read_csv(book_training_file)
test_data = pd.read_csv(book_testing_file)

# The vector files are read without a header row, so pandas labels their columns 0..n-1.
vector_csv_options = {"index_col": False, "delimiter": ',', "header": None}
word_train_data = [pd.read_csv(path, **vector_csv_options) for path in word_training_files]
word_test_data = [pd.read_csv(path, **vector_csv_options) for path in word_testing_files]

Evaluation:

In [112]:
def evaluate(true_labels, predicted_labels):
    """Print a confusion matrix and summary metrics for a set of predictions.

    Parameters
    ----------
    true_labels : array-like
        Reference labels.
    predicted_labels : array-like
        Predicted labels, compared against ``true_labels``.

    Returns
    -------
    None
        All results are written to stdout; nothing is returned.
    """
    cm = confusion_matrix(true_labels, predicted_labels)
    micro_f1 = f1_score(true_labels, predicted_labels, average="micro")
    weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted")
    acc = accuracy_score(true_labels, predicted_labels)
    # R2 treats the labels as numeric values rather than categories.
    r_squared = r2_score(true_labels, predicted_labels)

    report = (
        ("Confusion Matrix :\n", cm),
        ("Accuracy : ", acc),
        ("R2 Score : ", r_squared),
        ("Micro F1 Score : ", micro_f1),
        ("Weighted F1 Score : ", weighted_f1),
    )
    for heading, value in report:
        print(heading, value)

Baseline: Zero-R (always predict the most frequent rating)

In [113]:
def zero_R(labels):
    """Zero-R baseline: predict the most frequent label for every instance.

    Parameters
    ----------
    labels : array-like
        Observed class labels.

    Returns
    -------
    list
        ``len(labels)`` copies of the majority label. When several labels
        share the highest count, the largest of them is used. An empty
        input yields an empty list.
    """
    num_labels = len(labels)
    if num_labels == 0:
        # Nothing to count: without this guard the majority lookup below would raise.
        return []

    ratings, rating_counts = np.unique(labels, return_counts=True)

    # np.unique returns the labels sorted ascending, so the last entry among the
    # most frequent ones is the largest tied label.
    most_frequent = ratings[rating_counts == rating_counts.max()]
    predicted_label = most_frequent[-1]
    return [predicted_label] * num_labels

Comparing:

In [114]:
# Zero-R baseline, scored against the same training labels it is derived from.
true_ratings = train_data['rating_label']
evaluate(true_ratings, zero_R(true_ratings))

Confusion Matrix :
 [[    0  5864     0]
 [    0 16208     0]
 [    0   991     0]]
Accuracy :  0.7027706716385552
R2 Score :  -0.1767472937401695
Micro F1 Score :  0.7027706716385552
Weighted F1 Score :  0.5800976316323855


Selected Features and Train-Test Split:

In [115]:
# selected_features = train_data.columns[:-1]
# label = train_data.columns[-1]

# feat_train, feat_valid, label_train, label_valid = train_test_split(train_data[selected_features], train_data[label], test_size=0.2, random_state=1169800)

# text_features = ["Name", "Authors", "Description"]

# vec = CountVectorizer()
# feat_train_transformed_m = [vec.fit_transform(feat_train[f]) for f in text_features]
# feat_valid_fitted_m = [vec.transform(feat_valid[f]) for f in text_features]

In [116]:
# SVM = [SVC()] * len(text_features)
# for feat in range(len(text_features)):
#     SVM[feat].fit(feat_train_transformed_m[feat].toarray(), label_train)

In [117]:
# predicted = []
# for SVM_feat in range(len(SVM)):
#     predicted.append(SVM_feat.predict(feat_valid_fitted_m[SVM_feat]).toarray())

In [118]:
# evaluated = []
# for i in range(len(predicted)):
#     evaluated.append(evaluate(label_valid, predicted[i]))

In [119]:
# Every column but the last is a candidate feature; the last column is used as the label.
selected_features = train_data.columns[:-1]
label = train_data.columns[-1]

text_features = ["Name", "Authors", "Description"]
drop = ["Publisher", "Language"]

# Remove the raw text columns and the discarded columns in one pass.
all_data = train_data[selected_features].drop(columns=text_features + drop)

# Append each vector table, prefixing its numeric column labels with the name of
# the text column it stands in for (e.g. "Name0", "Name1", ...).
for i, vectors in enumerate(word_train_data):
    new_column_names = {x: text_features[i] + str(x) for x in vectors.columns}
    all_data = all_data.join(vectors.rename(columns=new_column_names))

In [125]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
feat_train, feat_valid, label_train, label_valid = train_test_split(
    all_data,
    train_data[label],
    test_size=0.2,
    random_state=0,
)

Fit SVM:

In [128]:
# Default-parameter SVC fitted on the complete training table (not only the
# train split); this is the model used for the test-set predictions.
SVM = SVC()
SVM.fit(X=all_data, y=train_data[label])

In [132]:
features_to_use = train_data.columns
text_features = ["Name", "Authors", "Description"]
drop = ["Publisher", "Language"]

# Apply the same column removal as for the training features; `test_data` itself is left untouched.
test_data2 = test_data.drop(columns=text_features + drop)

# Append the test-split vector tables with the same "<text column><n>" naming used for training.
for i, vectors in enumerate(word_test_data):
    new_column_names = {x: text_features[i] + str(x) for x in vectors.columns}
    test_data2 = test_data2.join(vectors.rename(columns=new_column_names))

predictions = SVM.predict(test_data2)

In [141]:
# Ids are the row positions shifted to start at 1 — presumably the numbering the
# submission format expects; confirm against the task description.
submission_columns = {'id': test_data2.index + 1, 'rating_label': predictions}
predict_data = pd.DataFrame(submission_columns)
predict_data.to_csv("1169800.csv", index=False)

In [None]:
# Linear-kernel SVC checked with 5-fold cross-validation.
# Note: despite its name, `scores` holds the predicted label for each training
# row, not a score per fold.
clf = SVC(kernel='linear', C=1)
scores = cross_val_predict(estimator=clf, X=all_data, y=train_data[label], cv=5)

Random Forest Classifier:

Predict on the validation split with the SVM:

In [127]:
# `cross_val_predict` fits clones of the estimator it is given, so `clf` itself
# is still unfitted at this point. Fit every candidate on the training split
# before asking it for predictions on the held-out validation split.
Models = [clf]
predicted_labels = []

for model in Models:
    model.fit(feat_train, label_train)
    predicted_labels.append(model.predict(feat_valid))

NameError: name 'clf' is not defined

In [None]:
# `Models` holds a single estimator, so its validation predictions sit at position 0.
evaluate(true_labels=label_valid, predicted_labels=predicted_labels[0])

Confusion Matrix :
 [[  75 1073    0]
 [  80 3182    5]
 [   1  189    8]]
Accuracy :  0.7077823542163452
R2 Score :  -0.17441777198875852
Micro F1 Score :  0.7077823542163452
Weighted F1 Score :  0.616382366754909


In [None]:
# Seed the forest so repeated runs build the same trees; 0 matches the seed
# used for the train/validation split.
RFC = RandomForestClassifier(random_state=0)

In [136]:
# Display the training frame's row index; the vector tables are joined onto the
# features by index, so this shows what they are aligned against.
train_data.index

RangeIndex(start=0, stop=23063, step=1)