Packages:

In [143]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

Pre-Processing:

In [144]:
book_training_file = "project_data_files/book_rating_train.csv"
book_testing_file = "project_data_files/book_rating_test.csv"

train_names_file = "project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv"
train_authors_file = "project_data_files/book_text_features_doc2vec/train_authors_doc2vec20.csv"
train_desc_file = "project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv"

test_names_file = "project_data_files/book_text_features_doc2vec/test_name_doc2vec100.csv"
test_authors_file = "project_data_files/book_text_features_doc2vec/test_authors_doc2vec20.csv"
test_desc_file = "project_data_files/book_text_features_doc2vec/test_desc_doc2vec100.csv"

train_data = pd.read_csv(book_training_file)
test_data = pd.read_csv(book_testing_file)

word_training_files = [train_names_file, train_authors_file, train_desc_file]
word_testing_files = [test_names_file, test_authors_file, test_desc_file]

word_train_data = [pd.read_csv(filename, index_col = False, delimiter = ',', header=None) for filename in word_training_files]
word_test_data = [pd.read_csv(filename, index_col = False, delimiter = ',', header=None) for filename in word_testing_files]

Evaluation:

In [177]:
def evaluate(true_labels, predicted_labels):
    confusion = confusion_matrix(true_labels, predicted_labels)
    f1_m = f1_score(true_labels, predicted_labels, average="micro")
    f1_w = f1_score(true_labels, predicted_labels, average="weighted")
    accuracy = accuracy_score(true_labels, predicted_labels)
    r2 = r2_score(true_labels, predicted_labels)

    print("Confusion Matrix :\n", confusion)
    print("Accuracy : ", accuracy)
    print("R2 Score : ", r2)
    print("Micro F1 Score : ", f1_m)
    print("Weighted F1 Score : ", f1_w)
    return [confusion, f1_m, f1_w, accuracy, r2]

def evaluate_kfold(label_set):
    true_labels = [t_labels for (t_labels, p_labels) in label_set]
    predicted_labels = [p_labels for (t_labels, p_labels) in label_set]
    return evaluate(true_labels=true_labels, predicted_labels=predicted_labels)

Baseline: 0R

In [146]:
def zero_R(labels):
    ratings, rating_counts = np.unique(labels, return_counts=True)
    num_labels = len(labels)

    probs = [(rating_counts[i] / num_labels, ratings[i]) for i in range(len(ratings))]
    predicted_label = max(probs)[1]
    predicted_labels = [predicted_label] * len(labels)
    return predicted_labels

Comparing:

In [147]:
# 0 -R Baseline
evaluate(train_data['rating_label'], zero_R(train_data['rating_label']))

Confusion Matrix :
 [[    0  5864     0]
 [    0 16208     0]
 [    0   991     0]]
Accuracy :  0.7027706716385552
R2 Score :  -0.1767472937401695
Micro F1 Score :  0.7027706716385552
Weighted F1 Score :  0.5800976316323855


Selected Features and Train-Test Split:

In [148]:
# selected_features = train_data.columns[:-1]
# label = train_data.columns[-1]

# feat_train, feat_valid, label_train, label_valid = train_test_split(train_data[selected_features], train_data[label], test_size=0.2, random_state=1169800)

# text_features = ["Name", "Authors", "Description"]

# vec = CountVectorizer()
# feat_train_transformed_m = [vec.fit_transform(feat_train[f]) for f in text_features]
# feat_valid_fitted_m = [vec.transform(feat_valid[f]) for f in text_features]

In [149]:
# SVM = [SVC()] * len(text_features)
# for feat in range(len(text_features)):
#     SVM[feat].fit(feat_train_transformed_m[feat].toarray(), label_train)

In [150]:
# predicted = []
# for SVM_feat in range(len(SVM)):
#     predicted.append(SVM_feat.predict(feat_valid_fitted_m[SVM_feat]).toarray())

In [151]:
# evaluated = []
# for i in range(len(predicted)):
#     evaluated.append(evaluate(label_valid, predicted[i]))

K Fold Cross Validation on SVM with all 4 datasets combined (Train, Names, Author, Description):

In [171]:
# Select useful features and remove unnecessary features
selected_features = train_data.columns[:-1]
label = train_data.columns[-1]
text_features = ["Name", "Authors", "Description"]
drop = ["Publisher", "Language"]

# Add names, authors and descriptions datasets
all_data = train_data[selected_features]
for f in text_features:
    all_data = all_data.drop(f, axis=1)

for f in drop:
    all_data = all_data.drop(f, axis=1)
    
for i in range(len(word_training_files)):
    new_column_names = {x:text_features[i] + str(x) for x in word_train_data[i].columns}
    all_data = all_data.join(word_train_data[i].rename(columns=new_column_names))

# Use cross validation

CombinedSVM = SVC()
combined_evaluation = []
k_folds = KFold(n_splits=10)
for _, (train_index, validate_index) in enumerate(k_folds.split(all_data)):
    X_train, X_validate = all_data.iloc[train_index], all_data.iloc[validate_index]
    y_train, y_validate = train_data[label].iloc[train_index], train_data[label].iloc[validate_index]
    CombinedSVM.fit(X_train, y_train)
    y_pred = CombinedSVM.predict(X_validate)
    combined_evaluation.append((y_validate, y_pred))

evaluate_kfold(combined_evaluation)

Confusion Matrix :
 [[   0  569    0]
 [   0 1635    0]
 [   0  103    0]]
Accuracy :  0.7087126137841352
R2 Score :  -0.16288964166019082
Micro F1 Score :  0.7087126137841352
Weighted F1 Score :  0.5878970692729888
Confusion Matrix :
 [[   0  598    0]
 [   0 1617    0]
 [   0   92    0]]
Accuracy :  0.7009102730819246
R2 Score :  -0.1916732669857777
Micro F1 Score :  0.7009102730819246
Weighted F1 Score :  0.5776615247571213
Confusion Matrix :
 [[   0  603    0]
 [   0 1595    1]
 [   0  107    1]]
Accuracy :  0.6918075422626788
R2 Score :  -0.17561343757256753
Micro F1 Score :  0.6918075422626788
Weighted F1 Score :  0.5665692007944954
Confusion Matrix :
 [[   0  604    0]
 [   0 1617    0]
 [   0   85    0]]
Accuracy :  0.7012142237640937
R2 Score :  -0.2041428661291289
Micro F1 Score :  0.7012142237640937
Weighted F1 Score :  0.5780593422516134
Confusion Matrix :
 [[   0  604    0]
 [   0 1603    0]
 [   0   99    0]]
Accuracy :  0.6951431049436253
R2 Score :  -0.1866820194525558


In [172]:
test_data2 = test_data
for f in text_features:
    test_data2 = test_data2.drop(f, axis=1)

for f in drop:
    test_data2 = test_data2.drop(f, axis=1)
    
for i in range(len(word_testing_files)):
    new_column_names = {x:text_features[i] + str(x) for x in word_test_data[i].columns}
    test_data2 = test_data2.join(word_test_data[i].rename(columns=new_column_names))
    
predictions = CombinedSVM.predict(test_data2)
predict_data = pd.DataFrame({'id':test_data2.index+1, 'rating_label':predictions})
predict_data.to_csv("1169800 CombinedSVM.csv", index=False)

Averaging 3 SVM's:

In [203]:
# Use cross validation
AverageSVM = [SVC(), SVC(), SVC()]
avg_evaluation = {}
summary = []
i=0
k_folds = KFold(n_splits=10)
for data in word_train_data:
    for _, (train_index, validate_index) in enumerate(k_folds.split(data)):
        X_train, X_validate = data.iloc[train_index], data.iloc[validate_index]
        y_train, y_validate = train_data[label].iloc[train_index], train_data[label].iloc[validate_index]
        AverageSVM[i].fit(X_train, y_train)
        y_pred = AverageSVM[i].predict(X_validate)
        if i in avg_evaluation:
            avg_evaluation[i].append((y_validate, y_pred))
        else:
            avg_evaluation[i] = (y_validate, y_pred)
    i+=1

In [204]:
predict_avg_SVM = []
i=0
for data in word_test_data:
    predict_avg_SVM.append(AverageSVM[i].predict(data))
    i+=1

avg_predicted_dataset = pd.DataFrame({index:predict_avg_SVM[index] for index in range(len(predict_avg_SVM))})
avg_predicted_dataset.to_csv("1169800 AvgSVM.csv", index=True)

In [226]:
x = {}
j=0
for (t, p) in avg_evaluation:
    x[j]=[]
    for i in range(10):
        x[j].append((t,p))
    j+=1
for 

{0: [(0       4.0
   1       4.0
   2       4.0
   3       4.0
   4       3.0
          ... 
   2302    4.0
   2303    3.0
   2304    4.0
   2305    4.0
   2306    4.0
   Name: rating_label, Length: 2307, dtype: float64,
   array([4., 4., 4., ..., 4., 4., 4.])),
  (0       4.0
   1       4.0
   2       4.0
   3       4.0
   4       3.0
          ... 
   2302    4.0
   2303    3.0
   2304    4.0
   2305    4.0
   2306    4.0
   Name: rating_label, Length: 2307, dtype: float64,
   array([4., 4., 4., ..., 4., 4., 4.])),
  (0       4.0
   1       4.0
   2       4.0
   3       4.0
   4       3.0
          ... 
   2302    4.0
   2303    3.0
   2304    4.0
   2305    4.0
   2306    4.0
   Name: rating_label, Length: 2307, dtype: float64,
   array([4., 4., 4., ..., 4., 4., 4.])),
  (0       4.0
   1       4.0
   2       4.0
   3       4.0
   4       3.0
          ... 
   2302    4.0
   2303    3.0
   2304    4.0
   2305    4.0
   2306    4.0
   Name: rating_label, Length: 2307, dtype: float64,

Random Forest Classifier:

In [None]:
X_train, X_validate, y_train, y_validate = train_test_split(all_data, train_data[label], test_size=0.2)
RFC = RandomForestClassifier()
RFC.fit(X_train, y_train)
evaluate(y_validate, RFC.predict(X_validate))

Confusion Matrix :
 [[   7 1138    0]
 [  16 3235    0]
 [   0  217    0]]
Accuracy :  0.7027964448298287
R2 Score :  -0.16649710184328903
Micro F1 Score :  0.7027964448298287
Weighted F1 Score :  0.5844973961111398


[array([[   7, 1138,    0],
        [  16, 3235,    0],
        [   0,  217,    0]], dtype=int64),
 0.7027964448298287,
 0.5844973961111398,
 0.7027964448298287,
 -0.16649710184328903]

In [None]:
RFC_predict = RFC.predict(test_data2)
RFC_predict_data = pd.DataFrame({'id':test_data2.index+1, 'rating_label':RFC_predict})
RFC_predict_data.to_csv("1169800 RFC.csv", index=False)