Packages:

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

Pre-Processing:

In [2]:
book_training_file = "project_data_files/book_rating_train.csv"
book_testing_file = "project_data_files/book_rating_test.csv"

train_names_file = "project_data_files/book_text_features_doc2vec/train_name_doc2vec100.csv"
train_authors_file = "project_data_files/book_text_features_doc2vec/train_authors_doc2vec20.csv"
train_desc_file = "project_data_files/book_text_features_doc2vec/train_desc_doc2vec100.csv"

test_names_file = "project_data_files/book_text_features_doc2vec/test_name_doc2vec100.csv"
test_authors_file = "project_data_files/book_text_features_doc2vec/test_authors_doc2vec20.csv"
test_desc_file = "project_data_files/book_text_features_doc2vec/test_desc_doc2vec100.csv"

train_data = pd.read_csv(book_training_file)
test_data = pd.read_csv(book_testing_file)

word_training_files = [train_names_file, train_authors_file, train_desc_file]
word_testing_files = [test_names_file, test_authors_file, test_desc_file]

word_train_data = [pd.read_csv(filename, index_col = False, delimiter = ',', header=None) for filename in word_training_files]
word_test_data = [pd.read_csv(filename, index_col = False, delimiter = ',', header=None) for filename in word_testing_files]

In [None]:
#"all_data" Dataset:
# Select useful features and remove unnecessary features
selected_features = train_data.columns[:-1]
label = train_data.columns[-1]
text_features = ["Name", "Authors", "Description"]
drop = ["Publisher", "Language"]

# Add names, authors and descriptions datasets
all_data = train_data[selected_features]
for f in text_features:
    all_data = all_data.drop(f, axis=1)

for f in drop:
    all_data = all_data.drop(f, axis=1)
    
for i in range(len(word_training_files)):
    new_column_names = {x:text_features[i] + str(x) for x in word_train_data[i].columns}
    all_data = all_data.join(word_train_data[i].rename(columns=new_column_names))

#"test2" dataset corresponding to "all_data"
test_data2 = test_data
for f in text_features:
    test_data2 = test_data2.drop(f, axis=1)

for f in drop:
    test_data2 = test_data2.drop(f, axis=1)
    
for i in range(len(word_testing_files)):
    new_column_names = {x:text_features[i] + str(x) for x in word_test_data[i].columns}
    test_data2 = test_data2.join(word_test_data[i].rename(columns=new_column_names))


#"strings_data" Dataset:
new_column_names = {x:text_features[0] + str(x) for x in word_train_data[0].columns}
strings_data = word_train_data[0].rename(columns=new_column_names)
for i in range(len(word_testing_files)-1):
    new_column_names = {x:text_features[i+1] + str(x) for x in word_train_data[i+1].columns}
    strings_data = strings_data.join(word_train_data[i+1].rename(columns=new_column_names))

#"strings_data_test" dataset corresponding to "strings_data"
new_column_names = {x:text_features[0] + str(x) for x in word_test_data[0].columns}
strings_data_test = word_test_data[0].rename(columns=new_column_names)
for i in range(len(word_testing_files)-1):
    new_column_names = {x:text_features[i+1] + str(x) for x in word_test_data[i+1].columns}
    strings_data_test = strings_data_test.join(word_test_data[i+1].rename(columns=new_column_names))

Evaluation:

In [3]:
def evaluate(true_labels, predicted_labels):
    confusion = confusion_matrix(true_labels, predicted_labels)
    f1_m = f1_score(true_labels, predicted_labels, average="micro")
    f1_w = f1_score(true_labels, predicted_labels, average="weighted")
    accuracy = accuracy_score(true_labels, predicted_labels)
    r2 = r2_score(true_labels, predicted_labels)

    print("Confusion Matrix :\n", confusion)
    print("Accuracy : ", accuracy)
    print("R2 Score : ", r2)
    print("Micro F1 Score : ", f1_m)
    print("Weighted F1 Score : ", f1_w)
    return [confusion, f1_m, f1_w, accuracy, r2]

def evaluate_kfold(label_set):
    true_labels = [t_labels.tolist() for (t_labels, p_labels) in label_set]
    t_labels = []
    for x in true_labels:
        t_labels.extend(x)
    predicted_labels = [p_labels.tolist() for (t_labels, p_labels) in label_set]
    p_labels = []
    for x in predicted_labels:
        p_labels.extend(x)
    return evaluate(true_labels=t_labels, predicted_labels=p_labels)

Baseline: 0R

In [4]:
def zero_R(labels):
    ratings, rating_counts = np.unique(labels, return_counts=True)
    num_labels = len(labels)

    probs = [(rating_counts[i] / num_labels, ratings[i]) for i in range(len(ratings))]
    predicted_label = max(probs)[1]
    predicted_labels = [predicted_label] * len(labels)
    return predicted_labels

Comparing:

In [5]:
# 0 -R Baseline
_0R = evaluate(train_data['rating_label'], zero_R(train_data['rating_label']))

Confusion Matrix :
 [[    0  5864     0]
 [    0 16208     0]
 [    0   991     0]]
Accuracy :  0.7027706716385552
R2 Score :  -0.1767472937401695
Micro F1 Score :  0.7027706716385552
Weighted F1 Score :  0.5800976316323855


K Fold Cross Validation on SVM with all 4 datasets combined (Train, Names, Author, Description):

In [6]:
# Use cross validation
CombinedSVM = SVC()
combined_evaluation = []
k_folds = KFold(n_splits=10)
for _, (train_index, validate_index) in enumerate(k_folds.split(all_data)):
    X_train, X_validate = all_data.iloc[train_index], all_data.iloc[validate_index]
    y_train, y_validate = train_data[label].iloc[train_index], train_data[label].iloc[validate_index]
    CombinedSVM.fit(X_train, y_train)
    y_pred = CombinedSVM.predict(X_validate)
    combined_evaluation.append((y_validate, y_pred))

combined = evaluate_kfold(combined_evaluation)

Confusion Matrix :
 [[    0  5864     0]
 [    0 16205     3]
 [    0   990     1]]
Accuracy :  0.702683952651433
R2 Score :  -0.17709061898998435
Micro F1 Score :  0.702683952651433
Weighted F1 Score :  0.580135711234597


In [7]:
predictions = CombinedSVM.predict(test_data2)
predict_data = pd.DataFrame({'id':test_data2.index+1, 'rating_label':predictions})
predict_data.to_csv("1169800 CombinedSVM.csv", index=False)

In [8]:
# Use cross validation
StringsSVM = SVC()
strings_evaluation = []
k_folds = KFold(n_splits=10)
for _, (train_index, validate_index) in enumerate(k_folds.split(strings_data)):
    X_train, X_validate = strings_data.iloc[train_index], strings_data.iloc[validate_index]
    y_train, y_validate = train_data[label].iloc[train_index], train_data[label].iloc[validate_index]
    StringsSVM.fit(X_train, y_train)
    y_pred = StringsSVM.predict(X_validate)
    strings_evaluation.append((y_validate, y_pred))

strings_only = evaluate_kfold(strings_evaluation)

Confusion Matrix :
 [[   71  5793     0]
 [   30 16178     0]
 [    0   991     0]]
Accuracy :  0.704548410874561
R2 Score :  -0.1697091261189665
Micro F1 Score :  0.704548410874561
Weighted F1 Score :  0.5865697187363557


Averaging 3 SVM's:

In [9]:
# Use cross validation
AverageSVM = [SVC(), SVC(), SVC()]
avg_evaluation = {}
summary = []
i=0
k_folds = KFold(n_splits=10)
for data in word_train_data:
    for _, (train_index, validate_index) in enumerate(k_folds.split(data)):
        X_train, X_validate = data.iloc[train_index], data.iloc[validate_index]
        y_train, y_validate = train_data[label].iloc[train_index], train_data[label].iloc[validate_index]
        AverageSVM[i].fit(X_train, y_train)
        y_pred = AverageSVM[i].predict(X_validate)
        if i in avg_evaluation:
            avg_evaluation[i].append((y_validate, y_pred))
        else:
            avg_evaluation[i] = [(y_validate, y_pred)]
    i+=1

In [10]:
predict_avg_SVM = []
i=0
for data in word_test_data:
    predict_avg_SVM.append(AverageSVM[i].predict(data))
    i+=1

avg_predicted_dataset = pd.DataFrame({index:predict_avg_SVM[index] for index in range(len(predict_avg_SVM))})

In [11]:
summary_dict = {}
for index in avg_evaluation.keys():
    summary_dict[AverageSVM[index]] = evaluate_kfold(avg_evaluation[index])

Confusion Matrix :
 [[   36  5828     0]
 [   17 16191     0]
 [    0   991     0]]
Accuracy :  0.7035945020162164
R2 Score :  -0.17348570386692908
Micro F1 Score :  0.7035945020162164
Weighted F1 Score :  0.5833662411730984
Confusion Matrix :
 [[   11  5853     0]
 [   16 16192     0]
 [    2   989     0]]
Accuracy :  0.7025538741707497
R2 Score :  -0.1786355826141508
Micro F1 Score :  0.7025538741707497
Weighted F1 Score :  0.5809024644274099
Confusion Matrix :
 [[   72  5792     0]
 [   30 16178     0]
 [    0   991     0]]
Accuracy :  0.7045917703681222
R2 Score :  -0.16953746349405918
Micro F1 Score :  0.7045917703681222
Weighted F1 Score :  0.5866687613763812


In [12]:
final_avg_values = []
for instance in avg_predicted_dataset.iterrows():
    ratings, counts = np.unique([instance[1][i] for i in range(len(AverageSVM))], return_counts=True)
    val = sorted([(counts[i], ratings[i]) for i in range(len(ratings))])
    final_avg_values.append(val[0][1])

avg_SVM_predicted = pd.DataFrame({'id':avg_predicted_dataset.index+1, 'rating_label':final_avg_values})
avg_SVM_predicted.to_csv("1169800 AvgSVM.csv", index=False)

In [13]:
highest_accuracy_SVM = AverageSVM[2].predict(word_test_data[2])
highest_accuracy_SVM_predicted = pd.DataFrame({'id':word_test_data[2].index+1, 'rating_label':highest_accuracy_SVM})
highest_accuracy_SVM_predicted.to_csv("1169800 AccurateSVM.csv", index=False)

Random Forest Classifier:

In [14]:
X_train, X_validate, y_train, y_validate = train_test_split(strings_data, train_data[label], test_size=0.2)
RFC = RandomForestClassifier()
RFC.fit(X_train, y_train)
RFC_summary = evaluate(y_validate, RFC.predict(X_validate))

Confusion Matrix :
 [[   7 1209    0]
 [  10 3213    0]
 [   0  174    0]]
Accuracy :  0.6980273141122914
R2 Score :  -0.20644758299654598
Micro F1 Score :  0.6980273141122914
Weighted F1 Score :  0.5771972499416257


In [16]:
RFC_predict = RFC.predict(strings_data_test)
RFC_predict_data = pd.DataFrame({'id':strings_data_test.index+1, 'rating_label':RFC_predict})
RFC_predict_data.to_csv("1169800 RFC.csv", index=False)

Use all for Logistic Regression:

In [19]:
estimators = [('rfc', RandomForestClassifier()), ('svm', SVC())]
X_train_stack, X_test_stack, y_train_stack, y_test_stack = train_test_split(strings_data, train_data[label])
stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=10, n_jobs=-1)
stack.fit(X_train_stack, y_train_stack)
stack_eval = evaluate(y_test_stack, stack.predict(X_test_stack))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion Matrix :
 [[  14 1396    0]
 [   5 4098    0]
 [   0  253    0]]
Accuracy :  0.7131460284425946
R2 Score :  -0.15596635188272212
Micro F1 Score :  0.7131460284425946
Weighted F1 Score :  0.5968881287313864


In [18]:
Stacked_predict = stack.predict(strings_data_test)
Stacked_predict_data = pd.DataFrame({'id':strings_data_test.index+1, 'rating_label':Stacked_predict})
Stacked_predict_data.to_csv("1169800 Stack.csv", index=False)