## Import Needed Libraries

In [None]:
import numpy as np
import pandas as pd
import nltk
import re
import logging
import warnings
import os
from collections import defaultdict
from sklearn.model_selection import train_test_split
from pandas import ExcelWriter
import ast

# ML Training 
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline 
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate

# Configure root logging (INFO level, "LEVEL - HH:MM:SS: message" format);
# originally set up to monitor gensim.
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

# hide warnings
# NumPy 2.0 removed the top-level `np.VisibleDeprecationWarning` alias; the
# class is exposed as `np.exceptions.VisibleDeprecationWarning` (NumPy >= 1.25).
# Fall back to the old location for older NumPy releases.
try:
    visible_deprecation_warning = np.exceptions.VisibleDeprecationWarning
except AttributeError:
    visible_deprecation_warning = np.VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=visible_deprecation_warning)

# General settings
RANDOM_SEED = 11

# Fetch the WordNet corpus through nltk's downloader.
nltk.download('wordnet')

# Seed NumPy's global random generator.
np.random.seed(RANDOM_SEED)

# Do not truncate long cell contents when pandas displays a frame.
pd.set_option('display.max_colwidth', None)

# Domain to run the experiment on; must be one of the DATASET_URLS keys.
#domain = "Social_Networking"
#domain = "Games"
domain = "Productivity"

# Download links per domain:
#   (text-representation CSV URL, all-features CSV URL)
DATASET_URLS = {
    "Social_Networking": (
        "https://drive.google.com/uc?id=1-obCouGNTpXLS0UPbwazeuZMNRQCpO_V&export=download",
        # A stray "/" that used to follow "id=" was removed from this link.
        "https://drive.google.com/uc?id=1QGRqvdnVUst9BQh3_KetuRcrRxWZZSxR&export=download",
    ),
    "Games": (
        "https://drive.google.com/uc?id=1mh6gURnnFJ-KeBRpmF9gHjtGhoQ2D6OI&export=download",
        "https://drive.google.com/uc?id=15zkGkJe5SXA6jEaxUiUE5OQyOC0A7uHg&export=download",
    ),
    "Productivity": (
        "https://drive.google.com/uc?id=1AT7-wtIblIlY6HZYzwyQZy_cQWoh8pvE&export=download",
        "https://drive.google.com/uc?id=0t2J8cXcII_4VgGzcvFgSPTD0PekqcRC&export=download",
    ),
}

# Fail immediately on an unknown domain instead of hitting a NameError later,
# when the URL variables are first used.
if domain not in DATASET_URLS:
    raise ValueError(
        f"Unknown domain {domain!r}; expected one of {sorted(DATASET_URLS)}"
    )

text_representation_url, all_features_url = DATASET_URLS[domain]

# Load training & testing data

In [None]:
# Load the text-representation table for the selected domain.
df_1 = pd.read_csv(text_representation_url)

# The vector columns are read from the CSV as strings holding Python literals;
# parse each cell with ast.literal_eval and wrap the result in a NumPy array.
vector_columns = (
    "tfidf",
    "w2v",
    "ds_w2v",
    "w2v_tfidf",
    "ds_w2v_tfidf",
    "elmo",
    "bert",
    "fine_tuned_bert",
)
for vector_column in vector_columns:
    df_1[vector_column] = [
        np.array(ast.literal_eval(cell)) for cell in df_1[vector_column]
    ]

In [None]:
# All Features
# Columns removed from the all-features table before it is joined with df_1
# (presumably because df_1 already carries them -- confirm against the CSVs).
dropped_columns = [
    'sentence',
    'sentence_clean',
    'rating',
    'sentiment',
    'sentiment_id',
    'category',
    'category_id',
]
df_2 = pd.read_csv(all_features_url).drop(columns=dropped_columns)

# Place both tables side by side; rows are matched on their index.
df = pd.concat((df_1, df_2), axis="columns")

In [None]:
# Re-arrange columns: sentence/label columns first, then the text
# representations, then the remaining feature columns.

label_columns = ['sentence', 'sentiment', 'sentiment_id']

representation_columns = [
    'tfidf', 'w2v', 'ds_w2v', 'w2v_tfidf', 'ds_w2v_tfidf',
    'elmo', 'bert', 'fine_tuned_bert',
]

extra_feature_columns = [
    'W-negative_Association', 'W-positive_Association',
    'num_generic_pos', 'num_generic_neg',
    'num_tfidf_pos', 'num_tfidf_neg',
    'num_PMI_pos', 'num_PMI_neg',
    'num_tfidf_pos + synonyms', 'num_tfidf_neg + synonyms',
    'num_PMI_pos + synonyms', 'num_PMI_neg + synonyms',
    'num_generic_pos + synonyms', 'num_generic_neg + synonyms',
    'num_pos_emojis', 'num_neg_emojis',
    'rating', 'category_id', 'aspect_dependent_sentiment_score',
]

df = df[label_columns + representation_columns + extra_feature_columns]

## Training

In [None]:
# Show the column (feature) names of df. As the last expression of the
# notebook cell, the value is rendered as the cell output.
df.columns

In [None]:
# Select features for the ablation study.
#
# Features are picked by column POSITION in df, so these slices depend on the
# column order of df at this point.
ablation_slices = (
    slice(11, 13),
    slice(17, 19),
    slice(27, None),
)
df_features = pd.concat(
    [df.iloc[:, column_slice] for column_slice in ablation_slices],
    axis=1,
)

# Alternative: use every column from position 11 onwards.
#df_features = df.iloc[:, 11:]
df_features.columns

In [None]:
%%time

# Evaluate every classifier on every text representation (concatenated with
# the selected extra features) using repeated stratified 10-fold
# cross-validation, then export the mean accuracies to an Excel sheet with one
# row per classifier and one column per text representation.

# Create the output folder up front so the export at the end cannot fail on a
# missing directory after the long-running evaluation.
output_dir = "Results/ACP/"
os.makedirs(output_dir, exist_ok=True)
output_path = output_dir + domain + "_best.xlsx"


classifiers = {"SVM": LinearSVC(max_iter=100, random_state=11),
              "MLP": MLPClassifier(max_iter=100, random_state=11),
               "DT": DecisionTreeClassifier(random_state=11),
               "GNB": GaussianNB(),
               "LR": LogisticRegression(random_state=11),
               "KNN": KNeighborsClassifier()
              }

text_representations = ["tfidf", "w2v", "ds_w2v", "w2v_tfidf", "ds_w2v_tfidf", "elmo", "bert", "fine_tuned_bert"]

scoring = ['accuracy']

# Header row of the exported sheet: classifier name, then one column per
# text representation.
col_names = ['model'] + text_representations
output_results = []

# Loop invariants: extra feature matrix, target labels and the CV splitter.
# The splitter uses an integer random_state, so reusing it yields the same
# folds on every call.
extra_features = df_features.values
Y = df["sentiment_id"]
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=11)

for name, classifier in classifiers.items():

    # One result row: classifier name followed by its accuracy per representation.
    averaged_results = [name]

    for text_representation in text_representations:

        np.random.seed(11)

        print("\ncreate the pipeline for " + name + " and " +  text_representation)

        # Stack the per-row representation vectors into a 2-D matrix and append
        # the extra feature columns.
        X = np.concatenate((np.array(df[text_representation].tolist()), extra_features), axis=1)
        #X = np.array(df[text_representation].tolist())

        results = cross_validate(classifier, X, Y, cv=rskf, scoring=scoring)

        # Mean accuracy over all folds and repeats, as a percentage.
        acc = np.mean(results['test_accuracy']) * 100
        print('\nAccuracy:', acc, '%')
        averaged_results.append(acc)
    output_results.append(averaged_results)

# export classifier's results
result_df = pd.DataFrame(output_results, columns=col_names)

# ExcelWriter.save() was removed in pandas 2.0; the context manager writes and
# closes the workbook, and also releases the file handle if the export fails.
with ExcelWriter(output_path) as writer:
    result_df.to_excel(writer, index=False)

# Ring the terminal bell to signal that the run has finished.
os.system("printf '\a'") # or '\7'