## Import Needed Libraries

In [4]:
import numpy as np
import pandas as pd
import nltk
import re
import logging
import warnings
import os
from pandas import ExcelWriter
import ast
import math
import gdown

# ML Training
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler

# Configure logging: INFO level, "LEVEL - HH:MM:SS: message" layout.
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
)


# Notebook-wide settings
pd.set_option('display.max_colwidth', None)  # no limit on displayed column width
np.random.seed(11)                           # seed NumPy's global RNG
nltk.download('wordnet')                     # fetch the WordNet corpus for NLTK

# Domain whose dataset is processed; switch by (un)commenting one line below.
domain = "Social_Networking"
#domain = "Games"
#domain = "Productivity"

# Not populated in this cell: the text representations are downloaded to the
# local file "text_representation.csv" instead of being read from a URL.
text_representation_url = ""
all_features_url = ""

# Per-domain Google Drive sources:
#   (URL of the text-representation CSV, direct-download URL of the
#    all-features CSV).
_DOMAIN_SOURCES = {
    "Social_Networking": (
        "https://drive.google.com/uc?id=1lBePyxYUnt07yK196EqmvI6UQub7YTgi",
        "https://drive.usercontent.google.com/uc?id=1w32MbVF9KWLHfYk9dPFtRGqc9sh0Ia4w&export=download",
    ),
    "Games": (
        "https://drive.google.com/uc?id=12ct1U1PEZhp0-N68E-_d_F4E3Fh7S20c",
        "https://drive.usercontent.google.com/uc?id=1m7WECvmlxLlwIBVfdHiB7EQ6QR2v2D7O&export=download",
    ),
    "Productivity": (
        "https://drive.google.com/uc?id=1eYFcPxsHvYm8XCFVJouNnxAwdfeOz6GP",
        "https://drive.usercontent.google.com/uc?id=18657xYO5IVqC3enmMfCYy_RfgEHkvpNb&export=download",
    ),
}

# Fail loudly on a mistyped domain instead of silently skipping the download
# and leaving all_features_url empty for the cells that follow.
if domain not in _DOMAIN_SOURCES:
    raise ValueError(
        f"Unknown domain {domain!r}; expected one of {sorted(_DOMAIN_SOURCES)}"
    )

_text_representation_source, all_features_url = _DOMAIN_SOURCES[domain]

# Download the text representations next to the notebook.
gdown.download(_text_representation_source, "text_representation.csv", quiet=False)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Downloading...
From (original): https://drive.google.com/uc?id=1lBePyxYUnt07yK196EqmvI6UQub7YTgi
From (redirected): https://drive.google.com/uc?id=1lBePyxYUnt07yK196EqmvI6UQub7YTgi&confirm=t&uuid=eb275c77-465c-4c05-84e3-d739ab1d8a6e
To: /content/text_representation.csv
100%|██████████| 326M/326M [00:04<00:00, 75.4MB/s]


## Load Data

In [6]:
# Load the text representations downloaded to "text_representation.csv".
df_1 = pd.read_csv("text_representation.csv")

# Each representation column holds Python literals serialized as text;
# parse every cell with ast.literal_eval and wrap the result in a NumPy array.
for _column in (
    'tfidf',
    'w2v',
    'ds_w2v',
    'w2v_tfidf',
    'ds_w2v_tfidf',
    'elmo',
    'bert',
    'fine_tuned_bert',
):
    df_1[_column] = [np.array(ast.literal_eval(cell)) for cell in df_1[_column]]

In [7]:
# Read the all-features table and discard the three columns that df_1
# already provides under the same names.
df_2 = pd.read_csv(all_features_url).drop(
    columns=['sentence', 'category', 'category_id']
)

# Place both tables side by side (column-wise); rows are aligned on the index.
df = pd.concat([df_1, df_2], axis=1)

## Training

In [9]:
# Display the column (feature) names of the combined frame: the df_1 columns
# followed by the df_2 columns.
df.columns

Index(['sentence', 'category', 'category_id', 'sentiment', 'sentiment_id',
       'category_id_sentiment_id', 'rating', 'tfidf', 'w2v', 'ds_w2v',
       'w2v_tfidf', 'ds_w2v_tfidf', 'elmo', 'bert', 'fine_tuned_bert',
       'Unnamed: 0', 'sentiment', 'sentiment_id', 'rating', 'sentence_clean',
       'W-negative_Association', 'W-positive_Association', 'num_generic_pos',
       'num_generic_neg', 'num_generic_pos + synonyms',
       'num_generic_neg + synonyms', 'num_tfidf_pos', 'num_tfidf_neg',
       'num_tfidf_pos + synonyms', 'num_tfidf_neg + synonyms', 'num_PMI_pos',
       'num_PMI_neg', 'num_PMI_pos + synonyms', 'num_PMI_neg + synonyms',
       'num_pos_emojis', 'num_neg_emojis', 'aspect_dependent_sentiment_score'],
      dtype='object')

In [None]:
# Select the extra (non-embedding) feature columns used in the ablation study.
# Selection is positional (iloc) because df contains duplicate column labels.

# Alternative subset, kept for reference (column positions refer to a frame
# layout that is not shown here):
#df_features_1 = df.iloc[:, 16:32]
#df_features_2 = df.iloc[:, 40:48]
#df_features = pd.concat([df_features_1,df_features_2], axis=1)

# Take every column from position 16 to the end.
# NOTE(review): according to the column listing printed by the previous cell,
# position 16 is the second 'sentiment' column and the slice also contains
# 'sentence_clean' -- confirm these columns are numeric / intended, since
# df_features.values is concatenated into the model input.
df_features = df.iloc[:, 16:]
df_features.columns

In [None]:
%%time

# Cross-validate every classifier on every text representation (concatenated
# with the selected extra features) and export the mean micro-F1 scores, one
# row per classifier and one column per representation, to an Excel file.

# Create the output folder up front so a missing directory is detected before
# the long-running training instead of at export time.
os.makedirs("Results/ACD", exist_ok=True)
results_path = "Results/ACD/" + domain + "_all.xlsx"

classifiers = {"SVM": LinearSVC(max_iter=100, random_state=11),
               "MLP": MLPClassifier(max_iter=100, random_state=11),
               "DT": DecisionTreeClassifier(random_state=11),
               "GNB": GaussianNB(),
               "LR": LogisticRegression(random_state=11),
               "KNN": KNeighborsClassifier()
              }

text_representations = ["tfidf", "w2v", "ds_w2v", "w2v_tfidf", "ds_w2v_tfidf", "elmo", "bert", "fine_tuned_bert"]

scoring = ['f1_micro']

# Header of the result table: model name followed by one column per representation.
col_names = ['model'] + text_representations
output_results = []

# The target is the same for every run.
Y = df["category_id"]

for name, classifier in classifiers.items():

    # Result row for this classifier: [name, score_1, score_2, ...]
    averaged_results = [name]

    for text_representation in text_representations:

        # Re-seed NumPy's global RNG before every run; SMOTE() below is
        # created without an explicit random_state.
        np.random.seed(11)

        print("\ncreate the pipeline for " + name + " and " +  text_representation)

        # Oversample with SMOTE, then fit the classifier. The imblearn
        # Pipeline is used so that SMOTE is part of the cross-validated
        # estimator rather than a separate preprocessing step.
        pipeline = Pipeline([
            ('smote', SMOTE()),
            ('cls', classifier)
        ])

        # Input matrix: the representation vectors side by side with the
        # selected extra features.
        X = np.concatenate((np.array(df[text_representation].tolist()), df_features.values), axis=1)
        #X = np.array(df[text_representation].tolist())

        # 10-fold stratified cross-validation repeated 10 times.
        srkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=11)
        results = cross_validate(pipeline, X, Y, cv=srkf, scoring=scoring)

        # Mean micro-F1 over all folds and repeats, as a percentage.
        f1 = np.mean(results['test_f1_micro']) * 100
        print('Micro F1-Score:', f1, '%')
        averaged_results.append(f1)
    output_results.append(averaged_results)


# export classifier's results
result_df = pd.DataFrame(output_results, columns=col_names)

# ExcelWriter.save() was removed in pandas 2.0; the context manager writes and
# closes the workbook on exit, and the file is only opened once results exist.
with ExcelWriter(results_path) as writer:
    result_df.to_excel(writer, index=False)

# Ring the terminal bell to signal that the run has finished.
os.system("printf '\a'") # or '\7'