In [1]:
import argparse
import pandas as pd
import numpy as np
import time
import os
import matplotlib.pyplot as plt
from scipy.sparse.construct import vstack
from features import get_dataframe, update_text, update_ngrams, update_lexicon, upadate_linguistic, update_user, get_features, get_lable
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn.model_selection import cross_val_score

In [2]:
# Experiment configuration, expressed as an argparse parser so the same
# options can be used from a script. Every option is optional and has a
# default; the attribute name on `args` is derived from the flag name.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--train',
    default='data/train.jsonl',
    help='Full path to the training file',
)
parser.add_argument(
    '--test',
    default='data/val.jsonl',
    help='Full path to the evaluation file',
)
parser.add_argument(
    '--user_data',
    default='data/users.json',
    help='Full path to the user data file',
)
parser.add_argument(
    '--model',
    default='Ngram+Lex+Ling+User',
    choices=["Ngram", "Ngram+Lex", "Ngram+Lex+Ling", "Ngram+Lex+Ling+User"],
    help='The name of the model to train and evaluate.',
)
parser.add_argument(
    '--lexicon_path',
    default='lexica/',
    help=('The full path to the directory containing the lexica.'
          ' The last folder of this path should be "lexica".'),
)
parser.add_argument(
    '--outfile',
    default='out.txt',
    help='Full path to the file we will write the model predictions',
)

# Parse an explicitly empty argument list so the kernel's own sys.argv is
# never consulted; every option therefore takes its default value.
args = parser.parse_args([])

In [3]:
# Pickle files (relative to the working directory) that cache the fully
# preprocessed train/test frames between notebook runs.
# NOTE(review): the cache is reused whenever both files exist, regardless of
# the current values in `args`; delete the files to force a rebuild.
df_train_loc = 'df_train.pkl'
df_test_loc = 'df_test.pkl'

if not (os.path.isfile(df_train_loc) and os.path.isfile(df_test_loc)):
    # Cache miss: build the frames from the raw files and time the work.
    start = time.time()

    df_train, df_test = get_dataframe(args.train, args.test)
    # The helpers below are called only for their effect on the two frames;
    # their return values are not used.
    update_text(df_train, df_test)
    update_ngrams(df_train, df_test)
    update_lexicon(df_train, df_test, args.lexicon_path)
    upadate_linguistic(df_train, df_test)
    update_user(df_train, df_test, args.user_data)

    end = time.time()
    print("Data Preprocessiong Cost:", round(end - start),'s.')

    # Persist both frames so the next run can skip preprocessing.
    df_train.to_pickle(df_train_loc)
    df_test.to_pickle(df_test_loc)

else:
    # Cache hit: both pickles exist, so load them instead of recomputing.
    # Only unpickle files you created yourself; pickle is not safe on
    # untrusted input.
    df_train = pd.read_pickle(df_train_loc)
    df_test = pd.read_pickle(df_test_loc)

In [4]:
# Split each frame into the rows whose 'category' is exactly 'Religion'
# (suffix _r) and all remaining rows (suffix _nr).
df_train_r, df_train_nr = (
    df_train.loc[df_train['category'].eq('Religion')],
    df_train.loc[df_train['category'].ne('Religion')],
)
df_test_r, df_test_nr = (
    df_test.loc[df_test['category'].eq('Religion')],
    df_test.loc[df_test['category'].ne('Religion')],
)


In [5]:
# Row counts, in order: Religion train, non-Religion train,
# Religion test, non-Religion test.
print(*(len(frame) for frame in (df_train_r, df_train_nr, df_test_r, df_test_nr)))

370 1222 93 306


In [6]:
import itertools
def get_all_combinations(l : list) -> list:
    """Return every subset of ``l`` as a list of lists.

    Subsets are grouped by size, from the empty subset up to the full list.
    Within one size they appear in the order produced by
    ``itertools.combinations``, which preserves the element order of ``l``.

    Args:
        l: The items to choose from.

    Returns:
        A list containing ``2 ** len(l)`` lists (for distinct positions),
        starting with ``[]`` and ending with a copy of ``l``.
    """
    return [
        list(combo)
        for size in range(len(l) + 1)
        for combo in itertools.combinations(l, size)
    ]


In [7]:
# Grid search over feature subsets.
#
# For every combination of lexicon, linguistic and user feature groups, and
# separately for the Religion and non-Religion subsets, build the feature
# matrices, stack train and test together, and score a liblinear logistic
# regression with 5-fold cross-validated accuracy. One row per run is written
# to df_record.csv.
column_names = ["Religion", "lex_list", "ling_list", "user_list", "scores", "mean_score"]

# (is_religion, train frame, test frame). The flag travels with its frames,
# and the loop below uses its own variable names so the notebook-level
# df_train / df_test are NOT overwritten by this cell.
subsets = [
    (True, df_train_r, df_test_r),
    (False, df_train_nr, df_test_nr),
]

# Rows are accumulated as plain dicts and turned into a DataFrame once at the
# end; growing a DataFrame row by row is quadratic, and DataFrame.append no
# longer exists in pandas >= 2.0.
records = []

# Traverse all possible feature-group combinations.
for lex_list in get_all_combinations(["CL", "NVL"]):
    for ling_list in get_all_combinations(["Length","Modals","Questions", "Links"]):
        for user_list in get_all_combinations(["Gender", "RI"]):
            for religion, df_train_sub, df_test_sub in subsets:

                if religion:
                    print("\n==================Religion==================")
                else:
                    print("\n==================Non-Religion==================")

                x_train, x_test = get_features(df_train_sub, df_test_sub, model = args.model,lex_list=lex_list, ling_list=ling_list, user_list=user_list)

                y_train, y_test = get_lable(df_train_sub, df_test_sub)
                print('total features:', x_train.shape[1])

                # Cross-validation is run on train and test stacked together.
                x = vstack([x_train,x_test])
                # NOTE(review): assumes get_lable returns Python lists so that
                # `+` concatenates; with NumPy arrays it would add
                # element-wise instead -- TODO confirm.
                y = y_train + y_test
                clf = LogisticRegression(solver='liblinear')
                start = time.time()
                scores = cross_val_score(clf, x, y, cv=5 ,scoring='accuracy')
                end = time.time()
                print("Training Model Cost:", round(end - start),'s.')
                print(scores)
                mean_score = np.mean(scores)
                print("CV mean:", mean_score)

                records.append({"Religion":religion, "lex_list":lex_list, "ling_list":ling_list, "user_list":user_list, "scores":scores, "mean_score":mean_score})

# Build the result table in one step, keeping the declared column order.
df_record = pd.DataFrame(records, columns=column_names)

df_record.to_csv(os.path.join('df_record.csv'))
print("Wrote record to df_record.csv.")


Lexicon used: 
Linguistic features: 
User features: 
total features: 2000
Training Model Cost: 0 s.
[0.66666667 0.69892473 0.68817204 0.64130435 0.7173913 ]
CV mean: 0.6824918186068256

Lexicon used: 
Linguistic features: 
User features: 
total features: 2000
Training Model Cost: 0 s.
[0.71895425 0.69281046 0.71568627 0.71147541 0.73770492]
CV mean: 0.7153262616522019

Lexicon used: 
Linguistic features: 
User features: Gender 
total features: 2018
Training Model Cost: 0 s.
[0.66666667 0.69892473 0.68817204 0.64130435 0.7173913 ]
CV mean: 0.6824918186068256

Lexicon used: 
Linguistic features: 
User features: Gender 
total features: 2018
Training Model Cost: 0 s.
[0.71895425 0.69281046 0.71568627 0.71147541 0.73770492]
CV mean: 0.7153262616522019

Lexicon used: 
Linguistic features: 
User features: Religious_Ideology 
total features: 2146
Training Model Cost: 0 s.
[0.66666667 0.69892473 0.68817204 0.64130435 0.7173913 ]
CV mean: 0.6824918186068256

Lexicon used: 
Linguistic features: 



Training Model Cost: 9 s.
[0.76143791 0.76470588 0.73202614 0.73770492 0.75737705]
CV mean: 0.7506503803707275

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals 
User features: Gender 
total features: 2034
Training Model Cost: 3 s.
[0.70967742 0.70967742 0.62365591 0.56521739 0.77173913]
CV mean: 0.6759934548854605

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals 
User features: Gender 
total features: 2034




Training Model Cost: 17 s.
[0.76143791 0.76470588 0.73202614 0.73770492 0.75737705]
CV mean: 0.7506503803707275

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals 
User features: Religious_Ideology 
total features: 2162
Training Model Cost: 2 s.
[0.70967742 0.70967742 0.62365591 0.56521739 0.77173913]
CV mean: 0.6759934548854605

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals 
User features: Religious_Ideology 
total features: 2162




Training Model Cost: 12 s.
[0.76143791 0.76470588 0.73202614 0.73770492 0.75737705]
CV mean: 0.7506503803707275

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals 
User features: Gender Religious_Ideology 
total features: 2180
Training Model Cost: 3 s.
[0.70967742 0.70967742 0.62365591 0.56521739 0.77173913]
CV mean: 0.6759934548854605

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals 
User features: Gender Religious_Ideology 
total features: 2180




Training Model Cost: 12 s.
[0.76143791 0.76470588 0.73202614 0.73770492 0.75737705]
CV mean: 0.7506503803707275

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions 
User features: 
total features: 2016
Training Model Cost: 5 s.
[0.70967742 0.69892473 0.6344086  0.61956522 0.7826087 ]
CV mean: 0.68903693314633

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions 
User features: 
total features: 2016




Training Model Cost: 17 s.
[0.75816993 0.76470588 0.75816993 0.75409836 0.76721311]
CV mean: 0.7604714454087645

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions 
User features: Gender 
total features: 2034
Training Model Cost: 3 s.
[0.70967742 0.69892473 0.6344086  0.61956522 0.7826087 ]
CV mean: 0.68903693314633

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions 
User features: Gender 
total features: 2034




Training Model Cost: 17 s.
[0.75816993 0.76470588 0.75816993 0.75409836 0.76721311]
CV mean: 0.7604714454087645

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions 
User features: Religious_Ideology 
total features: 2162
Training Model Cost: 9 s.
[0.70967742 0.69892473 0.6344086  0.61956522 0.7826087 ]
CV mean: 0.68903693314633

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions 
User features: Religious_Ideology 
total features: 2162




Training Model Cost: 18 s.
[0.75816993 0.76470588 0.75816993 0.75409836 0.76721311]
CV mean: 0.7604714454087645

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions 
User features: Gender Religious_Ideology 
total features: 2180
Training Model Cost: 2 s.
[0.70967742 0.69892473 0.6344086  0.61956522 0.7826087 ]
CV mean: 0.68903693314633

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions 
User features: Gender Religious_Ideology 
total features: 2180




Training Model Cost: 15 s.
[0.75816993 0.76470588 0.75816993 0.75409836 0.76721311]
CV mean: 0.7604714454087645

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Links 
User features: 
total features: 2016
Training Model Cost: 3 s.
[0.68817204 0.72043011 0.65591398 0.63043478 0.73913043]
CV mean: 0.6868162692847125

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Links 
User features: 
total features: 2016
Training Model Cost: 9 s.
[0.7745098  0.74836601 0.77777778 0.7442623  0.73770492]
CV mean: 0.7565241615771992

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Links 
User features: Gender 
total features: 2034
Training Model Cost: 2 s.
[0.68817204 0.72043011 0.65591398 0.63043478 0.73913043]
CV mean: 0.6868162692847125

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Links 
User features: Gender 
total features: 2034
Training Model Cost: 8 s.
[0.7745098  0.74836601 0.77777778 0.7442623  0.73770492]
CV mean: 0.756524161577199



Training Model Cost: 6 s.
[0.76470588 0.74836601 0.75490196 0.74754098 0.76393443]
CV mean: 0.7558898532090431

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals Links 
User features: Religious_Ideology 
total features: 2164
Training Model Cost: 2 s.
[0.67741935 0.74193548 0.64516129 0.59782609 0.80434783]
CV mean: 0.6933380084151473

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals Links 
User features: Religious_Ideology 
total features: 2164




Training Model Cost: 6 s.
[0.76470588 0.74836601 0.75490196 0.74754098 0.76393443]
CV mean: 0.7558898532090431

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals Links 
User features: Gender Religious_Ideology 
total features: 2182
Training Model Cost: 1 s.
[0.67741935 0.74193548 0.64516129 0.59782609 0.80434783]
CV mean: 0.6933380084151473

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals Links 
User features: Gender Religious_Ideology 
total features: 2182




Training Model Cost: 6 s.
[0.76470588 0.74836601 0.75490196 0.74754098 0.76393443]
CV mean: 0.7558898532090431

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions Links 
User features: 
total features: 2018
Training Model Cost: 1 s.
[0.70967742 0.7311828  0.65591398 0.59782609 0.79347826]
CV mean: 0.6976157082748948

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions Links 
User features: 
total features: 2018




Training Model Cost: 7 s.
[0.76470588 0.74509804 0.77777778 0.75737705 0.75081967]
CV mean: 0.7591556841315761

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions Links 
User features: Gender 
total features: 2036
Training Model Cost: 2 s.
[0.72043011 0.72043011 0.65591398 0.60869565 0.77173913]
CV mean: 0.6954417952314166

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions Links 
User features: Gender 
total features: 2036
Training Model Cost: 8 s.
[0.76143791 0.75490196 0.77124183 0.75081967 0.7442623 ]
CV mean: 0.756532733311904

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions Links 
User features: Religious_Ideology 
total features: 2164
Training Model Cost: 2 s.
[0.72043011 0.72043011 0.65591398 0.60869565 0.77173913]
CV mean: 0.6954417952314166

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Questions Links 
User features: Religious_Ideology 
total features: 2164
Training Model Cost: 6 s.
[0.7614



Training Model Cost: 7 s.
[0.76470588 0.74836601 0.76143791 0.75409836 0.75737705]
CV mean: 0.7571970427515268

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals Questions Links 
User features: Gender 
total features: 2038
Training Model Cost: 1 s.
[0.72043011 0.75268817 0.6344086  0.57608696 0.81521739]
CV mean: 0.6997662459093034

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals Questions Links 
User features: Gender 
total features: 2038
Training Model Cost: 6 s.
[0.76143791 0.75490196 0.75490196 0.75409836 0.76065574]
CV mean: 0.757199185685203

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals Questions Links 
User features: Religious_Ideology 
total features: 2166
Training Model Cost: 2 s.
[0.72043011 0.75268817 0.6344086  0.57608696 0.81521739]
CV mean: 0.6997662459093034

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Modals Questions Links 
User features: Religious_Ideology 
total features: 2166
Train