In [None]:
import argparse
import pandas as pd
import numpy as np
import time
import os
import matplotlib.pyplot as plt
from scipy.sparse.construct import vstack
from features import get_dataframe, update_text, update_ngrams, update_lexicon, upadate_linguistic, update_user, get_features, get_lable
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
# Notebook configuration, expressed as an argparse parser so the defaults
# below double as the settings used by the later cells.
parser = argparse.ArgumentParser()

# Input data locations.
parser.add_argument(
    '--train',
    default='data/train.jsonl',
    help='Full path to the training file',
)
parser.add_argument(
    '--test',
    default='data/val.jsonl',
    help='Full path to the evaluation file',
)
parser.add_argument(
    '--user_data',
    default='data/users.json',
    help='Full path to the user data file',
)

# Which feature groups the model is built from.
parser.add_argument(
    '--model',
    default='Ngram+Lex+Ling+User',
    choices=["Ngram", "Ngram+Lex", "Ngram+Lex+Ling", "Ngram+Lex+Ling+User"],
    help='The name of the model to train and evaluate.',
)

# Lexicon directory and prediction output file.
parser.add_argument(
    '--lexicon_path',
    default='lexica/',
    help='The full path to the directory containing the lexica.'
         ' The last folder of this path should be "lexica".',
)
parser.add_argument(
    '--outfile',
    default='out.txt',
    help='Full path to the file we will write the model predictions',
)

# An explicit empty argument list keeps argparse away from the kernel's own
# sys.argv, so every option simply takes its default value.
args = parser.parse_args([])

In [None]:
# Cache files for the fully featurised frames, relative to the working directory.
df_train_loc = 'df_train.pkl'
df_test_loc = 'df_test.pkl'

if os.path.isfile(df_train_loc) and os.path.isfile(df_test_loc):
    # Both caches exist: skip preprocessing and reload the saved frames.
    # NOTE(review): the cache is looked up by file name only, so changing
    # args.train / args.test / args.lexicon_path / args.user_data does NOT
    # invalidate it -- delete the .pkl files to force a rebuild.
    # NOTE(review): unpickling can execute arbitrary code; only load cache
    # files that this notebook produced itself.
    df_train = pd.read_pickle(df_train_loc)
    df_test = pd.read_pickle(df_test_loc)
else:
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    df_train, df_test = get_dataframe(args.train, args.test)
    # The return values of the update_* helpers are ignored here; presumably
    # they add their feature columns to both frames in place -- TODO confirm
    # against features.py.
    update_text(df_train, df_test)
    update_ngrams(df_train, df_test)
    update_lexicon(df_train, df_test, args.lexicon_path)
    upadate_linguistic(df_train, df_test)
    update_user(df_train, df_test, args.user_data)

    end = time.perf_counter()
    print("Data Preprocessing Cost:", round(end - start), 's.')

    # Persist the frames so later runs can take the fast path above.
    df_train.to_pickle(df_train_loc)
    df_test.to_pickle(df_test_loc)

In [None]:
# Partition each split by debate category: rows whose 'category' is exactly
# 'Religion' versus every other row.
df_train_r = df_train.loc[df_train['category'].eq('Religion')]
df_train_nr = df_train.loc[df_train['category'].ne('Religion')]
df_test_r = df_test.loc[df_test['category'].eq('Religion')]
df_test_nr = df_test.loc[df_test['category'].ne('Religion')]


In [None]:
# Row counts of the four partitions: train/Religion, train/other, test/Religion, test/other.
print(*(len(part) for part in (df_train_r, df_train_nr, df_test_r, df_test_nr)))

In [None]:
import itertools
def get_all_combinations(l : list) -> list:
    """Return every subset of ``l`` as a list of lists.

    Subsets are ordered by increasing size (starting with the empty list),
    and within one size they follow the order produced by
    ``itertools.combinations``, i.e. the element order of ``l``.
    """
    return [
        list(combo)
        for size in range(len(l) + 1)
        for combo in itertools.combinations(l, size)
    ]


In [None]:
# Exhaustive feature ablation: for every combination of lexicon, linguistic
# and user feature groups, cross-validate a logistic regression separately on
# the Religion and the non-Religion partition and record the scores.
column_names = ["Religion", "lex_list", "ling_list", "user_list", "scores", "mean_score"]

# (is_religion, train frame, test frame) for the two topic partitions.
# The loop below unpacks these into its own names so the notebook-level
# df_train / df_test are never rebound; otherwise re-running the partition
# cell afterwards would operate on the last (non-Religion) subset only.
topic_splits = [
    (True, df_train_r, df_test_r),
    (False, df_train_nr, df_test_nr),
]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and re-copies the frame on
# every call.
records = []

# Traverse all possible feature-group combinations.
for lex_list in get_all_combinations(["CL", "NVL"]):
    for ling_list in get_all_combinations(["Length","Modals","Questions", "Links"]):
        for user_list in get_all_combinations(["Gender", "RI"]):
            for religion, train_part, test_part in topic_splits:

                print("\n==================Religion=================="
                      if religion else
                      "\n==================Non-Religion==================")

                x_train, x_test = get_features(train_part, test_part, model = args.model,
                                               lex_list=lex_list, ling_list=ling_list,
                                               user_list=user_list)
                y_train, y_test = get_lable(train_part, test_part)
                print('total features:', x_train.shape[1])

                # Train and test rows are pooled; the 5-fold cross-validation
                # below draws its own folds from the pooled data.
                x = vstack([x_train,x_test])
                # NOTE(review): assumes get_lable returns Python lists so that
                # `+` concatenates; with numpy arrays it would add
                # element-wise instead -- TODO confirm.
                y = y_train + y_test
                clf = LogisticRegression(solver='liblinear')

                start = time.perf_counter()
                scores = cross_val_score(clf, x, y, cv=5 ,scoring='accuracy')
                end = time.perf_counter()
                print("Training Model Cost:", round(end - start),'s.')
                print(scores)
                mean_score = np.mean(scores)
                print("CV mean:", mean_score)

                records.append({"Religion":religion, "lex_list":lex_list, "ling_list":ling_list,
                                "user_list":user_list, "scores":scores, "mean_score":mean_score})

# One row per (feature combination, partition), columns in the declared order.
df_record = pd.DataFrame(records, columns=column_names)

df_record.to_csv(os.path.join('df_record.csv'))
print("Wrote record to df_record.csv.")