In [1]:
import argparse
import pandas as pd
import numpy as np
import time
import os
import matplotlib.pyplot as plt
from scipy.sparse.construct import vstack
from features import get_dataframe, update_text, update_ngrams, update_lexicon, upadate_linguistic, update_user, get_features, get_lable
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn.model_selection import cross_val_score

In [2]:
# Notebook configuration expressed as an argparse parser. Every option is
# optional; the defaults below are what the rest of the notebook reads via
# `args`.
parser = argparse.ArgumentParser()

# (option name, extra add_argument keywords) in registration order.
_OPTION_SPECS = (
    ('train', {'default': 'data/train.jsonl',
               'help': 'Full path to the training file'}),
    ('test', {'default': 'data/val.jsonl',
              'help': 'Full path to the evaluation file'}),
    ('user_data', {'default': 'data/users.json',
                   'help': 'Full path to the user data file'}),
    ('model', {'default': 'Ngram+Lex+Ling+User',
               'choices': ["Ngram", "Ngram+Lex", "Ngram+Lex+Ling", "Ngram+Lex+Ling+User"],
               'help': 'The name of the model to train and evaluate.'}),
    ('lexicon_path', {'default': 'lexica/',
                      'help': 'The full path to the directory containing the lexica.'
                              ' The last folder of this path should be "lexica".'}),
    ('outfile', {'default': 'out.txt',
                 'help': 'Full path to the file we will write the model predictions'}),
)

for _name, _kwargs in _OPTION_SPECS:
    parser.add_argument('--' + _name, dest=_name, required=False, **_kwargs)

# Parse an explicitly empty argument list so that only the defaults are used
# (argparse would otherwise read the kernel's own sys.argv).
args = parser.parse_args([])

In [3]:
# Pickle caches for the fully pre-processed frames, relative to the working
# directory. Both files must be present for the cache to be used.
df_train_loc = 'df_train.pkl'
df_test_loc = 'df_test.pkl'

if not (os.path.isfile(df_train_loc) and os.path.isfile(df_test_loc)):
    # Cache miss: build the frames from the raw files and time the whole
    # pre-processing pipeline. The update_* helpers are called for their
    # effect on the frames; their return values are not used.
    start = time.time()

    df_train, df_test = get_dataframe(args.train, args.test)
    update_text(df_train, df_test)
    update_ngrams(df_train, df_test, feature_number=1000)
    update_lexicon(df_train, df_test, args.lexicon_path)
    upadate_linguistic(df_train, df_test)
    update_user(df_train, df_test, args.user_data)

    end = time.time()
    print("Data Preprocessiong Cost:", round(end - start),'s.')

    # Persist both frames so the next run can skip the pipeline.
    df_train.to_pickle(df_train_loc)
    df_test.to_pickle(df_test_loc)
else:
    # Cache hit. NOTE: unpickling can execute arbitrary code; only load
    # cache files produced by this notebook.
    df_train = pd.read_pickle(df_train_loc)
    df_test = pd.read_pickle(df_test_loc)

In [4]:
import itertools
def get_all_combinations(l : list, choose2=False) -> list:
    """Enumerate combinations of the items of ``l``, each returned as a list.

    Args:
        l: The items to combine. Order is preserved inside each combination.
        choose2: When False (default) every subset size from 0 to ``len(l)``
            is produced, smallest first, so the result starts with ``[]`` and
            ends with a copy of ``l``. When True only the 2-element
            combinations are produced.

    Returns:
        A list of lists, in the order ``itertools.combinations`` yields them.
        With ``choose2=True`` and fewer than two items the result is ``[]``.
    """
    if choose2:
        # Request the pairs directly rather than generating all 2**n subsets
        # and discarding everything that is not of size two.
        sizes = (2,)
    else:
        sizes = range(len(l) + 1)
    return [
        list(subset)
        for size in sizes
        for subset in itertools.combinations(l, size)
    ]

# Feature names per group; the experiment cells pass these to get_features as
# lex_list / ling_list / user_list.
lexicons_list = ["CL", "NVL"]
ling_feature_list = [
    'Length',
    'R2O',
    'Personal_pronouns',
    'Modals',
    'Links',
    'Questions',
]
user_feature_list = [
    'education',
    'ethnicity',
    'gender',
    'income',
    'joined',
    'party',
    'political_ideology',
    'relationship',
    'religious_ideology',
]
# Flat list of every feature name: lexicons first, then linguistic, then user.
all_feature_list = [*lexicons_list, *ling_feature_list, *user_feature_list]

# Find the best n-gram `feature_number`

In [None]:
# Sweep the n-gram `feature_number` and record the 5-fold cross-validated
# accuracy of an n-gram-only logistic regression for each value.
column_names = ["Max Feature","5F-CV Mean", "scores"]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
records = []
for fn in range(9600, 9700, 50):
    print(fn)
    # Drop the existing n-gram columns so update_ngrams can rebuild them with
    # the new size. NOTE: df_train / df_test are rebound here, so cells run
    # after this one see the n-gram features of the last `fn` tried.
    df_train = df_train.drop(columns=["Pro_ngram", "Con_ngram"])
    df_test = df_test.drop(columns=["Pro_ngram", "Con_ngram"])
    update_ngrams(df_train, df_test, feature_number=fn)

    x_train, x_test = get_features(df_train, df_test, model = "Ngram")
    y_train, y_test = get_lable(df_train, df_test)

    # Cross-validation is run over train and test stacked together.
    x = vstack([x_train,x_test])
    y = y_train + y_test
    clf = LogisticRegression(solver='liblinear')
    scores = cross_val_score(clf, x, y, cv=5 ,scoring='accuracy')
    mean_score = np.mean(scores)
    records.append({"Max Feature":fn, "5F-CV Mean":mean_score, "scores":scores})

df_record = pd.DataFrame(records, columns=column_names)

# to_csv does not create missing directories, so make sure log/ exists.
os.makedirs('log', exist_ok=True)
df_record.to_csv(os.path.join('log/Ngram_FN_4.csv'))

# Cross-validate all feature combinations

In [None]:
# Cross-validate every combination of: any subset of the lexicons, any pair of
# linguistic features, and any pair of user features.
column_names = ["Lex","Ling","User","5FCV Mean", "Scores"]

# Rows are collected in a list and converted once at the end; DataFrame.append
# was removed in pandas 2.0 and is quadratic when called in a loop.
records = []
for lex in get_all_combinations(lexicons_list):
    for ling in get_all_combinations(ling_feature_list, choose2=True):
        for user in get_all_combinations(user_feature_list, choose2=True):
            # Pass the combination under test. Previously the full lists were
            # passed here, so every row scored the same all-features model
            # while being labelled with a different combination.
            x_train, x_test = get_features(df_train, df_test, model = args.model,lex_list=lex, ling_list=ling, user_list=user)
            y_train, y_test = get_lable(df_train, df_test)

            # Cross-validation is run over train and test stacked together.
            x = vstack([x_train,x_test])
            y = y_train + y_test
            clf = LogisticRegression(solver='liblinear', max_iter=500)
            scores = cross_val_score(clf, x, y, cv=5 ,scoring='accuracy')
            mean_score = np.mean(scores)
            records.append({"Lex":lex,"Ling":ling,"User":user,"5FCV Mean":mean_score, "Scores":scores})

df_record = pd.DataFrame(records, columns=column_names)
df_record.to_csv(os.path.join('Traversal.csv'))

# Ablation

In [None]:
# Leave-one-feature-out ablation: for each feature name, score the model with
# that single feature removed from its group.
column_names = ["target_feature","mean_score", "scores"]

# Complete feature groups, declared once under cell-local names. The ablated
# lists are built from these on every iteration, so the notebook-level
# lexicons_list / ling_feature_list / user_feature_list are no longer rebound
# and left with a feature missing after the loop.
full_lexicons = ["CL", "NVL"]
full_ling_features = ['Length', 'R2O', 'Personal_pronouns', 'Modals', 'Links', 'Questions']
full_user_features = ['education','ethnicity', 'gender', 'income', 'joined', 'party', 'political_ideology', 'relationship', 'religious_ideology']

# Rows are collected in a list and converted once at the end; DataFrame.append
# was removed in pandas 2.0.
records = []
for feature in all_feature_list:
    # Each name appears in exactly one group, so only that group shrinks.
    kept_lexicons = [name for name in full_lexicons if name != feature]
    kept_ling = [name for name in full_ling_features if name != feature]
    kept_user = [name for name in full_user_features if name != feature]

    x_train, x_test = get_features(df_train, df_test, model = args.model,lex_list=kept_lexicons, ling_list=kept_ling, user_list=kept_user)
    y_train, y_test = get_lable(df_train, df_test)

    # Cross-validation is run over train and test stacked together.
    x = vstack([x_train,x_test])
    y = y_train + y_test
    clf = LogisticRegression(solver='liblinear', max_iter=500)
    scores = cross_val_score(clf, x, y, cv=5 ,scoring='accuracy')
    mean_score = np.mean(scores)
    records.append({"target_feature":feature, "mean_score":mean_score, "scores":scores})

df_record = pd.DataFrame(records, columns=column_names)
df_record.to_csv(os.path.join('Ablation.csv'))

# Ablation: Religion vs. other categories

In [5]:
# Leave-one-feature-out ablation, run separately on the rows whose category is
# 'Religion' and on all remaining rows.
is_religion_train = df_train['category'] == 'Religion'
is_religion_test = df_test['category'] == 'Religion'
df_train_r = df_train[is_religion_train]
df_train_nr = df_train[~is_religion_train]
df_test_r = df_test[is_religion_test]
df_test_nr = df_test[~is_religion_test]

column_names = ["Religion","Ablation Feature","5FCV Mean", "Scores", "Norm"]

# Value forwarded to get_features(norm=...) and logged in the "Norm" column.
# It was previously read without being defined anywhere in the notebook,
# which raises NameError on a fresh kernel.
norm = None

# Complete feature groups, declared once under cell-local names so the
# notebook-level lists are not rebound (and left incomplete) by the loop.
full_lexicons = ["CL", "NVL"]
full_ling_features = ['Length', 'R2O', 'Personal_pronouns', 'Modals', 'Links', 'Questions']
full_user_features = ['education','ethnicity', 'gender', 'income', 'joined', 'party', 'political_ideology', 'relationship', 'religious_ideology']

# (Religion flag, train subset, test subset), in the original order.
subsets = [
    (True, df_train_r, df_test_r),
    (False, df_train_nr, df_test_nr),
]

# Rows are collected in a list and converted once at the end; DataFrame.append
# was removed in pandas 2.0.
records = []
for feature in all_feature_list:
    # Each name appears in exactly one group, so only that group shrinks.
    kept_lexicons = [name for name in full_lexicons if name != feature]
    kept_ling = [name for name in full_ling_features if name != feature]
    kept_user = [name for name in full_user_features if name != feature]

    for religion, train_part, test_part in subsets:
        x_train, x_test = get_features(train_part, test_part, norm=norm, model = args.model,lex_list=kept_lexicons, ling_list=kept_ling, user_list=kept_user)
        y_train, y_test = get_lable(train_part, test_part)

        # Cross-validation is run over train and test stacked together.
        x = vstack([x_train,x_test])
        y = y_train + y_test
        clf = LogisticRegression(solver='liblinear', max_iter=500)
        scores = cross_val_score(clf, x, y, cv=5 ,scoring='accuracy')
        mean_score = np.mean(scores)
        records.append({"Religion":religion, "Ablation Feature":feature ,"5FCV Mean":mean_score, "Scores":scores, "Norm":norm})

df_record = pd.DataFrame(records, columns=column_names)
df_record.to_csv(os.path.join('Ablation_ReligionVsOther_Normalize.csv'))

Lexicon used: NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Lexicon used: NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Lexicon used: Connotation 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Lexicon used: Connotation 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Lexicon used: Connotation NRC-VAD 
Linguistic features: R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender inco