In [7]:
import argparse
import pandas as pd
import numpy as np
import time
import os
import matplotlib.pyplot as plt
from scipy.sparse.construct import vstack
from features import get_dataframe, update_text, update_ngrams, update_lexicon, upadate_linguistic, update_user, get_features, get_lable
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
# Notebook configuration expressed as an argparse namespace so the same option
# names and defaults can be shared with a command-line script.
# Each destination is derived by argparse from the long flag name, and every
# option is optional with the default given here.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--train',
    default='data/train.jsonl',
    help='Full path to the training file',
)
parser.add_argument(
    '--test',
    default='data/val.jsonl',
    help='Full path to the evaluation file',
)
parser.add_argument(
    '--user_data',
    default='data/users.json',
    help='Full path to the user data file',
)
parser.add_argument(
    '--model',
    default='Ngram+Lex+Ling+User',
    choices=["Ngram", "Ngram+Lex", "Ngram+Lex+Ling", "Ngram+Lex+Ling+User"],
    help='The name of the model to train and evaluate.',
)
parser.add_argument(
    '--lexicon_path',
    default='lexica/',
    help=('The full path to the directory containing the lexica.'
          ' The last folder of this path should be "lexica".'),
)
parser.add_argument(
    '--outfile',
    default='out.txt',
    help='Full path to the file we will write the model predictions',
)

# Parse an empty argument list: inside a notebook sys.argv belongs to the
# kernel, so only the defaults declared above are wanted.
args = parser.parse_args([])

In [None]:
# Pickle cache for the feature-augmented train/test frames, stored in the
# current working directory.
# NOTE(review): the cache is keyed only by file name, so changing args (or the
# n-gram feature_number) does not invalidate it; delete the .pkl files to force
# a rebuild.
df_train_loc = 'df_train.pkl'
df_test_loc = 'df_test.pkl'

if not all(os.path.isfile(cache_file) for cache_file in (df_train_loc, df_test_loc)):
    # At least one cache file is missing: rebuild both frames from the raw data.
    start = time.time()

    df_train, df_test = get_dataframe(args.train, args.test)
    # The update_* return values are unused, so these helpers presumably add
    # their feature columns to the frames in place -- TODO confirm in features.py.
    update_text(df_train, df_test)
    update_ngrams(df_train, df_test, feature_number=25000)
    update_lexicon(df_train, df_test, args.lexicon_path)
    upadate_linguistic(df_train, df_test)
    update_user(df_train, df_test, args.user_data)

    end = time.time()
    print("Data Preprocessiong Cost:", round(end - start),'s.')

    # Persist the result so later runs can skip preprocessing.
    df_train.to_pickle(df_train_loc)
    df_test.to_pickle(df_test_loc)
else:
    # Both cache files exist: load them instead of recomputing.
    # Only unpickle files this notebook wrote itself.
    df_train = pd.read_pickle(df_train_loc)
    df_test = pd.read_pickle(df_test_loc)

In [None]:
# Split each frame into rows whose 'category' is exactly 'Religion' (_r) and
# all remaining rows (_nr).
df_train_r = df_train.loc[df_train['category'].eq('Religion')]
df_train_nr = df_train.loc[df_train['category'].ne('Religion')]
df_test_r = df_test.loc[df_test['category'].eq('Religion')]
df_test_nr = df_test.loc[df_test['category'].ne('Religion')]

# Row counts over train and test combined: Religion first, then everything else.
print(
    len(df_train_r) + len(df_test_r),
    len(df_train_nr) + len(df_test_nr),
)

463 1528


In [None]:
import itertools
def get_all_combinations(l : list, choose2=False) -> list:
    """Return subsets of ``l`` as a list of lists.

    Args:
        l: The items to combine. Any iterable is accepted; it is materialised
            once so its length can be taken.
        choose2: When true, return only the 2-element combinations. Otherwise
            return every subset, from the empty one up to the full set.

    Returns:
        A list of lists. Subsets are ordered by increasing size and, within a
        size, in the order produced by ``itertools.combinations`` (input
        order). With ``choose2=False`` the first entry is always ``[]``.
    """
    items = list(l)

    if choose2:
        # Request the pairs directly rather than enumerating all 2**n subsets
        # and discarding every one whose length is not 2.
        return [list(pair) for pair in itertools.combinations(items, 2)]

    return [
        list(subset)
        for size in range(len(items) + 1)
        for subset in itertools.combinations(items, size)
    ]

# Names of the individual feature groups that can be switched on or off.
ling_feature_list = [
    'Length',
    'R2O',
    'Personal_pronouns',
    'Modals',
    'Links',
    'Questions',
]
user_feature_list = [
    'education',
    'ethnicity',
    'gender',
    'income',
    'joined',
    'party',
    'political_ideology',
    'relationship',
    'religious_ideology',
]
lexicons_list = ["CL", "NVL"]

# Every feature in one flat list: lexicons first, then linguistic, then user.
all_feature_list = [*lexicons_list, *ling_feature_list, *user_feature_list]

In [6]:
# Leave-one-feature-out ablation: for each feature in all_feature_list, rebuild
# the feature matrix without that single feature and record the 5-fold
# cross-validated accuracy of a logistic regression.
column_names = ["target_feature","mean_score", "scores"]
records = []  # one dict per ablated feature; converted to a DataFrame once, after the loop

for feature in all_feature_list:

    # Build the reduced lists as fresh loop-local objects. Rebinding or mutating
    # the notebook-level lists here would leave them missing the last ablated
    # feature for every cell that runs afterwards.
    kept_lexicons = [name for name in lexicons_list if name != feature]
    kept_ling = [name for name in ling_feature_list if name != feature]
    kept_user = [name for name in user_feature_list if name != feature]

    x_train, x_test = get_features(df_train, df_test, model = args.model,
                                   lex_list=kept_lexicons, ling_list=kept_ling, user_list=kept_user)
    y_train, y_test = get_lable(df_train, df_test)

    # Pool train and evaluation data; cross-validation does its own splitting.
    x = vstack([x_train,x_test])
    # presumably y_train / y_test are Python lists, so `+` concatenates them;
    # verify against features.get_lable
    y = y_train + y_test
    clf = LogisticRegression(solver='liblinear')
    scores = cross_val_score(clf, x, y, cv=5 ,scoring='accuracy')
    mean_score = np.mean(scores)

    records.append({"target_feature":feature, "mean_score":mean_score, "scores":scores})

# Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every iteration.
df_record = pd.DataFrame(records, columns=column_names)

ablation_path = os.path.join('Ablation.csv')
df_record.to_csv(ablation_path)
# Report the file that was actually written.
print(f"Wrote record to {ablation_path}.")

Lexicon used: NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 




Lexicon used: Connotation 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Lexicon used: Connotation NRC-VAD 
Linguistic features: R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Lexicon used: Connotation NRC-VAD 
Linguistic features: Length Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 




Lexicon used: Connotation NRC-VAD 
Linguistic features: Length R2O Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 




Lexicon used: Connotation NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 




Lexicon used: Connotation NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Modals Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 




Lexicon used: Connotation NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Modals Links 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 


