In [7]:
import argparse
import pandas as pd
import numpy as np
import time
import os
import matplotlib.pyplot as plt
from scipy.sparse.construct import vstack
from features import get_dataframe, update_text, update_ngrams, update_lexicon, upadate_linguistic, update_user, get_features, get_lable
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn.model_selection import cross_val_score

In [8]:
# Command-line style options, reused here as the notebook's configuration.
parser = argparse.ArgumentParser()
parser.add_argument('--train', dest='train', required=False, default='data/train.jsonl',
                    help='Full path to the training file')
parser.add_argument('--test', dest='test', required=False, default='data/val.jsonl',
                    help='Full path to the evaluation file')
parser.add_argument('--user_data', dest='user_data', required=False, default='data/users.json',
                    help='Full path to the user data file')
parser.add_argument('--model', dest='model', required=False, default='Ngram+Lex+Ling+User',
                    choices=["Ngram", "Ngram+Lex", "Ngram+Lex+Ling", "Ngram+Lex+Ling+User"],
                    help='The name of the model to train and evaluate.')
parser.add_argument('--lexicon_path', dest='lexicon_path', required=False, default='lexica/',
                    help='The full path to the directory containing the lexica.'
                            ' The last folder of this path should be "lexica".')
parser.add_argument('--outfile', dest='outfile', required=False, default='out.txt',
                    help='Full path to the file we will write the model predictions')

# Parse an explicit, empty argument list: every option takes its default and
# the kernel's own sys.argv is never consulted. parse_args expects a list of
# strings; the previous "" only worked because an empty string iterates to
# nothing.
args = parser.parse_args([])

In [9]:
# Directory holding the pickled DataFrames produced by the preprocessing below.
folderpath = 'pickle'
os.makedirs(folderpath, exist_ok=True)

df_train_loc = os.path.join(folderpath, 'df_train.pkl')
df_test_loc = os.path.join(folderpath, 'df_test.pkl')
df_user_loc = os.path.join(folderpath, 'df_user.pkl')

# Use the cache only when all three pickles are present. Checking just the
# train/test files would crash on read_pickle(df_user_loc) for a partial cache.
if all(os.path.isfile(path) for path in (df_train_loc, df_test_loc, df_user_loc)):
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook wrote itself.
    df_train = pd.read_pickle(df_train_loc)
    df_test = pd.read_pickle(df_test_loc)
    df_user = pd.read_pickle(df_user_loc)

else:

    start = time.time()

    df_train, df_test = get_dataframe(args.train, args.test)
    # The update_* helpers below are called for their side effects (their
    # return values are not used), so they presumably add feature columns to
    # the frames in place -- confirm against features.py.
    update_text(df_train, df_test)
    update_ngrams(df_train, df_test, feature_number=9800)
    update_lexicon(df_train, df_test, args.lexicon_path)
    upadate_linguistic(df_train, df_test)
    # update_user is the only step whose return value replaces the frames.
    df_train, df_test, df_user = update_user(df_train, df_test, args.user_data)

    end = time.time()
    print("Data Preprocessiong Cost:", round(end - start),'s.')

    # Persist the results so later runs can skip the preprocessing.
    df_train.to_pickle(df_train_loc)
    df_test.to_pickle(df_test_loc)
    df_user.to_pickle(df_user_loc)


  df_CL = pd.read_csv(CL_csv, sep="_|,", names=['word', 'part', 'sentiment'])


Data Preprocessiong Cost: 93 s.


In [4]:
import itertools
def get_all_combinations(l : list, choose2=False, up22=False) -> list:
    """Return combinations of the items in ``l`` as a list of lists.

    Subsets are produced in order of increasing size and, within one size,
    in the order ``itertools.combinations`` yields them.

    Args:
        l: Items to combine.
        choose2: If true (and ``up22`` is false), return only the subsets
            containing exactly two items.
        up22: If true, return only the subsets containing at most two items
            (the empty subset included). Takes precedence over ``choose2``.

    Returns:
        A list of lists. With both flags false this is every subset of
        ``l``, from the empty one up to ``l`` itself.
    """
    if up22:
        sizes = range(0, 3)
    elif choose2:
        sizes = range(2, 3)
    else:
        sizes = range(0, len(l) + 1)

    # Enumerate only the subset sizes that are kept instead of generating all
    # 2**n subsets and discarding most of them. combinations() yields nothing
    # when the requested size exceeds len(l), so short inputs need no special
    # case.
    return [list(subset)
            for size in sizes
            for subset in itertools.combinations(l, size)]

# Feature names per group; the cells below pass (subsets of) these to
# get_features as lex_list / ling_list / user_list.
lexicons_list = [
    "CL",
    "NVL",
]
ling_feature_list = [
    'Length',
    'R2O',
    'Personal_pronouns',
    'Modals',
    'Links',
    'Questions',
]
user_feature_list = [
    'education',
    'ethnicity',
    'gender',
    'income',
    'joined',
    'party',
    'political_ideology',
    'relationship',
    'religious_ideology',
]
# Every feature name in one flat list: lexica first, then linguistic, then user.
all_feature_list = [*lexicons_list, *ling_feature_list, *user_feature_list]

# Compare L2 normalization vs. none (5-fold CV) => no normalization is better

In [None]:
# For every model, compare features without normalisation (norm=None) against
# norm="l2", scoring each with 5-fold cross-validation over the train and test
# rows stacked together.
column_names = ["Model", "Norm","5F-CV Mean"]
records = []
for md in ["Ngram", "Ngram+Lex", "Ngram+Lex+Ling", "Ngram+Lex+Ling+User"]:
    for norm in [None, "l2"]:
        x_train, x_test = get_features(df_train, df_test, df_user, model=md, norm=norm)
        y_train, y_test = get_lable(df_train, df_test)
        x = vstack([x_train, x_test])
        # presumably get_lable returns plain lists, so "+" concatenates the
        # labels in the same order as the stacked rows -- confirm in features.py
        y = y_train + y_test
        clf = LogisticRegression(solver='liblinear', max_iter=500)
        scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
        mean_score = np.mean(scores)
        records.append({"Model": md, "Norm": norm, "5F-CV Mean": mean_score})

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
df_record = pd.DataFrame(records, columns=column_names)

# Make sure the output directory exists before writing.
os.makedirs('log', exist_ok=True)
df_record.to_csv(os.path.join('log', 'L2.csv'))

# Performance test

In [20]:
# Fit on the training split and report accuracy on both the held-out split
# and the training split itself.
models_to_score = ["Ngram+Lex+Ling+User"]
for md in models_to_score:
    rule = '===================='
    print(rule, md, rule)

    x_train, x_test = get_features(df_train, df_test, df_user, model=md)
    y_train, y_test = get_lable(df_train, df_test)
    n_features = x_train.shape[1]
    print("Total Featurs:", n_features)

    clf = LogisticRegression(solver='liblinear', max_iter=500)
    clf.fit(x_train, y_train)

    y_predicted = clf.predict(x_test)
    test_accuracy = accuracy_score(y_test, y_predicted)
    print("Accuracy score: ", test_accuracy)

    train_accuracy = accuracy_score(y_train, clf.predict(x_train))
    print("Accuracy score on Train: ", train_accuracy)

    # Optional diagnostics, left disabled:
    # plot_confusion_matrix(clf, x_test, y_test)
    # print(classification_report(y_test, y_predicted, target_names=['Con','Pro']))

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Total Featurs: 20060
Accuracy score:  0.7994987468671679
Accuracy score on Train:  0.8222361809045227


In [22]:
# Split both frames into debates whose category is 'Religion' and all others.
df_train_r = df_train[df_train['category']=='Religion']
df_train_nr = df_train[df_train['category']!='Religion']
df_test_r = df_test[df_test['category']=='Religion']
df_test_nr = df_test[df_test['category']!='Religion']

# 5-fold cross-validate each model separately on the Religion subset and on
# the remaining debates.
for md in ["Ngram+Lex+Ling+User"]:
    for religion in [True, False]:

        # Choose the subset once; everything after the banner is shared.
        if religion:
            print("====================","R",md, "====================")
            train_part, test_part = df_train_r, df_test_r
        else:
            print("===================", "NR",md, "====================")
            train_part, test_part = df_train_nr, df_test_nr

        x_train, x_test = get_features(train_part, test_part, df_user, model=md)
        y_train, y_test = get_lable(train_part, test_part)

        # Cross-validation runs over the train and test rows of the subset combined.
        x = vstack([x_train, x_test])
        y = y_train + y_test
        # Label the scores with the model actually evaluated in this
        # iteration, not args.model, which is unrelated to the loop variable.
        print(md)
        clf = LogisticRegression(solver='liblinear', max_iter=500)
        start = time.time()
        scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
        end = time.time()
        print("Training Model Cost:", round(end - start),'s.')
        print(scores)
        print(np.mean(scores))


Lexicon used: Connotation NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Ngram+Lex+Ling+User
Training Model Cost: 6 s.
[0.70967742 0.75268817 0.59139785 0.61956522 0.7173913 ]
0.6781439925198691
Lexicon used: Connotation NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Ngram+Lex+Ling+User
Training Model Cost: 21 s.
[0.77124183 0.77124183 0.74836601 0.75409836 0.8       ]
0.7689896067716704


In [19]:

# 5-fold cross-validate each listed model over the train and test rows combined.
for md in ["Ngram+Lex+Ling+User"]:
    print('====================', md,'====================')
    x_train, x_test = get_features(df_train, df_test, df_user, model=md)
    y_train, y_test = get_lable(df_train, df_test)
    x = vstack([x_train, x_test])
    y = y_train + y_test
    # Label the scores with the model actually evaluated in this iteration,
    # not args.model, which is unrelated to the loop variable.
    print(md)
    clf = LogisticRegression(solver='liblinear', max_iter=500)
    start = time.time()
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    end = time.time()
    print("Training Model Cost:", round(end - start),'s.')
    print(scores)
    print(np.mean(scores))

Lexicon used: Connotation NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Ngram+Lex+Ling+User
Training Model Cost: 29 s.
[0.7443609  0.77386935 0.75125628 0.75879397 0.79899497]
0.7654550950239921


# Find Best feature_number

In [None]:
# Sweep the n-gram feature_number and record the 5-fold CV accuracy of the
# "Ngram" model for each value.
column_names = ["Max Feature","5F-CV Mean", "scores"]
records = []
for fn in range(9600, 9700, 50):
    print(fn)
    # Work on per-iteration frames: drop() returns new objects, so the
    # notebook-level df_train/df_test keep the n-gram columns they were built
    # with and later cells are unaffected by this sweep.
    train_fn = df_train.drop(columns=["Pro_ngram", "Con_ngram"])
    test_fn = df_test.drop(columns=["Pro_ngram", "Con_ngram"])
    # update_ngrams is called for its side effect, as in the preprocessing
    # cell; presumably it re-adds the two n-gram columns in place -- confirm.
    update_ngrams(train_fn, test_fn, feature_number=fn)
    # df_user is passed positionally like every other get_features call in
    # this notebook -- confirm against the signature in features.py.
    x_train, x_test = get_features(train_fn, test_fn, df_user, model="Ngram")
    y_train, y_test = get_lable(train_fn, test_fn)
    x = vstack([x_train, x_test])
    y = y_train + y_test
    clf = LogisticRegression(solver='liblinear')
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    mean_score = np.mean(scores)
    records.append({"Max Feature": fn, "5F-CV Mean": mean_score, "scores": scores})

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
df_record = pd.DataFrame(records, columns=column_names)

# Make sure the output directory exists before writing.
os.makedirs('log', exist_ok=True)
df_record.to_csv(os.path.join('log', 'Ngram_FN_4.csv'))

# Cross-validate all combinations

In [None]:
# Cross-validate args.model for every subset of the lexica combined with every
# pair of linguistic features and every pair of user features.
column_names = ["Lex","Ling","User","5FCV Mean", "Scores"]
records = []
for lex in get_all_combinations(lexicons_list):
    for ling in get_all_combinations(ling_feature_list, choose2=True):
        for user in get_all_combinations(user_feature_list, choose2=True):
            # df_user is passed positionally like every other get_features
            # call in this notebook -- confirm against features.py.
            x_train, x_test = get_features(df_train, df_test, df_user, model=args.model,
                                           lex_list=lex, ling_list=ling, user_list=user)
            y_train, y_test = get_lable(df_train, df_test)
            x = vstack([x_train, x_test])
            y = y_train + y_test
            clf = LogisticRegression(solver='liblinear', max_iter=500)
            scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
            mean_score = np.mean(scores)
            records.append({"Lex": lex, "Ling": ling, "User": user,
                            "5FCV Mean": mean_score, "Scores": scores})

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
df_record = pd.DataFrame(records, columns=column_names)
df_record.to_csv('Traversal.csv')

# 3.1 Find Best Ling

In [10]:
# Feature groups searched in this section.
lexicons_list = ["CL", "NVL"]
ling_feature_list = ['Length', 'R2O', 'Personal_pronouns', 'Modals', 'Links', 'Questions']
user_feature_list = ['education','ethnicity', 'gender', 'income', 'joined', 'party', 'political_ideology', 'relationship', 'religious_ideology']
all_feature_list = list(lexicons_list+ling_feature_list+user_feature_list)


# Cross-validate "Ngram+Lex+Ling" for every lexicon subset combined with every
# set of at most two linguistic features (user features disabled).
column_names = ["Lex","Ling","5FCV Mean"]
df_record = pd.DataFrame(columns=column_names)
records = []

# Make sure the output directory exists before the first checkpoint is written.
os.makedirs('log', exist_ok=True)

for lex in get_all_combinations(lexicons_list):
    for ling in get_all_combinations(ling_feature_list, up22=True):
        x_train, x_test = get_features(df_train, df_test, df_user, model="Ngram+Lex+Ling",
                                       lex_list=lex, ling_list=ling, user_list=[])
        y_train, y_test = get_lable(df_train, df_test)
        x = vstack([x_train, x_test])
        y = y_train + y_test
        clf = LogisticRegression(solver='liblinear', max_iter=500)
        scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
        mean_score = np.mean(scores)
        records.append({"Lex": lex, "Ling": ling, "5FCV Mean": mean_score})

        # Rewrite the CSV after every configuration so partial results
        # survive an interrupted run. The frame is rebuilt from the row list
        # because DataFrame.append was removed in pandas 2.0.
        df_record = pd.DataFrame(records, columns=column_names)
        df_record.to_csv(os.path.join('log', '3_1.csv'))

Lexicon used: 
Linguistic features: 
Lexicon used: 
Linguistic features: Length 
Lexicon used: 
Linguistic features: R2O 
Lexicon used: 
Linguistic features: Personal_pronouns 
Lexicon used: 
Linguistic features: Modals 
Lexicon used: 
Linguistic features: Links 
Lexicon used: 
Linguistic features: Questions 
Lexicon used: 
Linguistic features: Length R2O 
Lexicon used: 
Linguistic features: Length Personal_pronouns 
Lexicon used: 
Linguistic features: Length Modals 
Lexicon used: 
Linguistic features: Length Links 
Lexicon used: 
Linguistic features: Length Questions 
Lexicon used: 
Linguistic features: R2O Personal_pronouns 
Lexicon used: 
Linguistic features: R2O Modals 
Lexicon used: 
Linguistic features: R2O Links 
Lexicon used: 
Linguistic features: R2O Questions 
Lexicon used: 
Linguistic features: Personal_pronouns Modals 
Lexicon used: 
Linguistic features: Personal_pronouns Links 
Lexicon used: 
Linguistic features: Personal_pronouns Questions 
Lexicon used: 
Linguistic featu

# 3.1 Examples ['Length', 'Links']

In [34]:
# Model with the linguistic features 'Length' and 'Links' on top of n-grams + NRC-VAD.
x_train, x_test = get_features(
    df_train, df_test, df_user,
    model="Ngram+Lex+Ling", lex_list=["NVL"], ling_list=['Length', 'Links'], user_list=[],
)
y_train, y_test = get_lable(df_train, df_test)
clf = LogisticRegression(solver='liblinear', max_iter=500)
clf.fit(x_train, y_train)
P_with_ling = clf.predict(x_test)

# Baseline: the same model without any linguistic features.
x_train, x_test = get_features(
    df_train, df_test, df_user,
    model="Ngram+Lex", lex_list=["NVL"], ling_list=[], user_list=[],
)
clf = LogisticRegression(solver='liblinear', max_iter=500)
clf.fit(x_train, y_train)
P = clf.predict(x_test)

# Totals over the debates that only the linguistic model predicts correctly.
winner_length = loser_length = 0
winner_links = loser_links = 0

for i, truth in enumerate(y_test):
    # Skip debates the linguistic model gets wrong, or where both models agree.
    if P_with_ling[i] != truth or P_with_ling[i] == P[i]:
        continue

    print("Debate:", i)
    report = df_test.loc[i,["winner", "Pro_Links","Con_Links","Pro_Length", "Con_Length"]].to_dict()
    print(report)

    # A truthy label means Pro is credited as the winner; otherwise Con.
    if truth:
        win_len, win_lnk, lose_len, lose_lnk = "Pro_Length", "Pro_Links", "Con_Length", "Con_Links"
    else:
        win_len, win_lnk, lose_len, lose_lnk = "Con_Length", "Con_Links", "Pro_Length", "Pro_Links"

    winner_length += report[win_len]
    winner_links += report[win_lnk]
    loser_length += report[lose_len]
    loser_links += report[lose_lnk]

print("winner_length:", winner_length)
print("loser_length", loser_length)
print("winner_links", winner_links)
print("loser_links", loser_links)



Lexicon used: NRC-VAD 
Linguistic features: Length Links 
Lexicon used: NRC-VAD 
Debate: 76
{'winner': 'Con', 'Pro_Links': 0, 'Con_Links': 4, 'Pro_Length': 9300, 'Con_Length': 7957}
Debate: 104
{'winner': 'Con', 'Pro_Links': 0, 'Con_Links': 0, 'Pro_Length': 2035, 'Con_Length': 1448}
Debate: 116
{'winner': 'Pro', 'Pro_Links': 4, 'Con_Links': 0, 'Pro_Length': 7406, 'Con_Length': 6700}
Debate: 132
{'winner': 'Con', 'Pro_Links': 0, 'Con_Links': 0, 'Pro_Length': 1694, 'Con_Length': 2302}
Debate: 149
{'winner': 'Con', 'Pro_Links': 0, 'Con_Links': 0, 'Pro_Length': 1455, 'Con_Length': 1622}
Debate: 166
{'winner': 'Pro', 'Pro_Links': 1, 'Con_Links': 0, 'Pro_Length': 7117, 'Con_Length': 5559}
Debate: 171
{'winner': 'Con', 'Pro_Links': 4, 'Con_Links': 9, 'Pro_Length': 14002, 'Con_Length': 20314}
Debate: 181
{'winner': 'Con', 'Pro_Links': 6, 'Con_Links': 5, 'Pro_Length': 9353, 'Con_Length': 10972}
Debate: 189
{'winner': 'Con', 'Pro_Links': 6, 'Con_Links': 2, 'Pro_Length': 3764, 'Con_Length': 3281}

# 4_2

In [39]:
# Model using the "NVL" lexicon. The variable keeps the name P_with_ling from
# the previous section, although no linguistic features are involved here.
x_train, x_test = get_features(df_train, df_test, df_user, model = "Ngram+Lex",lex_list=["NVL"], ling_list=[], user_list=[])
y_train, y_test = get_lable(df_train, df_test)
clf = LogisticRegression(solver='liblinear', max_iter=500)
clf.fit(x_train, y_train)
P_with_ling = clf.predict(x_test)

# Comparison model using the "CL" lexicon instead.
x_train, x_test = get_features(df_train, df_test, df_user, model = "Ngram+Lex",lex_list=["CL"], ling_list=[], user_list=[])
clf = LogisticRegression(solver='liblinear', max_iter=500)
clf.fit(x_train, y_train)
P = clf.predict(x_test)

# Placeholders for a per-winner aggregation that has not been written yet.
winner_positive = []
winner_neutral = []

# Show the lexicon columns for every debate the NVL model predicts correctly
# and the CL model does not.
for i in range(len(y_test)):
    if P_with_ling[i] == y_test[i] and P_with_ling[i] != P[i]:
        print("Debate:", i)
        report = df_test.loc[i,["winner", 'Pro_positive', 'Con_positive', 'Pro_neutral',
       'Con_neutral', 'Pro_negative', 'Con_negative', 'Pro_a-score',
       'Pro_d-score', 'Pro_v-score', 'Con_a-score', 'Con_d-score', 'Con_v-score']]
        print(report)
        # TODO: aggregate into winner_positive / winner_neutral, branching on
        # y_test[i]. A bodiless if/else is a syntax error, so none is left here.



# print("winner_length:", winner_length)
# print("loser_length", loser_length)
# print("winner_links", winner_links)
# print("loser_links", loser_links)

Lexicon used: NRC-VAD 
Lexicon used: Connotation 
Debate: 10
winner              Pro
Pro_positive        253
Con_positive        274
Pro_neutral         321
Con_neutral         358
Pro_negative        221
Con_negative        261
Pro_a-score     964.553
Pro_d-score     874.713
Pro_v-score     1036.13
Con_a-score     940.858
Con_d-score     774.323
Con_v-score     967.065
Name: 10, dtype: object
Debate: 14
winner              Con
Pro_positive        231
Con_positive        202
Pro_neutral         299
Con_neutral         271
Pro_negative        225
Con_negative        178
Pro_a-score     406.614
Pro_d-score     353.004
Pro_v-score     423.571
Con_a-score     368.068
Con_d-score     324.231
Con_v-score     403.071
Name: 14, dtype: object
Debate: 20
winner              Pro
Pro_positive        252
Con_positive        256
Pro_neutral         308
Con_neutral         313
Pro_negative        230
Con_negative        229
Pro_a-score     573.923
Pro_d-score     446.834
Pro_v-score     555.914
Con_a

In [48]:
# Model with the user features 'education' and 'party'. The variable keeps the
# name P_with_ling from the earlier sections.
x_train, x_test = get_features(
    df_train, df_test, df_user,
    model="Ngram+Lex+Ling+User", lex_list=["NVL"],
    ling_list=['Links', 'Questions'], user_list=['education', 'party'],
)
y_train, y_test = get_lable(df_train, df_test)
clf = LogisticRegression(solver='liblinear', max_iter=500)
clf.fit(x_train, y_train)
P_with_ling = clf.predict(x_test)

# Baseline: same lexicon and linguistic features, no user features.
x_train, x_test = get_features(
    df_train, df_test, df_user,
    model="Ngram+Lex+Ling", lex_list=["NVL"],
    ling_list=['Links', 'Questions'], user_list=[],
)
clf = LogisticRegression(solver='liblinear', max_iter=500)
clf.fit(x_train, y_train)
P = clf.predict(x_test)

baseline_accuracy = accuracy_score(y_test, P)
print("Accuracy score",baseline_accuracy)
user_accuracy = accuracy_score(y_test, P_with_ling)
print("Accuracy score with user",user_accuracy)


# Show the user columns for every debate that only the user-feature model
# predicts correctly.
for i, truth in enumerate(y_test):
    if P_with_ling[i] != truth or P_with_ling[i] == P[i]:
        continue
    print("Debate:", i)
    report = df_test.loc[i,["winner", 'Pro_education', 'Con_education', 'Pro_party','Con_party']]
    print(report)


Lexicon used: NRC-VAD 
Linguistic features: Links Questions 
User features: education party 
Lexicon used: NRC-VAD 
Linguistic features: Links Questions 
Accuracy score 0.7593984962406015
Accuracy score with user 0.7919799498746867
Debate: 4
winner                       Pro
Pro_education    Graduate Degree
Con_education    Graduate Degree
Pro_party              Undecided
Con_party            Independent
Name: 4, dtype: object
Debate: 15
winner                    Pro
Pro_education    Some College
Con_education      Not Saying
Pro_party               Other
Con_party          Not Saying
Name: 15, dtype: object
Debate: 76
winner                          Con
Pro_education            Not Saying
Con_education           High School
Pro_party                Not Saying
Con_party        Independence Party
Name: 76, dtype: object
Debate: 79
winner                       Pro
Pro_education    Graduate Degree
Con_education         Not Saying
Pro_party            Independent
Con_party             Not S

In [25]:
# Display the first selected column ("winner") of the row at position 76.
debate_76 = df_test[["winner", "Pro_Links","Con_Links","Pro_Length", "Con_Length"]].iloc[76, :]
debate_76.values[0]

'Con'

# Ablation

In [None]:
# Ablation study: remove one feature at a time and 5-fold cross-validate
# args.model on the remaining features.
column_names = ["Ablation Feature","5FCV mean"]

# Complete feature groups, kept under their own names so the loop never
# modifies the notebook-level lexicons_list / ling_feature_list /
# user_feature_list that other cells read.
full_lexicons = ["CL", "NVL"]
full_ling = ['Length', 'R2O', 'Personal_pronouns', 'Modals', 'Links', 'Questions']
# 'political_ideology' was previously misspelled 'politi|cal_ideology' here,
# so it was never ablated and an unknown name was handed to get_features.
full_user = ['education', 'ethnicity','gender','income','joined','party','political_ideology','relationship','religious_ideology']

records = []
for feature in all_feature_list:

    # Each name appears in exactly one group, so filtering all three groups
    # removes just the ablated feature.
    lex_kept = [name for name in full_lexicons if name != feature]
    ling_kept = [name for name in full_ling if name != feature]
    user_kept = [name for name in full_user if name != feature]

    x_train, x_test = get_features(df_train, df_test, df_user, model=args.model,
                                   lex_list=lex_kept, ling_list=ling_kept, user_list=user_kept)
    y_train, y_test = get_lable(df_train, df_test)

    x = vstack([x_train, x_test])
    y = y_train + y_test
    clf = LogisticRegression(solver='liblinear', max_iter=500)
    scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
    mean_score = np.mean(scores)
    records.append({"Ablation Feature": feature, "5FCV mean": mean_score})

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
df_record = pd.DataFrame(records, columns=column_names)

# Make sure the output directory exists before writing.
os.makedirs('log', exist_ok=True)
df_record.to_csv(os.path.join('log', 'Ablation.csv'))

# Ablation: Religion vs. Other

In [50]:
# Split both frames into debates whose category is 'Religion' and all others.
df_train_r = df_train[df_train['category']=='Religion']
df_train_nr = df_train[df_train['category']!='Religion']
df_test_r = df_test[df_test['category']=='Religion']
df_test_nr = df_test[df_test['category']!='Religion']

column_names = ["Religion","Ablation Feature","5FCV Mean"]

# Complete feature groups, kept under their own names so the loop never
# modifies the notebook-level lexicons_list / ling_feature_list /
# user_feature_list that other cells read.
full_ling = ['Length', 'R2O', 'Personal_pronouns', 'Modals', 'Links', 'Questions']
full_user = ['education','ethnicity', 'gender', 'income', 'joined', 'party', 'political_ideology', 'relationship', 'religious_ideology']
full_lexicons = ["CL", "NVL"]

# (Religion flag, train frame, test frame); the Religion subset is evaluated first.
subsets = [
    (True, df_train_r, df_test_r),
    (False, df_train_nr, df_test_nr),
]

records = []
for feature in all_feature_list:

    # Each name appears in exactly one group, so filtering all three groups
    # removes just the ablated feature.
    ling_kept = [name for name in full_ling if name != feature]
    user_kept = [name for name in full_user if name != feature]
    lex_kept = [name for name in full_lexicons if name != feature]

    for religion, train_part, test_part in subsets:
        x_train, x_test = get_features(train_part, test_part, df_user, norm=None, model=args.model,
                                       lex_list=lex_kept, ling_list=ling_kept, user_list=user_kept)
        y_train, y_test = get_lable(train_part, test_part)

        # Cross-validation runs over the train and test rows of the subset combined.
        x = vstack([x_train, x_test])
        y = y_train + y_test
        clf = LogisticRegression(solver='liblinear', max_iter=500)
        scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
        mean_score = np.mean(scores)
        records.append({"Religion": religion, "Ablation Feature": feature, "5FCV Mean": mean_score})

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
df_record = pd.DataFrame(records, columns=column_names)

# Make sure the output directory exists before writing.
os.makedirs('log', exist_ok=True)
df_record.to_csv(os.path.join('log', 'Ablation_ReligionVsOther.csv'))

Lexicon used: NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Lexicon used: NRC-VAD 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Lexicon used: Connotation 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Lexicon used: Connotation 
Linguistic features: Length R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender income joined party political_ideology relationship religious_ideology 
Lexicon used: Connotation NRC-VAD 
Linguistic features: R2O Personal_pronouns Modals Links Questions 
User features: education ethnicity gender inco