In [1]:
import ast
import collections
import itertools
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
import sklearn.metrics
import sklearn.neighbors
from better_profanity import profanity
from numpy.lib.function_base import vectorize
from pandas.api.types import CategoricalDtype
from scipy import sparse
from scipy.spatial import distance
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer

In [133]:
# Locations of the debate splits (JSON Lines) and of the user profiles (JSON).
TRAINING_DATA = './resources/data/train.jsonl'
VAL_DATA = './resources/data/val.jsonl'
USER_DATA = './resources/data/users.json'

# Debate splits: one JSON record per line.
df_train = pd.read_json(TRAINING_DATA, lines=True)
df_val = pd.read_json(VAL_DATA, lines=True)

# User profiles: the top-level JSON keys become the row index.
df_user = pd.read_json(USER_DATA, orient="index")

In [135]:
# Class balance of the validation split: 'Religion' debates vs. every other category.
is_religion = df_val.category == 'Religion'
print(is_religion.sum())
print((~is_religion).sum())
print(df_val.shape)

93
306
(399, 9)


In [87]:
def get_text_by_side(df):
    '''
    Concatenate, for every debate, everything each side said.

    Each entry of the ``rounds`` column is walked as a sequence of rounds,
    and each round as a sequence of speech dicts carrying ``'side'`` and
    ``'text'`` keys. Texts are joined with no separator, in the order they
    appear.

    Returns a numpy array with one row per debate:
        row[0] -- all 'Pro' text of that debate
        row[1] -- all 'Con' text of that debate
    A side with no speeches yields an empty string; speeches whose side is
    neither 'Pro' nor 'Con' are collected but not returned.
    '''
    per_debate = []
    for debate_rounds in df.loc[:, 'rounds']:
        text_by_side = collections.defaultdict(list)

        # Flatten rounds -> speeches while keeping the original order.
        speeches = (speech for one_round in debate_rounds for speech in one_round)
        for speech in speeches:
            text_by_side[speech['side']].append(speech['text'])

        pro_text = "".join(text_by_side['Pro'])
        con_text = "".join(text_by_side['Con'])
        per_debate.append([pro_text, con_text])

    return np.array(per_debate)
    
# Per-debate occurrences of the substring 'stupid': first entry counts the
# Pro text, second entry counts the Con text.
# The variable name is left unchanged because a later cell reads it.
countrape = [
    [doc[0].count('stupid'), doc[1].count('stupid')]
    for doc in get_text_by_side(df_train)
]
    

In [95]:
# Put the per-side 'stupid' counts (column 0 = Pro text, column 1 = Con text)
# next to the training debates and keep the ten rows with the highest counts,
# ordered by the Pro count first and the Con count second.
df = pd.concat([pd.DataFrame(countrape), df_train], axis=1).sort_values(by=[0, 1], ascending=False).head(10)
df

Unnamed: 0,0,1,id,category,title,rounds,date,pro_debater,con_debater,voters,winner
603,19,24,The-debate.org-site-rules-should-be-more-stric...,Miscellaneous,The debate.org site rules should be more stric...,"[[{'side': 'Pro', 'text': '  The debate.or...",2014-01-20,RoyLatham,imabench,"[Juris_Naturalis, PeriodicPatriot, bsh1, Garre...",Pro
980,5,4,Legend-of-the-Seeker-stays-true-to-The-Sword-o...,Entertainment,Legend of the Seeker stays true to The Sword o...,"[[{'side': 'Pro', 'text': '  The Legend of ...",2009-10-29,Seekeroftruth469,Ragnar_Rahl,"[aoibhinn, Marauder, Xer, wonderwoman, Seekero...",Con
331,4,1,The-10-commandments-are-a-JOKE/1/,Religion,The 10 commandments are a JOKE!,"[[{'side': 'Pro', 'text': '  Thankfully th...",2017-08-20,backwardseden,BrianCBiggs,"[Khons, DNehlsen]",Con
340,4,1,god-is-not-one-therefore-god-is-a-fraud/1/,Religion,"god is not ""one"" therefore god is a fraud","[[{'side': 'Pro', 'text': '  Genesis 1:26 ...",2017-09-30,backwardseden,wmickas,"[DNehlsen, dsjpk5]",Con
1076,4,0,Tell-me-your-thoughts-on-Abortion./1/,Society,Tell me your thoughts on Abortion.,"[[{'side': 'Con', 'text': '  I feel that ab...",2007-12-18,Kasrahalteth,tjzimmer,"[Mharman, indianajones644, adamh, griffinisrig...",Con
1118,4,0,Abortion/53/,Society,Abortion,"[[{'side': 'Con', 'text': '  I, con, will a...",2010-01-29,tkubok,TysonMarshall,"[wiseovvl, simplymara, Rasliel, cmahdavi, Deaf...",Pro
982,3,1,Term-Lengths-and-Limitations/1/,Politics,Term Lengths and Limitations,"[[{'side': 'Pro', 'text': '  I thank my opp...",2009-01-18,crackofdawn_Jr,JBlake,"[crackofdawn_Jr, JBlake, RoyLatham]",Con
1207,3,1,macroevolution-of-humans-has-stopped-for-the-f...,Science,macroevolution of humans has stopped for the f...,"[[{'side': 'Pro', 'text': '  macroevolution...",2014-07-29,linate,FMAlchemist,"[gt4o2007, SamStevens, Sagey, birdlandmemories]",Con
1223,3,1,Racists-are-ignorant-and-are-purely-stupid./1/,Society,Racist's are ignorant and are purely stupid.,"[[{'side': 'Pro', 'text': '  Racists have n...",2010-03-27,wells5674,Koopin,"[wonderwoman, twin, Marauder, belle, Koopin]",Con
984,2,6,Is-Google-Making-Us-Stupid/1/,Education,Is Google Making Us Stupid?,"[[{'side': 'Pro', 'text': '  Hi this was an...",2009-05-24,luxx,alto2osu,"[thisoneguy, alto2osu, philosphical, Maikuru, ...",Con


In [101]:
# Emit the table as LaTeX after dropping the listed metadata columns.
print(df.drop(columns=['voters', 'rounds', 'id', 'date', 'pro_debater', 'con_debater', 'category']).to_latex())

\begin{tabular}{lrrll}
\toprule
{} &   0 &   1 &                                              title & winner \\
\midrule
603  &  19 &  24 &  The debate.org site rules should be more stric... &    Pro \\
980  &   5 &   4 &  Legend of the Seeker stays true to The Sword o... &    Con \\
331  &   4 &   1 &                    The 10 commandments are a JOKE! &    Con \\
340  &   4 &   1 &          god is not "one" therefore god is a fraud &    Con \\
1076 &   4 &   0 &                 Tell me your thoughts on Abortion. &    Con \\
1118 &   4 &   0 &                                           Abortion &    Pro \\
982  &   3 &   1 &                       Term Lengths and Limitations &    Con \\
1207 &   3 &   1 &  macroevolution of humans has stopped for the f... &    Con \\
1223 &   3 &   1 &       Racist's are ignorant and are purely stupid. &    Con \\
984  &   2 &   6 &                        Is Google Making Us Stupid? &    Con \\
\bottomrule
\end{tabular}



In [31]:
class Transformer_get_political_align(BaseEstimator, TransformerMixin):
    """Political-ideology alignment between each debater and the debate's voters.

    For every debate row, and separately for the Pro and the Con debater, a
    3-vector is produced:

        [share of voters with a different ideology,
         share of voters whose ideology is 'Not Saying',
         share of voters with the same ideology as the debater]

    ``transform`` returns the Pro vector followed by the Con vector, i.e. six
    columns per debate. Ideologies are looked up in the notebook-level
    ``df_user`` frame (indexed by user name).
    """

    def __init__(self):
        # Neither attribute is read by the other methods of this class.
        # NOTE(review): the category order comes from iterating a set, so it
        # is arbitrary even though the dtype is declared ordered -- confirm
        # before relying on it.
        self.category = list(set(df_user.political_ideology))
        self.religous_type = CategoricalDtype(self.category, ordered=True)

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the object fits sklearn pipelines."""
        return self

    def transform(self, df, y=None):
        """Build the six alignment features for every row of ``df``.

        ``df`` must provide the columns 'voters' (iterable of user names),
        'pro_debater' and 'con_debater' (user names present in ``df_user``).

        Returns an array with one row per debate: columns 0-2 describe the
        Pro debater, columns 3-5 the Con debater.
        """
        voters_political = df.loc[:, 'voters'].apply(self.get_political_voters)

        pro_political_align = self._side_alignment(df.loc[:, 'pro_debater'], voters_political)
        con_political_align = self._side_alignment(df.loc[:, 'con_debater'], voters_political)

        return np.hstack([pro_political_align, con_political_align])

    def _side_alignment(self, debaters, voters_political):
        """Stack the alignment vectors of one side (one row per debate)."""
        user_political = debaters.apply(self.get_political)
        pairs = pd.DataFrame({'user_political': user_political,
                              'voters_political': voters_political})
        aligned = pairs.apply(self.get_match_political, axis=1)
        return np.vstack(aligned.values)

    def get_political(self, user):
        """Return the 'political_ideology' entry of ``user`` in ``df_user``.

        Raises KeyError when ``user`` is not in the ``df_user`` index.
        """
        return df_user.loc[user, "political_ideology"]

    def get_political_voters(self, voters):
        """Return the ideologies of the voters that exist in ``df_user``.

        Voters missing from ``df_user`` are skipped. When no voter is left
        (including an empty ``voters``), ``np.nan`` is returned instead of an
        array.
        """
        voters = np.array(voters)
        is_known = [voter in df_user.index for voter in voters]
        eligible_voters = voters[is_known]

        if len(eligible_voters) > 0:
            return np.array(list(map(self.get_political, eligible_voters)))
        return np.nan

    @staticmethod
    def _is_missing(value):
        """True when ``value`` is a float NaN (the 'no eligible voters' marker).

        A value test is used instead of ``value is np.nan``: the identity test
        only succeeds for the very same NaN object and lets any other NaN
        through to the voter loop, where iterating a float raises TypeError.
        """
        return isinstance(value, float) and np.isnan(value)

    def get_match_political(self, row):
        """Compute the 3-vector [mismatch, 'Not Saying', match] for one debater.

        ``row`` must expose 'user_political' (the debater's ideology) and
        'voters_political' (array of voter ideologies, or NaN when no voter
        is known).

        * no known voters            -> [0, 0, 0]
        * debater is 'Not Saying'    -> [0, 1, 0]
        * otherwise                  -> voter counts divided by the number of voters
        """
        user_political, voters_political = row["user_political"], row["voters_political"]

        if self._is_missing(voters_political):
            return [0, 0, 0]

        if user_political == 'Not Saying':
            return np.array([0, 1, 0])

        feature = np.array([0, 0, 0])
        for v_r in voters_political:
            if v_r == 'Not Saying':
                feature[1] += 1
            elif v_r == user_political:
                feature[2] += 1
            else:
                feature[0] += 1

        # Counts -> shares; the total equals the number of voters considered.
        return feature / np.sum(feature)

# Run the transformer on the training debates and show the resulting matrix
# (three alignment columns for the Pro debater followed by three for the Con debater).
trans = Transformer_get_political_align()
feat = trans.transform(df_train)
print(feat)

[[1.         0.         0.         0.75       0.         0.25      ]
 [0.85714286 0.14285714 0.         0.71428571 0.14285714 0.14285714]
 [0.75       0.0625     0.1875     0.75       0.0625     0.1875    ]
 ...
 [1.         0.         0.         0.66666667 0.         0.33333333]
 [0.66666667 0.22222222 0.11111111 0.44444444 0.22222222 0.33333333]
 [1.         0.         0.         0.33333333 0.         0.66666667]]


In [58]:
df_user.loc['Zealotical']

big_issues_dict       {'Abortion': 'Con', 'Affirmative Action': 'N/S...
birthday                                                    - Private -
education                                                    Not Saying
ethnicity                                                         White
gender                                                             Male
friends               [Chrysippus, Deathgodxiii, Ruperttheg, grayron...
income                                                       Not Saying
joined                                                      9 Years Ago
opinion_arguments                                                    []
opinion_questions                                                    []
party                                                             Other
political_ideology                                                Other
poll_topics                                                          []
poll_votes                                                      

In [56]:
# Place the six alignment features (columns 0-5) beside the debate metadata and
# rank by column 2, the share of voters whose ideology equals the Pro debater's.
df = pd.concat([pd.DataFrame(feat), df_train], axis=1)
df.sort_values(by=[2], ascending=False)

Unnamed: 0,0,1,2,3,4,5,id,category,title,rounds,date,pro_debater,con_debater,voters,winner
287,0.000000,0.000000,1.0,0.000000,1.000000,0.000000,The-existence-of-free-will-as-proposed-by-the-...,Religion,The existence of free will as proposed by the ...,"[[{'side': 'Con', 'text': '  I challenge an...",2011-06-14,KeytarHero,GOD-vs-ITSELF,"[ReformedArsenal, medic0506, Cliff.Stamp]",Pro
498,0.000000,0.000000,1.0,1.000000,0.000000,0.000000,Medical-Marijuana/3/,Health,Medical Marijuana,"[[{'side': 'Con', 'text': '  ======== In...",2010-11-21,Atheism,Zealotical,"[Atheism, MarcL]",Pro
1288,0.000000,0.000000,1.0,1.000000,0.000000,0.000000,Lighting-Impromptu-Debate/2/,Miscellaneous,Lighting Impromptu Debate,"[[{'side': 'Pro', 'text': '  This is a ligh...",2010-08-09,Strikeeagle84015,Grape,"[I-am-a-panda, Strikeeagle84015]",Con
1065,0.000000,0.000000,1.0,1.000000,0.000000,0.000000,Atheism-can-not-account-for-moral-truth/1/,Philosophy,Atheism can not account for moral truth,"[[{'side': 'Pro', 'text': '  To start, I wa...",2011-06-23,charles15,LeoL,"[Cliff.Stamp, GMDebater, RoyLatham]",Con
265,0.000000,0.000000,1.0,0.000000,0.000000,1.000000,The-Bible-doesnt-forbid-Tattoos/1/,Religion,The Bible doesn't forbid Tattoos,"[[{'side': 'Pro', 'text': '  The argument i...",2013-08-02,dwmiller,wolfman4711,"[GOP, Chapule]",Pro
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
651,0.000000,1.000000,0.0,0.666667,0.166667,0.166667,Standardized-Exit-Exams-Lincoln-Douglas-Format/1/,Education,Standardized Exit Exams: Lincoln-Douglas Format,"[[{'side': 'Pro', 'text': '  Though I will ...",2009-10-08,alto2osu,simpleton,"[insignia96, LadyHavok13, seeley.linda, Nails,...",Pro
650,0.666667,0.333333,0.0,0.333333,0.333333,0.333333,Singing-Challenge/1/,Arts,Singing Challenge,"[[{'side': 'Pro', 'text': '  Let's begin wi...",2016-05-01,TurleDoveSammie,TUF,"[ResponsiblyIrresponsible, ESocialBookworm, Ro...",Pro
646,1.000000,0.000000,0.0,1.000000,0.000000,0.000000,President-Obama-should-not-have-gotten-the-maj...,Politics,President Obama should not have gotten the maj...,"[[{'side': 'Pro', 'text': '  Barack Obama s...",2013-01-21,Jester02,Deadlykris,"[OhioGary, tmar19652, proglib]",Con
644,0.000000,1.000000,0.0,0.500000,0.500000,0.000000,THBT-Debate.org-should-pay-fifteen-qualified-j...,People,THBT Debate.org should pay fifteen qualified j...,"[[{'side': 'Pro', 'text': '  Rounds -...",2015-05-25,Palmo10,cathaystewie,"[TheJuniorVarsityNovice, F-16_Fighting_Falcon]",Pro


In [2]:
# What two linguistic features did you incorporate in the Ngram+Lex+Ling model? State your hypothesis about which linguistic features you think would be useful (and why), and perform some data analysis to show that they would be useful to incorporate into your model. Give two examples where these features would help you identify the winner of a debate.
# Directory holding one CSV of experiment results per feature combination.
REPORT_PATH = './resources/report/abs_lexicon_feature/'

NGRAM_ONLY = f'{REPORT_PATH}ngram_only.csv'
NGRAM_LEXICON_ONLY = f'{REPORT_PATH}ngram_lexicon_only.csv'
NGRAM_LINGUISTIC_ONLY = f'{REPORT_PATH}ngram_linguistic_only.csv'
NGRAM_LEXICON_LINGUISTIC_ONLY = f'{REPORT_PATH}ngram_lexicon_linguistic_only.csv'
NGRAM_LEXICON_LINGUISTIC_USERS_ONLY = f'{REPORT_PATH}ngram_lexicon_linguistic_users_only.csv'

# The first CSV column is used as the row index of each report frame.
report_df_ngram_only = pd.read_csv(NGRAM_ONLY, index_col=0)
report_df_ngram_lexicon_only = pd.read_csv(NGRAM_LEXICON_ONLY, index_col=0)
report_df_ngram_linguistic_only = pd.read_csv(NGRAM_LINGUISTIC_ONLY, index_col=0)
report_df_ngram_lexicon_linguistic_only = pd.read_csv(NGRAM_LEXICON_LINGUISTIC_ONLY, index_col=0)
report_df_ngram_lexicon_linguistic_users_only = pd.read_csv(NGRAM_LEXICON_LINGUISTIC_USERS_ONLY, index_col=0)

# Every report frame, so later cells can post-process them in one loop.
all_report_df = [
    report_df_ngram_lexicon_linguistic_only,
    report_df_ngram_lexicon_only,
    report_df_ngram_linguistic_only,
    report_df_ngram_only,
    report_df_ngram_lexicon_linguistic_users_only,
]

In [3]:
def convert_report_to_dict(report_df):
    """Parse the 'val_report' and 'train_report' columns into Python objects, in place.

    Cells that are strings are parsed with ``ast.literal_eval``, which accepts
    only Python literals (dicts, lists, numbers, strings, ...) and raises
    ``ValueError``/``SyntaxError`` for anything else, so text coming from a
    CSV file can never execute code. Cells that are not strings (for example
    because this function already ran on the frame) are left untouched, which
    makes re-running the calling cell safe.

    Parameters
    ----------
    report_df : pandas.DataFrame
        Must contain the columns 'val_report' and 'train_report'.

    Returns
    -------
    None
        ``report_df`` is modified in place.
    """
    def _parse(cell):
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    for column in ('val_report', 'train_report'):
        report_df[column] = report_df[column].apply(_parse)
    
def add_accuracy(report_df):
    """Add 'val_accuracy' and 'train_accuracy' columns to ``report_df`` in place.

    Each value is the 'accuracy' entry of the report stored in the matching
    'val_report' / 'train_report' cell, so those cells must already be
    subscriptable by key. Returns None.
    """
    for split in ('val', 'train'):
        reports = report_df.loc[:, split + '_report']
        report_df[split + '_accuracy'] = reports.apply(lambda report: report['accuracy'])
    
def _max_val_accuracy_by(report_df, group_column):
    """Keep, for every distinct value of ``group_column``, the row with the
    highest 'val_accuracy', and return those rows sorted by 'val_accuracy'
    in descending order.

    Shared implementation of the three public ``max_val_accuracy_by_*``
    functions. ``report_df`` must contain ``group_column`` and 'val_accuracy';
    the values of ``group_column`` must be hashable. Result rows keep their
    original index labels.
    """
    best_rows = []
    for _, group in report_df.groupby(group_column):
        # argmax is positional within the group, hence iloc rather than loc.
        best_position = group['val_accuracy'].argmax()
        best_rows.append(group.iloc[best_position, :])

    return pd.DataFrame(best_rows).sort_values(by=['val_accuracy'], ascending=False)


def max_val_accuracy_by_linguistic_features(report_df):
    """Best row (by 'val_accuracy') for each distinct 'linguistic_trans' value,
    sorted from highest to lowest validation accuracy."""
    return _max_val_accuracy_by(report_df, 'linguistic_trans')


def max_val_accuracy_by_lexicon_features(report_df):
    """Best row (by 'val_accuracy') for each distinct 'lexicon_trans' value,
    sorted from highest to lowest validation accuracy."""
    return _max_val_accuracy_by(report_df, 'lexicon_trans')


def max_val_accuracy_by_users_features(report_df):
    """Best row (by 'val_accuracy') for each distinct 'users_trans' value,
    sorted from highest to lowest validation accuracy."""
    return _max_val_accuracy_by(report_df, 'users_trans')

In [4]:
# Parse the stored report cells and add the accuracy columns on every report
# frame; both helpers modify the frame they are given.
for report_df in all_report_df:
    convert_report_to_dict(report_df)
    add_accuracy(report_df)

In [107]:
# Report frames compared when looking at linguistic features.
linguistic_report_df = [
    report_df_ngram_lexicon_linguistic_only,
    report_df_ngram_linguistic_only,
    report_df_ngram_lexicon_linguistic_users_only,
]

# Report frames compared when looking at lexicon features.
# NOTE(review): `report_df_ngram_linguistic_only` is listed here although its
# name suggests runs without lexicon features -- confirm this is intended.
lexicon_report_df = [
    report_df_ngram_lexicon_linguistic_only,
    report_df_ngram_linguistic_only,
    report_df_ngram_lexicon_only,
]

In [142]:
# User-feature combinations of the best run per combination, ordered from the
# highest to the lowest validation accuracy.
df = (
    max_val_accuracy_by_users_features(report_df_ngram_lexicon_linguistic_users_only)
    .drop(columns=['train_report', 'val_report'])
    .users_trans
)

for i in df:
    print(i)

['transformer_get_political_align', 'transformer_get_gender_user_align']
['transformer_get_religious_align', 'transformer_get_education_user_align']
['transformer_get_religious_align', 'transformer_get_political_align']
['transformer_get_political_align', 'transformer_get_party_user_align']
['transformer_get_cosine_similarity ', 'transformer_get_ethnicity_user_align']
['transformer_get_political_align', 'transformer_get_ethnicity_user_align']
['transformer_get_political_align', 'transformer_get_education_user_align']
['transformer_get_cosine_similarity ', 'transformer_get_relationship_user_align']
['transformer_get_education_user_align', 'transformer_get_ethnicity_user_align']
['transformer_get_religious_align', 'transformer_get_relationship_user_align']
['transformer_get_education_user_align', 'transformer_get_party_user_align']
['transformer_get_religious_align', 'transformer_get_ethnicity_user_align']
['transformer_get_cosine_similarity ', 'transformer_get_political_align']
['transf

In [24]:
len('transformer_get_')

16

In [26]:
# Prefix carried by the transformer names stored in the `users_trans` column;
# it is removed to keep the report table readable.
TRANSFORMER_PREFIX = 'transformer_get_'

# Rank the user-feature combinations once and reuse the result, instead of
# running the same ranking twice.
target_df = max_val_accuracy_by_users_features(report_df_ngram_lexicon_linguistic_users_only)
user_features = target_df['users_trans'].copy()

ans = []
for item in user_features:
    # Each cell holds the textual form of a list of names; literal_eval parses
    # it without executing arbitrary code.
    names = ast.literal_eval(item)
    ans.append([
        name[len(TRANSFORMER_PREFIX):] if name.startswith(TRANSFORMER_PREFIX) else name
        for name in names
    ])

print(ans)
target_df['users_trans'] = ans

[['political_align', 'gender_user_align'], ['religious_align', 'education_user_align'], ['religious_align', 'political_align'], ['political_align', 'party_user_align'], ['cosine_similarity ', 'ethnicity_user_align'], ['political_align', 'ethnicity_user_align'], ['political_align', 'education_user_align'], ['cosine_similarity ', 'relationship_user_align'], ['education_user_align', 'ethnicity_user_align'], ['religious_align', 'relationship_user_align'], ['education_user_align', 'party_user_align'], ['religious_align', 'ethnicity_user_align'], ['cosine_similarity ', 'political_align'], ['political_align', 'relationship_user_align'], ['education_user_align', 'relationship_user_align'], ['religious_align', 'party_user_align'], ['gender_user_align', 'relationship_user_align'], ['religious_align', 'gender_user_align'], ['party_user_align', 'relationship_user_align'], ['cosine_similarity ', 'religious_align'], ['cosine_similarity ', 'party_user_align'], ['education_user_align', 'gender_user_al

In [27]:
# Export the shortened feature names with their validation accuracy as a LaTeX table.
latex = target_df.loc[:, ['users_trans', 'val_accuracy']].to_latex()
print(latex)

\begin{tabular}{llr}
\toprule
{} &                                      users\_trans &  val\_accuracy \\
\midrule
15 &             [political\_align, gender\_user\_align] &      0.786967 \\
8  &          [religious\_align, education\_user\_align] &      0.766917 \\
7  &               [religious\_align, political\_align] &      0.766917 \\
14 &              [political\_align, party\_user\_align] &      0.766917 \\
5  &       [cosine\_similarity , ethnicity\_user\_align] &      0.764411 \\
16 &          [political\_align, ethnicity\_user\_align] &      0.764411 \\
13 &          [political\_align, education\_user\_align] &      0.764411 \\
6  &    [cosine\_similarity , relationship\_user\_align] &      0.761905 \\
20 &     [education\_user\_align, ethnicity\_user\_align] &      0.761905 \\
12 &       [religious\_align, relationship\_user\_align] &      0.759398 \\
18 &         [education\_user\_align, party\_user\_align] &      0.759398 \\
11 &          [religious\_align, ethnicity\_user\

In [105]:
max_val_accuracy_by_linguistic_features(linguistic_report_df[0])

Unnamed: 0,ngram_trans,lexicon_trans,linguistic_trans,users_trans,train_report,val_report,val_accuracy,train_accuracy
44,['trigram'],['vad'],"['reference_to_opponent', 'swear_words']",[],"{'0': {'precision': 0.7689119170984456, 'recal...","{'0': {'precision': 0.7389558232931727, 'recal...",0.769424,0.750628
51,['trigram'],['vad'],"['swear_words', 'personal_pronouns']",[],"{'0': {'precision': 0.7771911298838438, 'recal...","{'0': {'precision': 0.7330677290836654, 'recal...",0.764411,0.754397
55,['trigram'],['vad'],"['swear_words', 'number']",[],"{'0': {'precision': 0.7635416666666667, 'recal...","{'0': {'precision': 0.7269076305220884, 'recal...",0.754386,0.742462
52,['trigram'],['vad'],"['swear_words', 'questions']",[],"{'0': {'precision': 0.7522842639593909, 'recal...","{'0': {'precision': 0.7198443579766537, 'recal...",0.754386,0.736809
47,['trigram'],['vad'],"['reference_to_opponent', 'websites']",[],"{'0': {'precision': 0.7722980062959076, 'recal...","{'0': {'precision': 0.7276422764227642, 'recal...",0.75188,0.750628
37,['trigram'],['vad'],"['length', 'swear_words']",[],"{'0': {'precision': 0.7689873417721519, 'recal...","{'0': {'precision': 0.7295081967213115, 'recal...",0.75188,0.744975
56,['trigram'],['vad'],"['swear_words', 'modal_verb']",[],"{'0': {'precision': 0.7664921465968586, 'recal...","{'0': {'precision': 0.7333333333333333, 'recal...",0.75188,0.744347
60,['trigram'],['vad'],"['personal_pronouns', 'number']",[],"{'0': {'precision': 0.7623456790123457, 'recal...","{'0': {'precision': 0.7176470588235294, 'recal...",0.749373,0.744975
66,['trigram'],['vad'],"['websites', 'exclamation']",[],"{'0': {'precision': 0.7642782969885774, 'recal...","{'0': {'precision': 0.7176470588235294, 'recal...",0.749373,0.744347
36,['trigram'],['vad'],"['length', 'reference_to_opponent']",[],"{'0': {'precision': 0.7762312633832976, 'recal...","{'0': {'precision': 0.7246963562753036, 'recal...",0.749373,0.748744


In [9]:
max_val_accuracy_by_linguistic_features(linguistic_report_df[1])

Unnamed: 0,ngram_trans,lexicon_trans,linguistic_trans,users_trans,train_report,val_report,val_accuracy,train_accuracy
15,['trigram'],[],"['swear_words', 'personal_pronouns']",[],"{'0': {'precision': 0.8252947481243301, 'recal...","{'0': {'precision': 0.7416666666666667, 'recal...",0.761905,0.805905
31,['trigram'],[],"['websites', 'number']",[],"{'0': {'precision': 0.8487654320987654, 'recal...","{'0': {'precision': 0.7233201581027668, 'recal...",0.754386,0.850503
28,['trigram'],[],"['questions', 'number']",[],"{'0': {'precision': 0.8577319587628865, 'recal...","{'0': {'precision': 0.724, 'recall': 0.8578199...",0.75188,0.860553
14,['trigram'],[],"['reference_to_opponent', 'modal_verb']",[],"{'0': {'precision': 0.8571428571428571, 'recal...","{'0': {'precision': 0.726530612244898, 'recall...",0.749373,0.864322
3,['trigram'],[],"['length', 'questions']",[],"{'0': {'precision': 0.8108395324123273, 'recal...","{'0': {'precision': 0.7217741935483871, 'recal...",0.746867,0.792085
18,['trigram'],[],"['swear_words', 'exclamation']",[],"{'0': {'precision': 0.8670756646216768, 'recal...","{'0': {'precision': 0.7217741935483871, 'recal...",0.746867,0.875628
13,['trigram'],[],"['reference_to_opponent', 'number']",[],"{'0': {'precision': 0.8297872340425532, 'recal...","{'0': {'precision': 0.7235772357723578, 'recal...",0.746867,0.833543
35,['trigram'],[],"['number', 'modal_verb']",[],"{'0': {'precision': 0.8273453093812375, 'recal...","{'0': {'precision': 0.706766917293233, 'recall...",0.746867,0.836683
34,['trigram'],[],"['exclamation', 'modal_verb']",[],"{'0': {'precision': 0.8517766497461929, 'recal...","{'0': {'precision': 0.7171314741035857, 'recal...",0.744361,0.859925
22,['trigram'],[],"['personal_pronouns', 'websites']",[],"{'0': {'precision': 0.8310880829015544, 'recal...","{'0': {'precision': 0.7224489795918367, 'recal...",0.744361,0.826005


In [140]:
max_val_accuracy_by_linguistic_features(linguistic_report_df[0])

Unnamed: 0,ngram_trans,lexicon_trans,linguistic_trans,users_trans,train_report,val_report,val_accuracy,train_accuracy
44,['trigram'],['vad'],"['reference_to_opponent', 'swear_words']",[],"{'0': {'precision': 0.7689119170984456, 'recal...","{'0': {'precision': 0.7389558232931727, 'recal...",0.769424,0.750628
51,['trigram'],['vad'],"['swear_words', 'personal_pronouns']",[],"{'0': {'precision': 0.7771911298838438, 'recal...","{'0': {'precision': 0.7330677290836654, 'recal...",0.764411,0.754397
55,['trigram'],['vad'],"['swear_words', 'number']",[],"{'0': {'precision': 0.7635416666666667, 'recal...","{'0': {'precision': 0.7269076305220884, 'recal...",0.754386,0.742462
52,['trigram'],['vad'],"['swear_words', 'questions']",[],"{'0': {'precision': 0.7522842639593909, 'recal...","{'0': {'precision': 0.7198443579766537, 'recal...",0.754386,0.736809
47,['trigram'],['vad'],"['reference_to_opponent', 'websites']",[],"{'0': {'precision': 0.7722980062959076, 'recal...","{'0': {'precision': 0.7276422764227642, 'recal...",0.75188,0.750628
37,['trigram'],['vad'],"['length', 'swear_words']",[],"{'0': {'precision': 0.7689873417721519, 'recal...","{'0': {'precision': 0.7295081967213115, 'recal...",0.75188,0.744975
56,['trigram'],['vad'],"['swear_words', 'modal_verb']",[],"{'0': {'precision': 0.7664921465968586, 'recal...","{'0': {'precision': 0.7333333333333333, 'recal...",0.75188,0.744347
60,['trigram'],['vad'],"['personal_pronouns', 'number']",[],"{'0': {'precision': 0.7623456790123457, 'recal...","{'0': {'precision': 0.7176470588235294, 'recal...",0.749373,0.744975
66,['trigram'],['vad'],"['websites', 'exclamation']",[],"{'0': {'precision': 0.7642782969885774, 'recal...","{'0': {'precision': 0.7176470588235294, 'recal...",0.749373,0.744347
36,['trigram'],['vad'],"['length', 'reference_to_opponent']",[],"{'0': {'precision': 0.7762312633832976, 'recal...","{'0': {'precision': 0.7246963562753036, 'recal...",0.749373,0.748744


KeyError: 1

In [121]:
print(max_val_accuracy_by_linguistic_features(report_df_ngram_lexicon_linguistic_only).drop(columns=['train_report', 'ngram_trans', 'val_report', 'users_trans']).to_latex())

\begin{tabular}{lllrr}
\toprule
{} &    lexicon\_trans &                                linguistic\_trans &  val\_accuracy &  train\_accuracy \\
\midrule
44 &          ['vad'] &        ['reference\_to\_opponent', 'swear\_words'] &      0.769424 &        0.750628 \\
51 &          ['vad'] &            ['swear\_words', 'personal\_pronouns'] &      0.764411 &        0.754397 \\
55 &          ['vad'] &                       ['swear\_words', 'number'] &      0.754386 &        0.742462 \\
52 &          ['vad'] &                    ['swear\_words', 'questions'] &      0.754386 &        0.736809 \\
47 &          ['vad'] &           ['reference\_to\_opponent', 'websites'] &      0.751880 &        0.750628 \\
37 &          ['vad'] &                       ['length', 'swear\_words'] &      0.751880 &        0.744975 \\
56 &          ['vad'] &                   ['swear\_words', 'modal\_verb'] &      0.751880 &        0.744347 \\
60 &          ['vad'] &                 ['personal\_pronouns', 'number'

In [139]:
max_val_accuracy_by_lexicon_features(lexicon_report_df[2])

Unnamed: 0,ngram_trans,lexicon_trans,linguistic_trans,users_trans,train_report,val_report,val_accuracy,train_accuracy
1,['trigram'],['vad'],[],[],"{'0': {'precision': 0.817047817047817, 'recall...","{'0': {'precision': 0.7075098814229249, 'recal...",0.734336,0.807789
0,['trigram'],['connotation'],[],[],"{'0': {'precision': 0.7622682660850599, 'recal...","{'0': {'precision': 0.7035573122529645, 'recal...",0.729323,0.726759


print(max_val_accuracy_by_lexicon_features(linguistic_report_df[0]).drop(columns=['train_report', 'val_report', 'ngram_trans', 'linguistic_trans', 'users_trans']).to_latex())