In [1]:
import collections
from numpy.lib.function_base import vectorize
import pandas as pd 
import numpy as np
import json
from scipy import sparse
import sklearn.metrics
import sklearn.neighbors
import sklearn.linear_model
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from better_profanity import profanity
import time

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Locations of the user metadata and of the training / validation debates.
USER_DATA = './resources/data/users.json'
TRAINING_DATA = './resources/data/train.jsonl'
VAL_DATA = './resources/data/val.jsonl'

# users.json is read with orient="index".
df_user = pd.read_json(USER_DATA, orient="index")

# The debate files are JSON Lines: one JSON document per line.
df_train = pd.read_json(TRAINING_DATA, lines=True)
df_val = pd.read_json(VAL_DATA, lines=True)

# Getting Linguistic Features

In [78]:
class Transformer_identity(BaseEstimator, TransformerMixin):
    '''Pass-through transformer: `transform` hands back its input unchanged.'''

    def fit(self, X, y=None):
        '''Nothing is learned; return the estimator itself.'''
        return self

    def transform(self, X, y=None):
        '''Return `X` exactly as received.'''
        return X

class Transformer_get_length(BaseEstimator, TransformerMixin):
    '''Length feature: the per-row total of the unigram count matrix.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        '''Stateless; return the estimator itself.'''
        return self

    def transform(self, X, y=None):
        '''
        Sum `X["unigram"]` along axis 1.

        X is indexed by the key "unigram"; the value found there must
        support `.sum(axis=1)`.
        '''
        return X["unigram"].sum(axis=1)
class Transformer_get_reference_to_opponent(BaseEstimator, TransformerMixin):
    '''Counts how often each document mentions the opponent's username.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        '''Stateless; return the estimator itself.'''
        return self

    def transform(self, X, y=None):
        '''
        Return an (n_rows, 1) array of case-insensitive mention counts.

        X["df"] must be a DataFrame with a "document" column (the text) and
        an "opponent" column (the username to look for).
        '''
        df = X["df"]

        def count_mentions(row):
            # Both sides of the comparison are lower-cased.  Previously only
            # the document was, so a username containing an upper-case
            # letter could never match.
            name = row["opponent"].lower()
            if not name:
                # str.count("") returns len(text) + 1, which is not a mention count.
                return 0
            return row["document"].lower().count(name)

        count = df.apply(count_mentions, axis=1).values
        return np.reshape(count, (-1, 1))
    
class Transformer_get_swear_words(BaseEstimator, TransformerMixin):
    '''Counts profane unigrams per document using `better_profanity`.'''

    def __init__(self):
        # Cached (vocabulary_size, 1) 0/1 column marking profane vocabulary
        # entries; built on the first call to `transform`.
        self.matrix = None

    def fit(self, X, y=None):
        '''Nothing is learned here; return the estimator itself.'''
        return self

    def transform(self, X, y=None):
        '''
        Return `X["unigram"] @ indicator`, where `indicator` flags every
        vocabulary entry of `X["unigram_vectorizer"]` that
        `profanity.contains_profanity` reports as profane.
        '''
        # perhaps get rid some of the swear words because they look like they
        # are necessary words for discussion such as arian, sodom
        unigram = X['unigram']
        unigram_vectorizer = X['unigram_vectorizer']

        # `is None` instead of `not self.matrix`: once the cache holds an
        # ndarray with more than one element, its truth value is ambiguous
        # and `not self.matrix` raises ValueError on the second call.
        if self.matrix is None:
            vector = [int(profanity.contains_profanity(word))
                      for word in unigram_vectorizer.get_feature_names()]
            self.matrix = np.reshape(vector, (-1, 1))

        swear_pro = unigram @ self.matrix
        return swear_pro

class Transformer_get_personal_pronouns(BaseEstimator, TransformerMixin):
    '''One count column per personal pronoun, by plain substring search.'''

    def __init__(self):
        self.personal_pronouns = pd.Series(
            ["I", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them"])
        self.matrix = None

    def fit(self, X, y=None):
        '''Stateless; return the estimator itself.'''
        return self

    def transform(self, X, y=None):
        '''
        Return an (n_documents, n_pronouns) array of counts.

        Each pronoun is searched for surrounded by single spaces, so the
        match is case-sensitive and skips occurrences next to punctuation
        or at the very start/end of a document (the "faster but less
        accurate" variant).
        '''
        texts = X['df']['document']

        per_pronoun = [
            np.array([text.count(f" {pronoun} ") for text in texts]).reshape(-1, 1)
            for pronoun in self.personal_pronouns
        ]
        return np.hstack(per_pronoun)


    #     ================== more accurate but slower ========================
        # personal_pronouns_vector = unigram_vectorizer.transform(personal_pronouns)
        # matrix_person_pronouns = personal_pronouns_vector.T 

        # document_pro = document_side[:, 0]
        # unigram_pro = vectorizer.transform(document_pro)
        # personal_pronouns_feature_pro = unigram_pro @ matrix_person_pronouns
        # I_count_pro = np.array(list(map(lambda x: x.count(" I "), document_pro)))
        # I_count_pro = np.reshape(I_count_pro, newshape=[-1, 1])
        # personal_pronouns_feature_pro = sparse.hstack([personal_pronouns_feature_pro, I_count_pro])

        # document_con = document_side[:, 1]
        # unigram_con = vectorizer.transform(document_con)
        # personal_pronouns_feature_con = unigram_con @ matrix_person_pronouns
        # I_count_con = np.array(list(map(lambda x: x.count(" I "), document_con)))
        # I_count_con = np.reshape(I_count_con, newshape=[-1, 1])
        # personal_pronouns_feature_con = sparse.hstack([personal_pronouns_feature_con, I_count_con])
        
class Transformer_get_ngrams(BaseEstimator, TransformerMixin):
    '''
    Builds the per-side feature bundle consumed by the linguistic transformers.

    Parameters
    ----------
    side : str or None
        Default side ("Pro" or "Con") used when `transform` is called without
        an explicit `side`, which is how sklearn Pipeline / FeatureUnion call it.
    '''

    def __init__(self, side=None):
        self.side = side

    def fit(self, X, y=None):
        '''Stateless; return the estimator itself.'''
        return self

    def transform(self, X, side=None, y=None):
        '''
        Return a dict with the keys "df", "unigram", "trigram" and
        "unigram_vectorizer" for one side of the debate.

        X is read through the keys 'document', 'df_train',
        'unigram_vectorizer' and 'trigram_vectorizer'.
        NOTE(review): Transformer_separate_document emits the keys "df",
        "unigram", "trigram" and "document" instead — confirm which naming
        is intended before chaining the two.

        `side` used to be a required positional argument, which made the
        transformer unusable inside a Pipeline (sklearn calls
        `transform(X)` only).  It now falls back to `self.side`.
        '''
        if side is None:
            side = self.side
        if side is None:
            raise ValueError(
                "side must be given either to transform() or to the constructor")

        # The opponent of the Pro side is the Con debater and vice versa.
        opponent = "con_debater" if side == "Pro" else "pro_debater"
        document = X['document']
        df_train = X['df_train']
        unigram_vectorizer = X['unigram_vectorizer']
        trigram_vectorizer = X['trigram_vectorizer']

        df = pd.DataFrame.from_dict(
            {
                "document": document,
                "opponent": df_train.loc[:, opponent]
            }
        )

        feature = {"df": df,
                   "unigram": unigram_vectorizer.transform(document),
                   'trigram': trigram_vectorizer.transform(document),
                   "unigram_vectorizer": unigram_vectorizer
                   }

        return feature

class Transformer_separate_document(BaseEstimator, TransformerMixin):
    '''
    Splits every debate into its Pro and Con text and fits the unigram and
    1-3-gram vectorizers on the combined text of each debate.
    '''

    def __init__(self):
        self.unigram_vectorizer = None
        self.trigram_vectorizer = None

    def fit(self, X, y=None):
        '''
        Fit whichever vectorizer has not been fitted yet.

        X is the debate DataFrame accepted by `get_text_by_side`.  An
        already-fitted vectorizer is left untouched, as before.
        '''
        if self.unigram_vectorizer is None or self.trigram_vectorizer is None:
            # Previously `document` was never defined inside fit(), so the
            # first call raised NameError.  Build the corpus from X here.
            document_side = get_text_by_side(X)
            document = [side[0] + side[1] for side in document_side]

            if self.unigram_vectorizer is None:
                self.unigram_vectorizer = CountVectorizer()
                self.unigram_vectorizer.fit(document)

            if self.trigram_vectorizer is None:
                self.trigram_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8,
                                                          min_df=0.2, stop_words='english',
                                                          ngram_range=(1, 3))
                self.trigram_vectorizer.fit(document)

        return self

    def transform(self, df, y=None):
        '''
        Return {"Pro": {...}, "Con": {...}}; each inner dict carries the
        original frame, both fitted vectorizers and that side's documents.

        NOTE(review): Transformer_get_ngrams reads the keys 'df_train',
        'unigram_vectorizer' and 'trigram_vectorizer', which differ from the
        keys produced here — confirm the intended contract.
        '''
        document_side = get_text_by_side(df)

        def bundle(column):
            return {"df": df,
                    "unigram": self.unigram_vectorizer,
                    "trigram": self.trigram_vectorizer,
                    "document": document_side[:, column]}

        # Column 0 holds the Pro text and column 1 the Con text.  The Pro
        # branch used to index with an undefined name `d`.
        return {"Pro": bundle(0), "Con": bundle(1)}
        
        
# One shared instance of every custom transformer.
transformer_get_length = Transformer_get_length()
transformer_identity = Transformer_identity()
transformer_get_reference_to_opponent = Transformer_get_reference_to_opponent()
transformer_get_swear_words = Transformer_get_swear_words()
transformer_get_personal_pronouns = Transformer_get_personal_pronouns()
transformer_separate_document = Transformer_separate_document()
transformer_get_ngrams = Transformer_get_ngrams()

# Linguistic features computed side by side on one debater's text.
linguistic_trans = FeatureUnion(
    transformer_list=[
        ('length', transformer_get_length),
        ('reference_to_opponent', transformer_get_reference_to_opponent),
        ('swear_words', transformer_get_swear_words),
        ('personal_pronouns', transformer_get_personal_pronouns),
    ]
)

# Per-side chain: n-gram bundle first, linguistic features second.
side_trans = Pipeline(
    steps=[
        ('ngram', transformer_get_ngrams),
        ('linguistic', linguistic_trans),
    ]
)

# The same per-side chain is registered for the 'Pro' and the 'Con' entry.
both_trans = ColumnTransformer(
    transformers=[
        ('Pro', side_trans, 'Pro'),
        ('Con', side_trans, 'Con'),
    ]
)

# End to end: split the debates, featurise both sides, classify.
big_trans = Pipeline(
    steps=[
        ('separate_document', transformer_separate_document),
        ('get_both_features', both_trans),
        ('logistic_regression', sklearn.linear_model.LogisticRegression()),
    ]
)

# linguistic_feat = linguistic_trans.fit(input_pro).transform(input_pro)

In [73]:
# NOTE(review): `both_side_trans` is not defined in any visible cell (the
# ColumnTransformer built above is named `both_trans`), and this line only
# references the bound method without calling it — confirm what was intended.
both_side_trans.fit


array([[31,  8, 18, ...,  2,  0,  1],
       [25, 20,  6, ...,  0,  5,  4],
       [32, 22,  0, ...,  0,  2,  0],
       ...,
       [ 7, 14, 19, ...,  0,  1,  4],
       [ 0,  5,  0, ...,  0,  0,  0],
       [15, 15,  0, ...,  0,  2,  0]])

In [48]:
# Scratch check: assemble a Pro-side frame and compare its first row with the
# raw training record.
# NOTE(review): `document_pro` is not defined in any visible cell — confirm
# where it comes from before relying on this cell after a kernel restart.
df_pro = pd.DataFrame.from_dict({"document": document_pro, 
                                "unigram": unigram_vectorizer.transform(document_pro),
                                "opponent": df_train["con_debater"]})
# Bare expression in the middle of the cell: evaluated, but not displayed.
df_pro.loc[0,:]["opponent"]
print(df_pro.loc[0, :])
print(df_train.loc[0, ])

document    \n  \n  Thank you, Muted, for accepting this d...
unigram       (10, 0)\t1\n  (14, 0)\t1\n  (21, 0)\t1\n  (2...
opponent                                                Muted
Name: 0, dtype: object
id                      Atheism-is-more-probable-than-Theism./2/
category                                                Religion
title                      Atheism is more probable than Theism.
rounds         [[{'side': 'Pro', 'text': '
  
  Thank you, Mu...
date                                         2012-11-11 00:00:00
pro_debater                                            Microsuck
con_debater                                                Muted
voters         [truthseeker613, emj32, RationalMadman, Magic8...
winner                                                       Pro
Name: 0, dtype: object


In [38]:
# print([row.todense() for row in linguistic_feat])
# The recorded output of this cell is an AttributeError: the custom
# transformers in the FeatureUnion do not define get_feature_names.
print(linguistic_trans.get_feature_names())

AttributeError: Transformer length (type Transformer_get_length) does not provide get_feature_names.

In [3]:
# (a) Length
# (b) Reference to the opponent
# (c) Politeness words
# (d) Swear words
# (e) Personal pronouns
# (f) Modal verbs
# (g) Misspellings
# (h) Links to outside websites
# (i) Numbers
# (j) Exclamation points
# (k) Questions

def get_length(document_side, vectorizer): 
    '''
    Return (length_pro, length_con): the number of unigrams in each document.

    document_side: 2-D array, column 0 = Pro text, column 1 = Con text.
    vectorizer: a fitted vectorizer exposing `.transform` (as every other
        helper in this notebook expects), or a plain callable that maps
        documents to a count matrix.
    '''
    # The original called `vectorizer(...)` directly, which fails for a
    # CountVectorizer instance.  Prefer `.transform`, but keep accepting a
    # bare callable for backward compatibility.
    to_counts = getattr(vectorizer, "transform", vectorizer)

    length_pro = np.sum(to_counts(document_side[:, 0]), axis=1)
    length_con = np.sum(to_counts(document_side[:, 1]), axis=1)

    return length_pro, length_con

def get_reference_to_opponent(df, document_side, vectorizer): 
    '''
    Count, per debate, how often each side mentions the opponent's username.

    df: debate frame with "pro_debater" and "con_debater" columns, row-aligned
        with `document_side`.
    document_side: 2-D array, column 0 = Pro text, column 1 = Con text.
    vectorizer: unused; kept so all feature helpers share one signature.

    Returns (pro_count, con_count) as 1-D integer arrays.  Matching is
    case-insensitive.
    '''
    document_pro = document_side[:, 0]
    document_con = document_side[:, 1]

    def count_mentions(text, username):
        # Lower-case both sides; an empty name would otherwise "match"
        # len(text) + 1 times.
        username = username.lower()
        return text.lower().count(username) if username else 0

    # Fixes relative to the first draft: the columns are spelled
    # "*_debater" in the data, each document is looked up per row (the old
    # code called .lower() on the whole array), and the username is
    # lower-cased to match the lower-cased document.
    pro_count = [count_mentions(text, opponent)
                 for text, opponent in zip(document_pro, df["con_debater"])]
    con_count = [count_mentions(text, opponent)
                 for text, opponent in zip(document_con, df["pro_debater"])]

    return np.array(pro_count), np.array(con_count) 

def get_politeness_words(document_side, vectorizer):
    '''Placeholder for feature (c), politeness words; not implemented, returns None.'''
    return None


def get_swear_words(document_side, vectorizer):
    '''
    Count profane unigrams on each side of every debate.

    document_side: 2-D array, column 0 = Pro text, column 1 = Con text.
    vectorizer: fitted unigram vectorizer (`get_feature_names`, `transform`).

    Returns (swear_pro, swear_con), each of shape (n_debates, 1).
    '''
    # perhaps get rid some of the swear words because they look like they are
    # necessary words for discussion such as arian, sodom
    unigram = vectorizer.get_feature_names()
    # 0/1 column flagging the vocabulary entries better_profanity considers profane.
    vector_swear = [int(profanity.contains_profanity(word)) for word in unigram]
    matrix_swear = np.reshape(vector_swear, (-1, 1))

    swear_pro = vectorizer.transform(document_side[:, 0]) @ matrix_swear
    swear_con = vectorizer.transform(document_side[:, 1]) @ matrix_swear

    # The original computed both results but fell off the end and returned None.
    return swear_pro, swear_con

def get_personal_pronouns(document_side, vectorizer):
    '''
    Count personal pronouns on each side of every debate.

    document_side: 2-D array, column 0 = Pro text, column 1 = Con text.
    vectorizer: unused by this (fast) variant; kept for a uniform signature.

    Returns (pro, con), each of shape (n_debates, 12) with one column per
    pronoun in the order I, you, he, she, it, we, they, me, him, her, us, them.

    This is the "faster but less accurate" variant: a pronoun only counts
    when surrounded by single spaces, so matching is case-sensitive and
    misses pronouns next to punctuation or at a document boundary.  The
    slower alternative sketched earlier multiplied the unigram count matrix
    by the transposed pronoun vectors and added a separate " I " count.
    '''
    personal_pronouns = ["I", "you", "he", "she", "it", "we", "they",
                         "me", "him", "her", "us", "them"]

    def count_matrix(documents):
        columns = [
            np.reshape(np.array([doc.count(" {} ".format(name)) for doc in documents]),
                       (-1, 1))
            for name in personal_pronouns
        ]
        return np.hstack(columns)

    personal_pronouns_feature_pro = count_matrix(document_side[:, 0])
    # The Con result used to be stored under a misspelled name
    # (`personal_connouns_feature_con`), so the return raised NameError.
    personal_pronouns_feature_con = count_matrix(document_side[:, 1])

    return personal_pronouns_feature_pro, personal_pronouns_feature_con
    
        
def get_questions(document_side, vectorizer):
    '''
    Count the "?" characters in each document.

    Returns (pro_counts, con_counts) built from column 0 and column 1 of
    `document_side`.  `vectorizer` is not used.
    '''
    def question_marks(documents):
        return np.array([text.count("?") for text in documents])

    return question_marks(document_side[:, 0]), question_marks(document_side[:, 1])
    
    
def get_reference_website(document_side, vectorizer):
    '''
    Count occurrences of the substring "http" in each document.

    Returns (pro_counts, con_counts) built from column 0 and column 1 of
    `document_side`.  `vectorizer` is not used.
    '''
    def link_count(documents):
        return np.array([text.count("http") for text in documents])

    return link_count(document_side[:, 0]), link_count(document_side[:, 1])

def get_exclamation(document_side, vectorizer):
    '''
    Count the "!" characters in each document.

    Returns (pro_counts, con_counts) built from column 0 and column 1 of
    `document_side`.  `vectorizer` is not used.
    '''
    def bangs(documents):
        return np.array([text.count("!") for text in documents])

    return bangs(document_side[:, 0]), bangs(document_side[:, 1])

def get_number(document_side, vectorizer):
    '''
    Count the unigrams that start with a numeric character on each side.

    document_side: 2-D array, column 0 = Pro text, column 1 = Con text.
    vectorizer: fitted unigram vectorizer (`get_feature_names`, `transform`).

    Returns (number_pro, number_con), each of shape (n_debates, 1).
    '''
    # Use the vectorizer that was passed in.  The original read the notebook
    # global `unigram_vectorizer` here, so the indicator could disagree with
    # the vocabulary used by `vectorizer.transform` below.
    unigram = vectorizer.get_feature_names()
    # 0/1 column: 1 where the vocabulary entry begins with a numeric character.
    vector_number = [int(word[0].isnumeric()) for word in unigram]
    matrix_number = np.reshape(vector_number, (-1, 1))

    number_pro = vectorizer.transform(document_side[:, 0]) @ matrix_number
    number_con = vectorizer.transform(document_side[:, 1]) @ matrix_number

    return number_pro, number_con

def get_modal_verb(document_side, vectorizer):
    '''
    Count modal verbs (can, could, may, might, shall, should, will, would,
    must) on each side of every debate.

    document_side: 2-D array, column 0 = Pro text, column 1 = Con text.
    vectorizer: fitted unigram vectorizer (`get_feature_names`, `transform`).

    Returns (modal_verb_pro, modal_verb_con), each of shape (n_debates, 1).
    '''
    modal_verbs = {"can", "could", "may", "might", "shall", "should", "will", "would", "must"}

    # Use the vectorizer that was passed in.  The original read the notebook
    # global `unigram_vectorizer` here, so the indicator could disagree with
    # the vocabulary used by `vectorizer.transform` below.
    unigram = vectorizer.get_feature_names()
    vector_modal_verb = [int(word in modal_verbs) for word in unigram]
    matrix_modal_verb = np.reshape(vector_modal_verb, (-1, 1))

    modal_verb_pro = vectorizer.transform(document_side[:, 0]) @ matrix_modal_verb
    modal_verb_con = vectorizer.transform(document_side[:, 1]) @ matrix_modal_verb

    return modal_verb_pro, modal_verb_con


In [None]:
# Scratch cell trying out the `autocorrect` package on a sentence with typos —
# presumably groundwork for the misspelling feature (g); confirm.
# NOTE(review): this import sits outside the import cell at the top.
from autocorrect import Speller
spell = Speller()
spell.existing("I'm not sleapy and tehre is no place I'm giong to.")

In [None]:
# NOTE(review): `document_side` is not defined in any visible cell (later
# cells use `document_train_side` / `document_val_side`) — confirm.
get_modal_verb(document_side, unigram_vectorizer)

In [4]:
def get_jsonl(path):
    '''
    Read a JSON Lines file into a DataFrame, one row per JSON document.

    Blank or whitespace-only lines are skipped (they used to make
    `json.loads` raise), and the file is decoded as UTF-8 instead of the
    platform default.
    '''
    with open(path, encoding="utf-8") as json_file:
        data_list = [json.loads(line) for line in json_file if line.strip()]

    return pd.DataFrame(data_list)
def get_texts(df):
    '''
    Flatten every speech text in `df['rounds']` into one list, in order,
    ignoring which side spoke.
    '''
    return [
        speech['text']
        for debate_rounds in df.loc[:, 'rounds']
        for sub_round in debate_rounds
        for speech in sub_round
    ]

def get_text_by_side(df): 
    '''
    Collapse each debate into two strings: all Pro speeches joined together
    and all Con speeches joined together (no separator).

    Returns np.array([[pro_text, con_text], ...]) with one row per debate,
    in the order of `df['rounds']`.  Speeches whose 'side' is neither 'Pro'
    nor 'Con' do not appear in the result.
    '''
    per_debate = []
    for debate_rounds in df.loc[:, 'rounds']:
        by_side = {}
        speeches = (speech for sub_round in debate_rounds for speech in sub_round)
        for speech in speeches:
            by_side.setdefault(speech['side'], []).append(speech['text'])

        pro_text = "".join(by_side.get('Pro', []))
        con_text = "".join(by_side.get('Con', []))
        per_debate.append([pro_text, con_text])

    return np.array(per_debate)

def get_ngram_feature(document_side, vectorizer): 
    '''
    Vectorize both sides of every debate and concatenate them column-wise.

    document_side: 2-D array, column 0 = all Pro text of a debate,
        column 1 = all Con text of the same debate.
    vectorizer: fitted vectorizer exposing `.transform`.

    Returns a sparse matrix whose row i is
    [Pro n-gram vector | Con n-gram vector] for debate i, i.e. twice as
    many columns as `vectorizer.transform` produces.
    '''
    side_columns = (0, 1)  # Pro first, then Con
    per_side = [vectorizer.transform(document_side[:, column]) for column in side_columns]
    return sparse.hstack(per_side)

def get_debate_feature(df, feature_names=None):
    '''
    Integer-encode debate-level columns.

    df: debate frame.
    feature_names: columns to encode; defaults to ['category'].

    Returns an array of shape (n_debates, n_features) in which column j holds
    the `pd.factorize` codes of `feature_names[j]`.
    '''
    if feature_names is None:
        feature_names = ['category']

    feature = []
    for name in feature_names:
        # TODO: check for data type of the column. If non-numeric, then do this
        # otherwise, use the numerical data
        encoding, _unique_values = pd.factorize(df[name])
        feature.append(encoding)

    # Stack the per-feature code vectors as columns.  Reshaping the
    # (n_features, n_debates) array to (-1, n_features), as done before,
    # only equals this for a single feature; with more it mixes values of
    # different debates into one row.
    return np.column_stack(feature)

def get_connotation_feature(document_side, matrix_connotation, vectorizer):
    '''
    Project each side's n-gram counts onto the connotation matrix and place
    the Pro and Con results side by side: [Pro | Con].
    '''
    grams = [vectorizer.transform(document_side[:, column]) for column in (0, 1)]
    projected = [gram @ matrix_connotation for gram in grams]
    return np.hstack(projected)

def get_connotation_percentage_feature(document_side, matrix_connotation, vectorizer):
    '''
    Connotation features expressed as a share of each row's total.

    For each side the projected counts are divided by their row sum; entries
    that come out as NaN or -inf (e.g. a row summing to zero) are set to 0.
    The Pro and Con blocks are returned side by side: [Pro | Con].
    '''
    def share_of_row_total(documents):
        scores = vectorizer.transform(documents) @ matrix_connotation
        row_totals = np.reshape(np.sum(scores, axis=1), (-1, 1))
        shares = np.divide(scores, row_totals)
        shares[np.isneginf(shares) | np.isnan(shares)] = 0
        return shares

    pro_share = share_of_row_total(document_side[:, 0])
    con_share = share_of_row_total(document_side[:, 1])
    return np.hstack([pro_share, con_share])

def get_connotation_ln_feature(document_side, matrix_connotation, vectorizer):
    '''
    Connotation features on a natural-log scale.

    For each side the projected counts are passed through `np.log`; entries
    that come out as -inf or NaN (log of 0 or of a negative value) are set
    to 0, so a count of 0 and a count of 1 both map to 0.  The Pro and Con
    blocks are returned side by side: [Pro | Con].
    '''
    def log_scores(documents):
        scores = vectorizer.transform(documents) @ matrix_connotation
        # log(0) is expected and replaced below; silence the numpy warnings.
        with np.errstate(divide="ignore", invalid="ignore"):
            logged = np.log(scores)
        logged[np.isneginf(logged)] = 0
        logged[np.isnan(logged)] = 0
        return logged

    feature_ln_pro = log_scores(document_side[:, 0])
    # Column 1 holds the Con text.  The original read column 0 again, so the
    # "Con" half of the output was a copy of the Pro half.
    feature_ln_con = log_scores(document_side[:, 1])

    return np.hstack([feature_ln_pro, feature_ln_con])


def get_vad_feature(document_side, matrix_vad, vectorizer):
    '''
    Project each side's n-gram counts onto the VAD matrix and place the Pro
    and Con results side by side: [Pro | Con].
    '''
    grams = [vectorizer.transform(document_side[:, column]) for column in (0, 1)]
    projected = [gram @ matrix_vad for gram in grams]
    return np.hstack(projected)

def get_vad_percentage_feature(document_side, matrix_vad, vectorizer):
    '''
    VAD features expressed as a share of each row's total.

    For each side the projected scores are divided by their row sum; entries
    that come out as NaN or -inf (e.g. a row summing to zero) are set to 0.
    The Pro and Con blocks are returned side by side: [Pro | Con].
    '''
    def share_of_row_total(documents):
        scores = vectorizer.transform(documents) @ matrix_vad
        row_totals = np.reshape(np.sum(scores, axis=1), (-1, 1))
        shares = np.divide(scores, row_totals)
        shares[np.isneginf(shares) | np.isnan(shares)] = 0
        return shares

    pro_share = share_of_row_total(document_side[:, 0])
    con_share = share_of_row_total(document_side[:, 1])
    return np.hstack([pro_share, con_share])

def get_vad_ln_feature(document_side, matrix_vad, vectorizer):
    '''
    VAD features on a natural-log scale.

    For each side the projected scores are passed through `np.log`; entries
    that come out as -inf or NaN (log of 0 or of a negative value) are set
    to 0.  The Pro and Con blocks are returned side by side: [Pro | Con].
    '''
    def log_scores(documents):
        scores = vectorizer.transform(documents) @ matrix_vad
        # log(0) is expected and replaced below; silence the numpy warnings.
        with np.errstate(divide="ignore", invalid="ignore"):
            logged = np.log(scores)
        logged[np.isneginf(logged)] = 0
        logged[np.isnan(logged)] = 0
        return logged

    feature_ln_pro = log_scores(document_side[:, 0])
    # Column 1 holds the Con text.  The original read column 0 again, so the
    # "Con" half of the output was a copy of the Pro half.
    feature_ln_con = log_scores(document_side[:, 1])

    return np.hstack([feature_ln_pro, feature_ln_con])

def get_winner(df): 
    '''
    Return the "winner" column with "Con" replaced by 0 and "Pro" by 1.
    Any other value is left as it is.
    '''
    label_of_side = {"Con": 0, "Pro": 1}
    winners = df.loc[:, "winner"]
    return winners.replace(label_of_side)

def get_all_feature_label(df, vectorizer):
    '''
    Return (X, y) for a debate frame.

    X: sparse [Pro | Con] n-gram matrix produced by `get_ngram_feature`.
    y: numpy array of winners (Con -> 0, Pro -> 1) from `get_winner`.

    Debate-level features (`get_debate_feature`) are currently not included.
    '''
    # get_ngram_feature slices its first argument as `[:, 0]` / `[:, 1]`, so
    # it needs the per-side text array, not the raw DataFrame that used to
    # be passed here.
    document_side = get_text_by_side(df)
    ngram_feature = get_ngram_feature(document_side, vectorizer)

    # debate_feature = get_debate_feature(df)
    # X = sparse.hstack([debate_feature, ngram_feature])
    X = sparse.hstack([ngram_feature])

    y = np.array(get_winner(df))

    return X, y

# Model 2 - lex feature, debate feature, n-gram feature
This model should use
1. word ngrams
2. lexicon based features: implement lexicon based features for a lexicon of your choice
   1. Connotation lexicon
   2. NRC-VAD lexicon
   3. How you extract features is part of the design decision that you need to make. One simple example of a lexical feature is counting how many words in each debater's language appear in the corresponding lexicon. 

TODO: 
1. Read connotation - 1 file
2. NRC features - 2 files 

In [5]:
# 1. Read connotation - 1 file
# 2. NRC features - 2 files 
CONNOTATION = "./resources/lexica/connotation_lexicon_a.0.1.csv"
NRC_LEXICON_VAD = "./resources/lexica/NRC-VAD-Lexicon-Aug2018Release/NRC-VAD-Lexicon.txt"
NRC_LEXICON_SORTED_VALENCE = "./resources/lexica/NRC-VAD-Lexicon-Aug2018Release/OneFilePerDimension/v-scores.txt"
NRC_LEXICON_SORTED_AROUSAL = "./resources/lexica/NRC-VAD-Lexicon-Aug2018Release/OneFilePerDimension/a-scores.txt"
NRC_LEXICON_SORTED_DOMINANCE = "./resources/lexica/NRC-VAD-Lexicon-Aug2018Release/OneFilePerDimension/d-scores.txt"

# Connotation lexicon: every line is split on "," or "_" into
# word / part of speech / connotation.  A regex separator is only supported
# by the python parser engine; requesting it explicitly avoids the
# ParserWarning pandas emits when it has to fall back on its own.
df_connotation = pd.read_csv(CONNOTATION, sep=",|_", header=None, engine="python")
df_connotation.columns = ["word", "pos", "connotation"] # word, part of speech, connotation
df_connotation = df_connotation.dropna() # There are five words in the connotation that are nan 
df_connotation = df_connotation.set_index("word")
# Drop the part of speech classification because we can't use it now.
# (It used to be cast to 'category' immediately before being dropped.)
df_connotation = df_connotation.drop(columns=["pos"])
df_connotation["connotation"] = df_connotation["connotation"].astype('category')
# One indicator column per connotation value, named "connotation_<value>".
df_connotation = pd.get_dummies(df_connotation)

# NRC-VAD lexicon: tab-separated word / valence / arousal / dominance.
df_nrc_vad = pd.read_csv(NRC_LEXICON_VAD, sep="\t", header=None)
df_nrc_vad.columns = ["word", "valence", "arousal", "dominance"]
df_nrc_vad = df_nrc_vad.dropna()
df_nrc_vad = df_nrc_vad.set_index("word")
# The three scores stay numeric: they are used as weights in the
# `word_vector_vad.T @ df_nrc_vad` matrix product, where a categorical dtype
# has no arithmetic meaning.

  if __name__ == '__main__':


In [6]:
# Build the shared inputs for training and validation: per-side documents,
# fitted vectorizers, lexicon projection matrices and labels.
unigram_vectorizer = CountVectorizer()

# Generate the corpus for the vectorizer to fit on.
# *_side arrays hold [Pro text, Con text] per debate; document_* joins the two
# so that each debate contributes one document.
document_train_side = get_text_by_side(df_train)
document_val_side = get_text_by_side(df_val)
document_train = [side[0] + side[1] for side in document_train_side]
document_val = [side[0] + side[1] for side in document_val_side]

# The vectorizer is fitted on the whole training corpus, regardless of the
# side of the debate.
unigram_vectorizer.fit(document_train)

# A document's lexicon feature vector is computed as ngram @ matrix_connotation.
# Creating the matrix: vectorize every lexicon word, transpose, and multiply
# by the lexicon's indicator columns.
word_connotation = df_connotation.index
word_vector_connotation = unigram_vectorizer.transform(word_connotation)
matrix_connotation = word_vector_connotation.T @ df_connotation
matrix_connotation_no_neutral = word_vector_connotation.T @ df_connotation.drop(columns=["connotation_neutral"])

word_vad = df_nrc_vad.index
word_vector_vad = unigram_vectorizer.transform(word_vad)
matrix_vad = word_vector_vad.T @ df_nrc_vad
# For words with multiple parts of speech, we are counting the total
# sum across all parts of speech of that word for each feature.

# Get labels (Con -> 0, Pro -> 1).
label_train = get_winner(df_train)
label_val = get_winner(df_val)

y_train = np.array(label_train)
y_val = np.array(label_val)

# TF-IDF vectorizer over 1- to 3-grams, fitted on the same training corpus.
trigram_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=0.2, stop_words='english', ngram_range=(1,3))
trigram_vectorizer.fit(document_train)

TfidfVectorizer(max_df=0.8, min_df=0.2, ngram_range=(1, 3),
                stop_words='english', sublinear_tf=True)

In [None]:
# ================== you can run experiments here ======================
# Get all TRAINING features:
# Use the per-side documents (Pro and Con) so that a feature vector is
# formed for both sides of every training debate.

trigram_train = get_ngram_feature(document_side=document_train_side, vectorizer=trigram_vectorizer)
# ============= using raw number counts of the feature ==========================
# feature_connotation_train = get_connotation_feature(document_side=document_train_side,
#                                                         matrix_connotation=matrix_connotation,
#                                                         vectorizer=unigram_vectorizer)
# feature_vad_train = get_vad_feature(document_side=document_train_side,
#                                                         matrix_vad=matrix_vad,
#                                                         vectorizer=unigram_vectorizer)
# feature_train = sparse.hstack([trigram_train, feature_connotation_train, 
#                     feature_vad_train])

# ============= using percentage counts of the feature ==========================
# feature_connotation_pct_train = get_connotation_percentage_feature(document_train_side, matrix_connotation, unigram_vectorizer)
# feature_vad_pct_train = get_vad_percentage_feature(document_train_side, matrix_vad, unigram_vectorizer)
# feature_train = sparse.hstack([trigram_train, feature_connotation_pct_train, 
#                     feature_vad_pct_train])

# ============= using log counts of the feature ==========================
# feature_connotation_ln_train = get_connotation_ln_feature(document_train_side, matrix_connotation, unigram_vectorizer)
# feature_vad_ln_train = get_vad_ln_feature(document_train_side, matrix_vad, unigram_vectorizer)
# feature_train = sparse.hstack([trigram_train, feature_connotation_ln_train, 
#                     feature_vad_ln_train])

# ============= using percentage counts of the feature without neutral connotation ==========================
# Active variant: n-grams + connotation shares (neutral column dropped) + VAD shares.
feature_connotation_pct_train = get_connotation_percentage_feature(document_train_side, matrix_connotation_no_neutral, unigram_vectorizer)
feature_vad_pct_train = get_vad_percentage_feature(document_train_side, matrix_vad, unigram_vectorizer)
feature_train = sparse.hstack([trigram_train, feature_connotation_pct_train, 
                    feature_vad_pct_train])

# Get all VALIDATION features:
trigram_val = get_ngram_feature(document_side=document_val_side, vectorizer=trigram_vectorizer)
# ============= using raw counts of of the feature ==========================
# feature_connotation_val = get_connotation_feature(document_side=document_val_side,
#                                                         matrix_connotation=matrix_connotation,
#                                                         vectorizer=unigram_vectorizer)
# feature_vad_val = get_vad_feature(document_side=document_val_side,
#                                                         matrix_vad=matrix_vad,
#                                                         vectorizer=unigram_vectorizer)
# feature_val = sparse.hstack([trigram_val, feature_connotation_val, 
#                     feature_vad_val])

# ============= using percentage count of of the feature ==========================
# feature_connotation_pct_val = get_connotation_percentage_feature(document_val_side, matrix_connotation, unigram_vectorizer)
# feature_vad_pct_val = get_vad_percentage_feature(document_val_side, matrix_vad, unigram_vectorizer)
# feature_train = sparse.hstack([trigram_train, feature_connotation_pct_train, 
#                     feature_vad_pct_train])
# feature_val = sparse.hstack([trigram_val, feature_connotation_pct_val, 
#                     feature_vad_pct_val])

# ============= using log counts of the feature ==========================
# feature_connotation_ln_val = get_connotation_ln_feature(document_val_side, matrix_connotation, unigram_vectorizer)
# feature_vad_ln_val = get_vad_ln_feature(document_val_side, matrix_vad, unigram_vectorizer)

# feature_val = sparse.hstack([trigram_val, feature_connotation_ln_val, 
#                     feature_vad_ln_val])

# ============= using percentage counts of the feature without neutral connotation ==========================
# Must mirror the variant chosen for the training features above so that the
# training and validation matrices have the same columns.
feature_connotation_pct_val = get_connotation_percentage_feature(document_val_side, matrix_connotation_no_neutral, unigram_vectorizer)
feature_vad_pct_val = get_vad_percentage_feature(document_val_side, matrix_vad, unigram_vectorizer)

feature_val = sparse.hstack([trigram_val, feature_connotation_pct_val, 
                    feature_vad_pct_val])

# Create model: logistic regression with default settings, fitted on the
# training features.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(feature_train, y_train)

In [None]:
# Classification report on the training set first, then on the validation set.
# Labels are 0 (Con) and 1 (Pro), see get_winner.
print(classification_report(y_train, clf.predict(feature_train)))
print(classification_report(y_val, clf.predict(feature_val)))

# Model 1 - Here is the model that only uses debate features and ngram features

In [None]:
# Labels for training and validation (Con -> 0, Pro -> 1)
label_train = get_winner(df_train)
label_val = get_winner(df_val)

# Generate the corpus.
# *_side arrays hold [Pro text, Con text] per debate; document_* joins the two
# so that each debate contributes one document (same convention as Model 2).
document_train_side = get_text_by_side(df_train)
document_val_side = get_text_by_side(df_val)
document_train = [side[0] + side[1] for side in document_train_side]
document_val = [side[0] + side[1] for side in document_val_side]

# Vectorization.
# The vectorizer must be fitted on a flat list of strings; fitting it on the
# 2-D per-side array (as before) hands it array rows instead of documents.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.9, min_df=0.1, stop_words='english', ngram_range=(1,3))
vectorizer.fit(document_train)

# Getting two sets of features - ngram and debate related features.
# get_ngram_feature expects the per-side text array, not the DataFrame.
ngram_feature_train = get_ngram_feature(document_train_side, vectorizer)
ngram_feature_val = get_ngram_feature(document_val_side, vectorizer)

debate_feature_train = get_debate_feature(df_train)
debate_feture_val = get_debate_feature(df_val)

# Combining two sets of features
X_train = sparse.hstack([debate_feature_train, ngram_feature_train])
X_val = sparse.hstack([debate_feture_val, ngram_feature_val])

y_train = np.array(label_train)
y_val = np.array(label_val)

In [None]:
# The row counts of X and y should equal the number of debates in each split.
print('Sanity check')
print(df_train.shape[0], 'number of observations in the training set')
print(X_train.shape, 'number of observation x the size of ngram vectors in the training set')
print(y_train.shape, 'number of labels in the training set')
print(df_val.shape[0], 'number of observations in the validation set')
print(X_val.shape, 'number of observation x the size of ngram vectors in the validation set')
print(y_val.shape, 'number of labels in the validation set')


In [None]:
# Building and training the model (n-gram features only)
clf = sklearn.linear_model.LogisticRegression()
clf.fit(ngram_feature_train, y_train)

# classification_report pairs target_names with the sorted labels.
# get_winner encodes Con as 0 and Pro as 1, so the names must be listed as
# ['Con', 'Pro']; the previous ['Pro', 'Con'] swapped the row captions.
print("Logistic Regression training set report:")
print(classification_report(y_train, clf.predict(ngram_feature_train), target_names=['Con', 'Pro']))
print(classification_report(y_val, clf.predict(ngram_feature_val), target_names=['Con', 'Pro']))

# %%

In [None]:
# Display the validation feature matrix built in the Model 1 cell.
X_val

In [None]:
# Evaluating the model on the validation set
# NOTE(review): `X_val_religion` and `y_val_religion` are not defined in any
# visible cell — confirm where they are built before re-running.
y_predicted = clf.predict(X_val_religion)
print("Logistic Regression testing set report:")
# Labels are 0 (Con) and 1 (Pro), so the names are listed in that order.
print(classification_report(y_val_religion, y_predicted, target_names=['Con', 'Pro']))

print("Accuracy score: ",accuracy_score(y_val_religion, y_predicted))
# Previously this line printed plain accuracy a second time.
print("Balanced accuracy score: ",balanced_accuracy_score(y_val_religion, y_predicted))

# Compare the model's predictions on the evaluated inputs with the true
# labels.  The old call passed `y_predicted` as ground truth together with a
# different feature matrix (`X_val`).
plot_confusion_matrix(clf, X_val_religion, y_val_religion)

In [None]:
# Tuning ngram models over max_df and min_df
def search_max_df_min_df(df_train, df_val):
    '''
    Grid-search TfidfVectorizer `min_df` / `max_df` (steps of 0.1, with
    max_df > min_df) and score each setting by validation accuracy of a
    default LogisticRegression on the [Pro | Con] n-gram features.

    Prints one line per setting plus the best setting at the end.

    Returns a dict mapping (min_df, max_df) — rounded to one decimal — to the
    `classification_report(..., output_dict=True)` of that setting.
    Settings for which the vectorizer cannot be fitted are skipped.
    '''
    # Everything that does not depend on the grid point is computed once.
    document_train_side = get_text_by_side(df_train)
    document_val_side = get_text_by_side(df_val)
    # The vectorizer needs a flat list of documents (one per debate), not the
    # 2-D per-side array.
    document_train = [side[0] + side[1] for side in document_train_side]
    y_train = np.array(get_winner(df_train))
    y_val = np.array(get_winner(df_val))

    highest_acc, best_min_df, best_max_df = 0, -1, -1
    report = {}
    for raw_min_df in np.arange(0, 1, 0.1):
        for diff in np.arange(0.1, 1 - raw_min_df, 0.1):
            # Round away float noise such as 0.30000000000000004.
            min_df = round(float(raw_min_df), 1)
            max_df = round(float(raw_min_df + diff), 1)

            vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=max_df, min_df=min_df, stop_words='english', ngram_range=(1,3))
            try:
                vectorizer.fit(document_train)
            except ValueError as error:
                # e.g. no terms left after pruning; keep searching.
                print("max_df: {}, min_df: {}, skipped: {}".format(max_df, min_df, error))
                continue

            X_train = get_ngram_feature(document_train_side, vectorizer)
            X_val = get_ngram_feature(document_val_side, vectorizer)

            clf = sklearn.linear_model.LogisticRegression()
            clf.fit(X_train, y_train)
            
            print("====================================")

            y_predicted = clf.predict(X_val)
            print("Logistic Regression testing set report:")
            # Labels are 0 (Con) and 1 (Pro), so the names go in that order.
            report[(min_df, max_df)] = classification_report(y_val, y_predicted, target_names=['Con', 'Pro'], output_dict=True)
            acc = accuracy_score(y_val, y_predicted)

            print("max_df: {}, min_df: {}, accuracy: {}".format(max_df, min_df, acc))

            if acc > highest_acc:
                highest_acc, best_min_df, best_max_df = acc, min_df, max_df

    print("************ best min_df, best max_df, acc", best_min_df, best_max_df, highest_acc)
    return report



In [None]:
import pprint

# Pretty-printer instance (4-space indent) for inspecting nested structures.
pp = pprint.PrettyPrinter(indent=4)

# Placeholders for the best configuration.
max_acc = 0
best_min_df = -1
best_max_df = -1

# Second name for the search report.
gram3_report = report

# Dump every entry of the report: separator line, key, then value, one per line.
for key, val in report.items():
    print("====================", key, val, sep="\n")

# Observed result: best (min_df, max_df) = (0.2, 0.8), validation accuracy 0.76





One way of achieving this is to create two n-gram models: one outputs features for
religious topics and the other outputs features for non-religious topics.
By limiting each corpus to its own topic, the TF-IDF scores may better reflect the
proper weighting. For example, words that appear only in winning religious debates
but also appear in all other losing debates may now receive a significantly different score from
words that appear only in losing religious debates but appear in all other winning debates.
Previously, these two sets of words would have had similar TF-IDF scores and would not have helped
predict winning debates, because their predictive power within religious topics was
diluted by the non-religious topics. By limiting the corpus scope, these
words can become helpful in both religious and non-religious debates.

TODO:
1. Define one TfidfVectorizer for religious topics and one for non-religious topics
2. Train each vectorizer on its respective subset
3. Depending on the topic of the new data, use the two models conditionally

In [None]:
# Partition the data sets into "Religion" debates and everything else.
# One boolean mask per split; the complement of the mask selects the other topics.
is_religion_train = df_train.category == "Religion"
is_religion_val = df_val.category == "Religion"

df_train_religion = df_train.loc[is_religion_train, :]
df_train_other = df_train.loc[~is_religion_train, :]
df_val_religion = df_val.loc[is_religion_val, :]
df_val_other = df_val.loc[~is_religion_val, :]

In [None]:
# Shapes of the Religion / other partitions next to the full frames.
# The partition cell names the Religion subsets df_train_religion / df_val_religion,
# so use those names here.
print("Sanity check")
print(df_train_religion.shape)
print(df_train_other.shape)
print(df_train.shape)
print("validation set")
print(df_val_religion.shape)
print(df_val_other.shape)
print(df_val.shape)

In [None]:
# Shapes of the Religion-only feature matrices, one per line.
# NOTE(review): in this part of the notebook X_train_religion / X_val_religion are first
# assigned in a later cell - confirm this cell is meant to run after that one.
print(X_train_religion.shape, X_val_religion.shape, sep="\n")

In [None]:
# Run the min_df / max_df search separately for each topic partition and keep the
# returned per-combination reports; each search prints its own progress and best setting.
report_religion = search_max_df_min_df(df_train_religion, df_val_religion)
report_other = search_max_df_min_df(df_train_other, df_val_other)

In [None]:
# Set up the vectorizer
# Religion debates: the vocabulary is fitted on the Religion training subset only.
vectorizer_religion = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=0, stop_words='english', ngram_range=(1,3))
document_train_religion = get_text_by_side(df_train_religion)
vectorizer_religion.fit(document_train_religion)
X_train_religion, y_train_religion = get_all_feature_label(df_train_religion, vectorizer_religion)
X_val_religion, y_val_religion = get_all_feature_label(df_val_religion, vectorizer_religion)
# search_max_df_min_df takes the raw (train, val) data frames and builds its own
# vectorizers and features, so pass the frames rather than the feature matrices / labels.
report_religion = search_max_df_min_df(df_train_religion, df_val_religion)

# All other debates: same setup, fitted on the non-Religion training subset.
vectorizer_other = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=0, stop_words='english', ngram_range=(1,3))
document_train_other = get_text_by_side(df_train_other)
vectorizer_other.fit(document_train_other)
X_train_other, y_train_other = get_all_feature_label(df_train_other, vectorizer_other)
X_val_other, y_val_other = get_all_feature_label(df_val_other, vectorizer_other)
report_other = search_max_df_min_df(df_train_other, df_val_other)