In [14]:
#%% 
import collections
from numpy.lib.function_base import vectorize
import pandas as pd 
import numpy as np
import json
from scipy import sparse
import sklearn.metrics
import sklearn.neighbors
import sklearn.linear_model
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# load example data
data_train = fetch_20newsgroups(subset='train', shuffle=True)
# this is just a list of strings
data_test = fetch_20newsgroups(subset='test', shuffle=True)

In [4]:
# %%
def get_jsonl(path):

    with open(path) as json_file:
        json_list = list(json_file)

    data_list = []
    for json_str in json_list:
        data_list.append(json.loads(json_str))

    return pd.DataFrame(data_list)

In [5]:
# %%
# loading user  data 
USER_DATA = './resources/data/users.json'
df_user = pd.read_json(USER_DATA, orient="index")

# loading training data .jsonl
TRAINING_DATA = './resources/data/train.jsonl'
VAL_DATA = './resources/data/val.jsonl'

df_train, df_val = get_jsonl(TRAINING_DATA), get_jsonl(VAL_DATA)

In [6]:
df_train.shape[0]

1592

In [7]:
s = set()
for v in df_train['voters']:
    s = s.union(v)
print(len(s))

2927


In [8]:
# %%
print(df_train.columns)
# df_train
# %%
# Explore the structure of rounds
one_round = df_train.loc[0, "rounds"] # this is a list of list of dictionary
two_sides = one_round[1] # this is a list consists of two sides speaking

# import json
# json.dumps(one_round, indent=4)

# one_round


Index(['id', 'category', 'title', 'rounds', 'date', 'pro_debater',
       'con_debater', 'voters', 'winner'],
      dtype='object')


In [9]:
def get_texts(df):
    '''
    Return a list of statements in df without differentiating the side of the speaker
    '''

    texts = []
    for round in df.loc[:, 'rounds']:
        for sub_round in round:
            for speech in sub_round:
                texts.append(speech['text'])

    return texts

def get_text_by_side(df): 
    '''
    Return a list of documents where each document contains all text on one side in a 
    single debate
    
    text = [[Pro statement 1, Pro statement 2, ... Pro statement n],
            [Con statement 1, Con statement 2, ... Con statement m]]
            where n, m is the total number of statements from Pro and Con side across
            all debates

    size: [n x 2 x # statements in each debate]
    '''

    text = []
    for round in df.loc[:, 'rounds']:
        round_text = collections.defaultdict(list)

        for sub_round in round:
            for speech in sub_round: 
                round_text[speech['side']].append(speech['text'])

        for speech in round_text.values():
            text.append("".join(speech))

    return text

def get_ngram_feature(df, vectorizer: TfidfVectorizer): 
    '''
    Return the ngram features associated with a single debate

    For pro side, each document is defined as a string that contains all the statements 
    from the pro side in a single debate (across different subrounds). Con side is 
    similarly defined. 

    return [[Pro side n gram vector, Con side n gram vector for 1 debate],
            [Pro side n gram vector, Con side n gram vector for 2 debate],
            ...]

            size: [n, 2 x ngram count]
    
    Pro side and con side n gram vector are concatenated.
    '''

    pro_text, con_text = [], []

    for round in df.loc[:, 'rounds']: 
        round_text_list = collections.defaultdict(list)
        round_feature = []

        for sub_round in round:
            for speech in sub_round: 
                round_text_list[speech['side']].append(speech['text'])

        round_text = {}
        for side, speech in round_text_list.items():
            one_side_text = "".join(speech)
            round_text[side] = one_side_text
            
        pro_text.append(round_text['Pro'])
        con_text.append(round_text['Con'])

    pro_feature = vectorizer.transform(pro_text)
    con_feature = vectorizer.transform(con_text)
    print("pro , con shape are", pro_feature.shape, con_feature.shape)
    return sparse.hstack([pro_feature, con_feature])   

def get_debate_feature(df):
    '''
    Return the debate feature such as category, pro_debator user name, etc

    feature: [n, # of features] 
    '''
    feature_name = ['category']
    feature = []

    for name in feature_name: 
        # TODO: check for data type of the column. If non-numeric, then do this
        # otherwise, use the numerical data
        encoding, unique_feature_val = pd.factorize(df[name])
        feature.append(encoding)

    return np.reshape(np.array(feature), [-1, len(feature_name)])

def get_winner(df): 
    '''
    Cons gets mapped to 0 and pro gets mapped to 1
    '''
    return df.loc[:, "winner"].replace({"Con": 0, "Pro": 1})

def get_all_feature_label(df, vectorizer):
    '''
    Return the training input and validation input that contains all features, 
    which are ngram features and debate features
    '''
    
    # Getting two sets of features - ngram and debate related features
    ngram_feature = get_ngram_feature(df, vectorizer)

    # debate_feature = get_debate_feature(df)

    # Combining two sets of features
    # X = sparse.hstack([debate_feature, ngram_feature])
    X = sparse.hstack([ngram_feature])

    y = np.array(get_winner(df))

    return X, y


# Model 2 - lex feature, debate feature, n-gram feature
This model should use
1. word ngrams
2. lexicon based features: implement lexicon based features for a lexicon of your choice
   1. Connotation lexicon
   2. NRC-VAD lexicon
   3. How you extract features is part of the desgin decision that you need to make. One simple example for lexical features could be counting how many words in each debaters language appear in the corresponding lexicon. 

TODO: 
1. Collect the connotation score documents by documents  

In [40]:
# 1. Read connotation - 1 file
# 2. NRC features - 2 files 
CONNOTATION = "./resources/lexica/connotation_lexicon_a.0.1.csv"
NRC_LEXICON_VAD = "./resources/lexica/NRC-VAD-Lexicon-Aug2018Release/NRC-VAD-Lexicon.txt"
NRC_LEXICON_SORTED_VALENCE = "./resources/lexica/NRC-VAD-Lexicon-Aug2018Release/OneFilePerDimension/v-scores.txt"
NRC_LEXICON_SORTED_AROUSAL = "./resources/lexica/NRC-VAD-Lexicon-Aug2018Release/OneFilePerDimension/a-scores.txt"
NRC_LEXICON_SORTED_DOMINANCE = "./resources/lexica/NRC-VAD-Lexicon-Aug2018Release/OneFilePerDimension/d-scores.txt"

df_connotation = pd.read_csv(CONNOTATION, sep=",|_", header=None)
df_connotation.columns = ["word", "pos", "connotation"] # word, part of speech, connotation
df_connotation = df_connotation.dropna()
df_connotation = df_connotation.set_index("word")
df_nrc_vad = pd.read_csv(NRC_LEXICON_VAD, sep="	", header=None)
df_nrc_vad.columns = ["word", "valence", "arousal", "dominance"]
df_nrc_vad = df_nrc_vad.set_index("word")



  if __name__ == '__main__':


In [18]:
# Extracting texts from training and testing data
label_train = get_winner(df_train)
label_val = get_winner(df_val)

# Generate the corpus 
document_train = get_text_by_side(df_train)
document_test = get_text_by_side(df_val)

# Get the feature scores
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
unigram_train = unigram_vectorizer.fit_transform(document_train)
print(unigram_train.shape)





(3185, 57063)


In [42]:
# Construct the connotation matrix where each column list the valence / a / d for each word
connotation_matrix = np.zeros(shape=[unigram_train.shape[1], 2])
# print(df_connotation.iloc[, :])
connotation_words = df_connotation.index
unigram_vectorizer.transform(connotation_words)


False


<93869x57063 sparse matrix of type '<class 'numpy.int64'>'
	with 41660 stored elements in Compressed Sparse Row format>

# Model 1 - Here is the model that only uses debate features and ngram features

In [84]:
# Extracting texts from training and testing data
label_train = get_winner(df_train)
label_val = get_winner(df_val)

# Generate the corpus 
document_train = get_text_by_side(df_train)
document_test = get_text_by_side(df_val)

# Vectorization
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.9, min_df=0.1, stop_words='english', ngram_range=(1,3))
vectorizer.fit(document_train)

# Getting two sets of features - ngram and debate related features
ngram_feature_train = get_ngram_feature(df_train, vectorizer)
ngram_feature_val = get_ngram_feature(df_val, vectorizer)

debate_feature_train = get_debate_feature(df_train)
debate_feture_val = get_debate_feature(df_val)

# Combining two sets of features
X_train = sparse.hstack([debate_feature_train, ngram_feature_train])
X_val = sparse.hstack([debate_feture_val, ngram_feature_val])

y_train = np.array(label_train)
y_val = np.array(label_val)

pro , con shape are (1592, 606) (1592, 606)
pro , con shape are (399, 606) (399, 606)


In [86]:
print('Sanity check')
print(df_train.shape[0], 'number of observations in the training set')
print(X_train.shape, 'number of observation x the size of ngram vectors in the training set')
print(y_train.shape, 'number of labels in the training set')
print(df_val.shape[0], 'number of observations in the validation set')
print(X_val.shape, 'number of observation x the size of ngram vectors in the validation set')
print(y_val.shape, 'number of labels in the validation set')


Sanity check
1592 number of observations in the training set
(1592, 1213) number of observation x the size of ngram vectors in the training set
(1592,) number of labels in the training set
399 number of observations in the validation set
(399, 1213) number of observation x the size of ngram vectors in the validation set
(399,) number of labels in the validation set


In [94]:
# Building and training the model
clf = sklearn.linear_model.LogisticRegression()
clf.fit(ngram_feature_train, y_train)

print("Logistic Regression training set report:")
print(classification_report(y_train, clf.predict(ngram_feature_train), target_names=['Pro', 'Con']))
print(classification_report(y_val, clf.predict(ngram_feature_val), target_names=['Pro', 'Con']))

# %%

Logistic Regression training set report:
              precision    recall  f1-score   support

         Pro       0.88      0.94      0.91       916
         Con       0.91      0.83      0.87       676

    accuracy                           0.89      1592
   macro avg       0.90      0.88      0.89      1592
weighted avg       0.89      0.89      0.89      1592

              precision    recall  f1-score   support

         Pro       0.72      0.86      0.78       211
         Con       0.80      0.62      0.70       188

    accuracy                           0.75       399
   macro avg       0.76      0.74      0.74       399
weighted avg       0.76      0.75      0.74       399



In [89]:
X_val

<399x1213 sparse matrix of type '<class 'numpy.float64'>'
	with 96112 stored elements in COOrdinate format>

In [91]:
# Evaluating the model on the validation set
y_predicted = clf.predict(X_val_religion)
print("Logistic Regression testing set report:")
print(classification_report(y_val_religion, y_predicted, target_names=['Pro', 'Con']))

print("Accuracy score: ",accuracy_score(y_val_religion, y_predicted))
print("Balanced accuracy score: ",accuracy_score(y_val_religion, y_predicted))

plot_confusion_matrix(clf, X_val, y_predicted)

ValueError: X has 1682716 features per sample; expecting 1213

In [82]:
# Tuning ngram models over max_df and min_df
def search_max_df_min_df(df_train, df_val):
    highest_acc, best_min_df, best_max_df = 0, -1, -1
    report = {}
    for min_df in np.arange(0, 1, 0.1):
        for diff in np.arange(0.1, 1 - min_df, 0.1):
            max_df = min_df + diff

            vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=max_df, min_df=min_df, stop_words='english', ngram_range=(1,3))
            document_train = get_text_by_side(df_train)
            vectorizer.fit(document_train)
            X_train, y_train = get_all_feature_label(df_train, vectorizer)
            X_val, y_val = get_all_feature_label(df_val, vectorizer)

            clf = sklearn.linear_model.LogisticRegression()
            clf.fit(X_train, y_train)
            
            print("====================================")

            y_predicted = clf.predict(X_val)
            print("Logistic Regression testing set report:")
            report[(min_df, max_df)] = classification_report(y_val, y_predicted, target_names=['Pro', 'Con'], output_dict=True)
            acc = accuracy_score(y_val, y_predicted)

            print("max_df: {}, min_df: {}, accuracy: {}".format(max_df, min_df, acc))

            if acc > highest_acc:
                highest_acc, best_min_df, best_max_df = acc, min_df, max_df

    print("************ best min_df, best max_df, acc", best_min_df, best_max_df, highest_acc)
    return report



In [38]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
max_acc, best_min_df, best_max_df = 0, -1, -1
gram3_report = report

for key, val in report.items():
    print("====================")
    print(key)
    print(val)

# The best min df and the best max df are (0.2, 0.8) with validation accuracy of 0.76





(0.0, 0.1)
              precision    recall  f1-score   support

         Pro       0.54      1.00      0.70       211
         Con       1.00      0.06      0.11       188

    accuracy                           0.56       399
   macro avg       0.77      0.53      0.41       399
weighted avg       0.76      0.56      0.42       399

(0.0, 0.2)
              precision    recall  f1-score   support

         Pro       0.57      1.00      0.72       211
         Con       1.00      0.14      0.25       188

    accuracy                           0.60       399
   macro avg       0.78      0.57      0.49       399
weighted avg       0.77      0.60      0.50       399

(0.0, 0.30000000000000004)
              precision    recall  f1-score   support

         Pro       0.57      1.00      0.73       211
         Con       1.00      0.15      0.26       188

    accuracy                           0.60       399
   macro avg       0.78      0.57      0.49       399
weighted avg       0.77  

One way of achieving this is to create two n-gram models. One n-gram model outputs features
for religious topics and another n-gram model outputs features for non-religious topics.
By limiting the corpus within their topics, the Tf_idf scores may better reflect the 
proper weighting. For example, certain words that might only appear in winning relgious debates
but also appear in all other losing debates may now have a significantly different score from 
words that appear in only losing religous debates but appear in all other winning debates. 
Previously, these two sets of words would have similar tf_idf score but are not helpful 
towards predicting winning debates because their prediciton power within relgious topic is
diluted by the non-religous topics. By limiting the corpus scope, we can see that these 
words become helpful in both religous and non-relgious debates.

TODO:
1. Define a Tfidfvectorizer for both religous and non-religious topics
2. Train the vectorizer using their respective subsets
3. Depending the topic of the new data, we should use the two models conditionally

In [63]:
# Partition the data sets
df_train_religion = df_train.loc[df_train.category == "Religion" ,:]
df_train_other = df_train.loc[df_train.category != "Religion" ,:]
df_val_religion = df_val.loc[df_val.category == "Religion" ,:]
df_val_other = df_val.loc[df_val.category != "Religion" ,:]

In [64]:
print("Sanity check")
print(df_train_religious.shape)
print(df_train_other.shape)
print(df_train.shape)
print("validation set")
print(df_val_religious.shape)
print(df_val_other.shape)
print(df_val.shape)

Sanity check
(370, 9)
(1222, 9)
(1592, 9)
validation set
(93, 9)
(306, 9)
(399, 9)


In [69]:
print(X_train_religion.shape)
print(X_val_religion.shape)

(370, 607)
(93, 607)


In [83]:
search_max_df_min_df(df_train_religion, df_val_religion)
search_max_df_min_df(df_train_other, df_val_other)

pro , con shape are (370, 840554) (370, 840554)
pro , con shape are (93, 840554) (93, 840554)
Logistic Regression testing set report:
max_df: 0.1, min_df: 0.0, accuracy: 0.4946236559139785
pro , con shape are (370, 841057) (370, 841057)
pro , con shape are (93, 841057) (93, 841057)
Logistic Regression testing set report:
max_df: 0.2, min_df: 0.0, accuracy: 0.4946236559139785
pro , con shape are (370, 841222) (370, 841222)
pro , con shape are (93, 841222) (93, 841222)
Logistic Regression testing set report:
max_df: 0.30000000000000004, min_df: 0.0, accuracy: 0.4946236559139785
pro , con shape are (370, 841289) (370, 841289)
pro , con shape are (93, 841289) (93, 841289)
Logistic Regression testing set report:
max_df: 0.4, min_df: 0.0, accuracy: 0.4946236559139785
pro , con shape are (370, 841325) (370, 841325)
pro , con shape are (93, 841325) (93, 841325)
Logistic Regression testing set report:
max_df: 0.5, min_df: 0.0, accuracy: 0.4946236559139785
pro , con shape are (370, 841343) (370,

ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df.

In [81]:
# Set up the vectorizer
vectorizer_religion = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=0, stop_words='english', ngram_range=(1,3))
document_train_religion = get_text_by_side(df_train_religion)
vectorizer_religion.fit(document_train_religion)
X_train_religion, y_train_religion = get_all_feature_label(df_train_religion, vectorizer_religion)
X_val_religion, y_val_religion = get_all_feature_label(df_val_religion, vectorizer_religion)
report_religion = search_max_df_min_df(X_train_religion, y_train_religion, X_val_religion, y_val_religion)

vectorizer_other = TfidfVectorizer(sublinear_tf=True, max_df=0.8, min_df=0, stop_words='english', ngram_range=(1,3))
document_train_other = get_text_by_side(df_train_other)
vectorizer_other.fit(document_train_other)
X_train_other, y_train_other = get_all_feature_label(df_train_other, vectorizer_other)
X_val_other, y_val_other = get_all_feature_label(df_val_other, vectorizer_other)
report_other = search_max_df_min_df(X_train_other, y_train_other, X_val_other, y_val_other)

pro , con shape are (370, 841358) (370, 841358)
pro , con shape are (93, 841358) (93, 841358)
Logistic Regression testing set report:
max_df: 0.1, min_df: 0.0, accuracy: 0.4946236559139785
Logistic Regression testing set report:
max_df: 0.2, min_df: 0.0, accuracy: 0.4946236559139785
Logistic Regression testing set report:
max_df: 0.30000000000000004, min_df: 0.0, accuracy: 0.4946236559139785
Logistic Regression testing set report:
max_df: 0.4, min_df: 0.0, accuracy: 0.4946236559139785
Logistic Regression testing set report:
max_df: 0.5, min_df: 0.0, accuracy: 0.4946236559139785
Logistic Regression testing set report:
max_df: 0.6, min_df: 0.0, accuracy: 0.4946236559139785
Logistic Regression testing set report:
max_df: 0.7000000000000001, min_df: 0.0, accuracy: 0.4946236559139785
Logistic Regression testing set report:
max_df: 0.8, min_df: 0.0, accuracy: 0.4946236559139785
Logistic Regression testing set report:
max_df: 0.9, min_df: 0.0, accuracy: 0.4946236559139785
Logistic Regression 

KeyboardInterrupt: 