# Imports

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
import pandas as pd

In [4]:
import re

In [5]:
import numpy as np

In [6]:
from keras.utils.np_utils import to_categorical

ModuleNotFoundError: No module named 'keras'

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, accuracy_score, plot_confusion_matrix

# Data Reading

In [7]:
df = pd.read_csv('us_election_2020_1st_presidential_debate.csv').drop('minute', axis=1)

# Test processing & Cleaning

In [48]:
def read_clean(x):
    x = x.lower()
    x = x.replace("'",'').replace('"', '')
    x = re.sub(r'[^\w]', ' ', x)
    x = re.sub('\s+', ' ', x)
    return x

In [49]:
df['text'] = df['text'].apply(read_clean)

# Target Counts

In [50]:
df['speaker'] = df['speaker'].apply(lambda x: x.replace(':', ''))

In [51]:
df['speaker'].value_counts()

President Donald J. Trump    314
Vice President Joe Biden     249
Chris Wallace                226
Name: speaker, dtype: int64

# Down sample to have same value counts

In [110]:
cleaned_dfs = []

for val in df['speaker'].unique():
    cleaned_dfs.append(df[df['speaker']==val].sample(226))

sampled_df = pd.concat(cleaned_dfs).reset_index(drop=True)

In [111]:
sampled_df['speaker'].value_counts()

Chris Wallace                226
President Donald J. Trump    226
Vice President Joe Biden     226
Name: speaker, dtype: int64

In [112]:
unique_speakers = list(sampled_df['speaker'].unique())

In [172]:
y = to_categorical(sampled_df['speaker'].apply(lambda x : unique_speakers.index(x)))

# Train Test Split

In [173]:
train_x, test_x, train_y, test_y = train_test_split(sampled_df['text'], y, test_size=0.1)

# Feature Engineering

In [174]:
tf_model = TfidfVectorizer(min_df=0.01).fit(train_x)

In [175]:
train_x = tf_model.transform(train_x).toarray()
test_x = tf_model.transform(test_x).toarray()

# Model Creation

In [176]:
train_x.shape

(610, 257)

In [185]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [186]:
model = Sequential()
model.add(Dense(20, input_dim=train_x.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(20, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [187]:
model.fit(train_x, train_y, batch_size=1, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a16a86f988>

In [194]:
preds = model.predict(test_x).argmax(axis=1)

In [195]:
preds

array([2, 1, 0, 2, 0, 0, 0, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 0, 1,
       2, 0, 0, 2, 1, 0, 2, 0, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2, 2, 1,
       2, 0, 0, 2, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 2, 1, 2, 2, 0, 1, 1, 1,
       1, 1], dtype=int64)

# Test Scores

In [208]:
accuracy_score(test_y.argmax(axis=1), preds)

0.6911764705882353

In [209]:
f1_score(test_y.argmax(axis=1), preds, average='micro')

0.6911764705882353

In [214]:
cm = pd.DataFrame(confusion_matrix(test_y.argmax(axis=1), preds), columns = unique_speakers, index= unique_speakers)
cm.columns.name = "Predicted"
cm.index.name = "Actual"

In [215]:
cm

Predicted,Chris Wallace,Vice President Joe Biden,President Donald J. Trump
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chris Wallace,17,2,2
Vice President Joe Biden,1,17,5
President Donald J. Trump,1,10,13
