-
Notifications
You must be signed in to change notification settings - Fork 1
/
chat_classification.py
127 lines (100 loc) · 4.86 KB
/
chat_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import streamlit as st
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from NBmodel import get_prepared_csvs, get_trump_biden_data, get_train_data, text_process
import pandas_read_xml as pdx
from cam_chat_classification import add_features, construct_bag_of_words, \
get_batch_features, featurize_msg, prepare_msg, \
prepare_one_msg
from PIL import Image
from sklearn.linear_model import Perceptron
from sklearn import metrics
from sklearn.model_selection import train_test_split
from collections import OrderedDict
from sklearn.metrics import confusion_matrix
st.set_option('deprecation.showfileUploaderEncoding', False)
st.title("Machines Learning Speech Patterns")
st.header("By Cameron Abbot and Abraham Holleran")
st.write("Everyone has idiosyncrasies in their speech. A computer can quantify these and learn our speech patterns. \
In this app, we use a Naive Bayes classifier to train the computer to predict who is more likely to say any given sentence. \
You can run this on Android text data with you and a friend. Simply download the \"SMS Backup & Restore\" app and\
download the messages as an xml file.")
st.write("Please input either your system's path to the xml file or the file's Google Drive ID. \
For Google Drive, upload the xml file, turn link sharing on and paste in the ID from the link: drive.google.com/file/d/ID_is_found_here/view?usp=sharing")
xml_path = st.text_input("System path or Google Drive ID", "")
def tb_main():
model = MultinomialNB()
tb_im = Image.open('Trump-or-Biden.jpg')
st.image(tb_im, caption='Commander in Chief', use_column_width=True)
st.header("Classification of speech from the 2016 Presidential Debates")
st.write("Type a message and see if it's more likely to be said by Trump or Biden.")
user_tb_text = st.text_input("Text to classify goes here:", "Will you shut up, man")
#Classify as Trump/Biden by removing all the welker, wallace stuff.
first_debate, second_debate = get_prepared_csvs()
trump_biden_data = get_trump_biden_data(second_debate, first_debate)
X, y = get_train_data(trump_biden_data)
transformer = CountVectorizer(analyzer=text_process).fit(X)
all_train = transformer.transform(X)
model = model.fit(all_train, y)
user_tb_text = pd.Series(user_tb_text)
user_transformed = transformer.transform(user_tb_text)
pred_speaker = model.predict(user_transformed)
if pred_speaker[0] == 0:
pred_speaker = "Biden"
elif pred_speaker[0] == 1:
pred_speaker = "Trump"
st.write(f"With >80% accuracy on the test data, we predict that the message was said by {pred_speaker}.")
# Prevents running errors before user has entered data file URL
if(xml_path == ""):
st.write("No data entered.")
tb_main()
st.stop()
try:
df_sms = pdx.read_xml("https://drive.google.com/uc?id=" + xml_path, encoding="utf8")
name, df = add_features(df_sms)
except:
df_sms = pdx.read_xml(xml_path, encoding="utf8")
name, df = add_features(df_sms)
df['person'] = df['person'].astype(int)
first = df['person'][0]
countt = [0,0]
who_spoke = [[],[]]
for i in range(len(df['person'])):
if df['person'][i] == first:
countt[0] += 1
who_spoke[0].append(i)
else:
countt[1] += 1
who_spoke[1].append(i)
size_limit = min(countt)
df['author'] = np.where(df['person'] == 2, 'me', name)
df['body'] = df['body'].astype(str)
df['body'] = df['body'].str.lower()
who_spoke = [who_spoke[0][:size_limit], who_spoke[1][:size_limit]]
loca = who_spoke[0] + who_spoke[1]
more_data = df.iloc[loca]
bag_of_words = construct_bag_of_words(more_data['body'])
batch_features = get_batch_features(more_data['body'], bag_of_words)
X_train, X_test, y_train, y_test = \
train_test_split(batch_features, more_data['person'], test_size=0.2, random_state=42)
clf = Perceptron(n_iter_no_change=50)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
st.write(f"After training, we have an accuracy report. In the following confusion matrix, \
{name} is 0 and you are 1. The columns are what's predicted, and the rows are what's true.\
\n", confusion_matrix(y_test.tolist(), pred), '\n')
user_text = st.text_input("Text to classify goes here:", "I like math, \
but fractions is where I draw the line.")
test_features = prepare_one_msg(user_text, bag_of_words)
potential_author = clf.predict(test_features)
if potential_author[0] == 1:
potential_author = name
else:
potential_author = 'you'
st.write(f"Our accuracy on the test data was {round(score, 2)}%. \
The computer predicts that your message was said by {potential_author}.")
tb_main()