# Sentiment Analysis Model - Threat Detector
## Python 401d15 - 01/22/2021
### By : Hexx King, Lee Thomas, Taylor Johnson and Ryan Pilon

## TRIGGER WARNING! Offensive language and hate speech is visible below.

In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Import the training data and inspect it

In [18]:
# In the `read_csv` function, we have passed a parameter for *encoding*, because our data set contains non-english words that's not supported by the default pandas `read_csv` function. 

dataset = pd.read_csv('./FinalBalancedDataset.csv', encoding='ISO-8859-1')

# Pulling out only the columns we want in the dataset

dt_transformed = dataset[['Toxicity', 'tweet']]
dt_transformed.head()


Unnamed: 0,Toxicity,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


# Cleaning the labeled data

In [4]:
#remove user names by pulling all the characters inbetween "@" and ":"
#removes hashtags and their text
#removes text starting with http
#removes the "RT"

import re

def remove_RT_user(text):
    tweet = re.sub("@[^\s]+", "", text)
    hashtag = re.sub("#[\w|\d]+", "", tweet)
    remove_http = re.sub("(https?[a-zA-Z0-9]+)|(http?[a-zA-Z0-9]+)", "", hashtag)
    no_rt = re.sub("RT", "", remove_http)
    return no_rt

dt_transformed['tweet_wo_RT_username'] = dt_transformed['tweet'].apply(lambda x: remove_RT_user(x))
dt_transformed.head()

Unnamed: 0,Toxicity,tweet,tweet_wo_RT_username
0,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i can't use cause they do...
2,0,bihday your majesty,bihday your majesty
3,0,#model i love u take with u all the time in ...,i love u take with u all the time in urÃ°Â...
4,0,factsguide: society now #motivation,factsguide: society now


In [5]:
# removing punctuation

import string
print(string.punctuation)

def remove_punctuation(text):
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct

dt_transformed['tweet_wo_RT_username_punct'] = dt_transformed['tweet_wo_RT_username'].apply(lambda x: remove_punctuation(x))
dt_transformed.head()


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Unnamed: 0,Toxicity,tweet,tweet_wo_RT_username,tweet_wo_RT_username_punct
0,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...,when a father is dysfunctional and is so sel...
1,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i can't use cause they do...,thanks for credit i cant use cause they don...
2,0,bihday your majesty,bihday your majesty,bihday your majesty
3,0,#model i love u take with u all the time in ...,i love u take with u all the time in urÃ°Â...,i love u take with u all the time in urÃ°Â...
4,0,factsguide: society now #motivation,factsguide: society now,factsguide society now


In [6]:
# Tokenization = splitting strings into words

def tokenize(text):
    split = re.split("\W+", text)
    return split

dt_transformed['tweet_wo_RT_username_punct_split'] = dt_transformed['tweet_wo_RT_username_punct'].apply(lambda x: tokenize(x))
dt_transformed.head()

Unnamed: 0,Toxicity,tweet,tweet_wo_RT_username,tweet_wo_RT_username_punct,tweet_wo_RT_username_punct_split
0,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...,when a father is dysfunctional and is so sel...,"[, when, a, father, is, dysfunctional, and, is..."
1,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i can't use cause they do...,thanks for credit i cant use cause they don...,"[, thanks, for, credit, i, cant, use, cause, t..."
2,0,bihday your majesty,bihday your majesty,bihday your majesty,"[, bihday, your, majesty]"
3,0,#model i love u take with u all the time in ...,i love u take with u all the time in urÃ°Â...,i love u take with u all the time in urÃ°Â...,"[, i, love, u, take, with, u, all, the, time, ..."
4,0,factsguide: society now #motivation,factsguide: society now,factsguide society now,"[, factsguide, society, now, ]"


In [7]:
# Loading in and looking at the stopwords to be removed from the tweets

from nltk.corpus import stopwords
nltk.download('stopwords')

stopWords = set(stopwords.words('english'))
print(stopWords)

{"hasn't", 'be', 'wasn', 'same', 'so', 'doesn', "shan't", "you'll", 'did', 'am', 'through', "haven't", 'having', 'until', 'few', 'at', 'couldn', 'aren', 'out', 'm', 'y', 'then', 'if', 'an', "it's", 's', 'all', "shouldn't", "isn't", 'and', 'hadn', 'doing', 'but', 'her', 'of', "weren't", 'will', 'don', 'won', 're', 'further', 'such', "don't", "didn't", 'in', 'you', 'do', 'does', 'above', 'hasn', 'its', "you're", 'where', 'nor', 'ain', 'ourselves', 'their', 'each', 'more', 'themselves', 'other', 'o', 'what', 'the', 'i', 'up', 'for', "mustn't", 'now', 'should', "should've", 'most', 'about', 'or', 'he', 'with', 'below', "that'll", 'from', 'mustn', 'weren', 'just', 'into', "needn't", "you'd", 'as', 'why', 'both', 'some', "you've", 'yourself', "wasn't", 'yourselves', 'here', 'than', 'his', 'haven', "doesn't", 'before', 'yours', 'over', 'were', 'after', 'only', 'we', 'these', 'was', 'isn', 'down', 'hers', 'too', 'ma', 'this', 've', 'there', 'my', 'can', 'she', 'shan', 'it', 'when', 'because', 

In [8]:
# Removing the stopwords

def remove_stopwords(text):
    text = [word for word in text if word not in stopWords]
    return text

dt_transformed['tweet_wo_RT_username_punct_split_stopwords'] = dt_transformed['tweet_wo_RT_username_punct_split'].apply(lambda x: remove_stopwords(x))
dt_transformed.head()

Unnamed: 0,Toxicity,tweet,tweet_wo_RT_username,tweet_wo_RT_username_punct,tweet_wo_RT_username_punct_split,tweet_wo_RT_username_punct_split_stopwords
0,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...,when a father is dysfunctional and is so sel...,"[, when, a, father, is, dysfunctional, and, is...","[, father, dysfunctional, selfish, drags, kids..."
1,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i can't use cause they do...,thanks for credit i cant use cause they don...,"[, thanks, for, credit, i, cant, use, cause, t...","[, thanks, credit, cant, use, cause, dont, off..."
2,0,bihday your majesty,bihday your majesty,bihday your majesty,"[, bihday, your, majesty]","[, bihday, majesty]"
3,0,#model i love u take with u all the time in ...,i love u take with u all the time in urÃ°Â...,i love u take with u all the time in urÃ°Â...,"[, i, love, u, take, with, u, all, the, time, ...","[, love, u, take, u, time, urÃ, Â, Â, Â, Ã, Â,..."
4,0,factsguide: society now #motivation,factsguide: society now,factsguide society now,"[, factsguide, society, now, ]","[, factsguide, society, ]"


In [None]:
stem = nltk.

# Creating the Bag of Words

In [9]:

# importing the CountVectorizer to "vectorize" sentences by creating a collection of unique words and assigning an index to each one 

tweets = dt_transformed['tweet_wo_RT_username_punct_split']

# `explode()` produces the same as `tweet_list = [item for sublist in tweets for item in sublist]`
tweet_list = tweets.explode()

vectorizer = CountVectorizer(max_features=None) 
# `max_features=n` builds a vocabulary that only consider the top max_features ordered by term frequency across the corpus.

vectorizer.fit_transform(tweet_list)
# fit_transform is equivalent to fit followed by transform, and returns a document-term matrix.

# A mapping of terms to feature indices.
result = vectorizer.vocabulary_

print("We have ", len(result), " words in our Bag of Words")

We have  39771  words in our Bag of Words


In [10]:
# transforming into feature vectors for the learning model

vectorizer.fit_transform(tweet_list).toarray()
# `fit_transform` learns a list of feature name -> indices mappings 

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Splitting the Data into a Training Set and a Testing Set to grade the accuracy of our model

In [11]:

# Split the data into testing and training sets

tweet_text = tweets.values

y = dt_transformed['Toxicity'].values

tweet_text_train, tweet_text_test, y_train, y_test = train_test_split(tweet_text, y, test_size=0.33, random_state=0, stratify=y)
# random_state shuffles the data so that we don't accidently end up with biased data
# stratify to help keep the proportion of y values through the training and test sets

# checking the length to ensure that my samples sizes are the same
print("length of y_train:", len(y_train))
print("length of tweet_text_train:", len(tweet_text_train)) 

length of y_train: 38019
length of tweet_text_train: 38019


In [12]:
# creating  the feature vectors in the training set and testing set.

tweet_text_train = [inner[0] for inner in tweet_text_train]
tweet_text_test = [inner[0] for inner in tweet_text_test]

X_train = vectorizer.transform(tweet_text_train)
X_test  = vectorizer.transform(tweet_text_test)
X_train

# we have compressed the vectorized data of 6631 elements into a format that takes up less space

<38019x39771 sparse matrix of type '<class 'numpy.int64'>'
	with 18000 stored elements in Compressed Sparse Row format>

In [13]:

# LogisticRegression gives our training model a grade based off it's performance on the testing set

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Score : ", str(round(score * 100)) + "%")

Score :  61%
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
import pickle

pickle_file = 'finalized_model.pkl'
# saving the model to a pickled file to be copied into the back-end repo
s = pickle.dumps(classifier)
with open(pickle_file, "wb") as file:
    file.write(s)

In [15]:
vectorizer_file = 'vectorizer_pickle.pkl'
s = pickle.dumps(vectorizer)
with open(vectorizer_file, "wb") as file:
    file.write(s)

In [16]:
# testing the pickled file 
with open(pickle_file, "rb") as file:
    Pickled_Classifier = pickle.load(file)

Pickled_Classifier

LogisticRegression()

In [17]:
with open(vectorizer_file, "rb") as file:
    Pickled_vectorizer = pickle.load(file)

Pickled_vectorizer

CountVectorizer()