# Sentiment Analysis Model - Threat Detector
## Python 401d15 - 01/22/2021
### By : Hexx King, Lee Thomas, Taylor Johnson and Ryan Pilon

## TRIGGER WARNING! Offensive language and hate speech is visible below.

In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### In the `read_csv` function, we have passed a parameter for *encoding*, because our data set contains non-english words that's not supported by the default pandas `read_csv` function. 

In [2]:
dataset = pd.read_csv('./labeled_data.csv', encoding='ISO-8859-1')
dataset.head()


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [4]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,24783.0,12681.192027,7299.553863,0.0,6372.5,12703.0,18995.5,25296.0
count,24783.0,3.243473,0.88306,3.0,3.0,3.0,3.0,9.0
hate_speech,24783.0,0.280515,0.631851,0.0,0.0,0.0,0.0,7.0
offensive_language,24783.0,2.413711,1.399459,0.0,2.0,3.0,3.0,9.0
neither,24783.0,0.549247,1.113299,0.0,0.0,0.0,0.0,9.0
class,24783.0,1.110277,0.462089,0.0,1.0,1.0,1.0,2.0


In [5]:
dt_transformed = dataset[['class', 'tweet']]
dt_transformed.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# Cleaning the labeled data

In [6]:
#remove user names by pulling all the characters inbetween "@" and ":"
#removes hashtags and their text
#removes text starting with http
#removes the "RT"

import re

def remove_RT_user(text):
    tweet = re.sub("@[^\s]+", "", text)
    # hashtag = re.sub("#[^\s]+", "", tweet)
    hashtag = re.sub("#[\w|\d]+", "", tweet)
    remove_http = re.sub("(https?[a-zA-Z0-9]+)|(http?[a-zA-Z0-9]+)", "", hashtag)
    no_rt = re.sub("RT", "", remove_http)
    return no_rt

dt_transformed['tweet_wo_RT_username'] = dt_transformed['tweet'].apply(lambda x: remove_RT_user(x))
dt_transformed.head()

Unnamed: 0,class,tweet,tweet_wo_RT_username
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! As a woman you shouldn't complain about ...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! boy dats cold...tyga dwn bad for cuffi...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,!!!!!!! Dawg!!!! You ever fuck a bitch and...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,!!!!!!!!! she look like a tranny
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,!!!!!!!!!!!!! The shit you hear about me mig...


In [7]:
# removing punctuation

import string
print(string.punctuation)

def remove_punctuation(text):
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct

dt_transformed['tweet_wo_RT_username_punct'] = dt_transformed['tweet_wo_RT_username'].apply(lambda x: remove_punctuation(x))
dt_transformed.head()


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Unnamed: 0,class,tweet,tweet_wo_RT_username,tweet_wo_RT_username_punct
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! As a woman you shouldn't complain about ...,As a woman you shouldnt complain about clea...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! boy dats cold...tyga dwn bad for cuffi...,boy dats coldtyga dwn bad for cuffin dat ho...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,!!!!!!! Dawg!!!! You ever fuck a bitch and...,Dawg You ever fuck a bitch and she start ...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,!!!!!!!!! she look like a tranny,she look like a tranny
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,!!!!!!!!!!!!! The shit you hear about me mig...,The shit you hear about me might be true or...


In [8]:
# stopword = nltk.corpus.stopwords.words('english')
# print(stopword[:11])

# def remove_stopwords(text):
#     text = [word for word in text if word not in stopword]
#     return text

# dt_transformed['tweet_wo_RT_username_punct_stopwords'] = dt_transformed['tweet_wo_RT_username_punct'].apply(lambda x: remove_stopwords(x))
# dt_transformed.head()

In [9]:
# Tokenization = splitting strings into words

def tokenize(text):
    split = re.split("\W+", text)
    return split

dt_transformed['tweet_wo_RT_username_punct_split'] = dt_transformed['tweet_wo_RT_username_punct'].apply(lambda x: tokenize(x))
dt_transformed.head()

Unnamed: 0,class,tweet,tweet_wo_RT_username,tweet_wo_RT_username_punct,tweet_wo_RT_username_punct_split
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! As a woman you shouldn't complain about ...,As a woman you shouldnt complain about clea...,"[, As, a, woman, you, shouldnt, complain, abou..."
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! boy dats cold...tyga dwn bad for cuffi...,boy dats coldtyga dwn bad for cuffin dat ho...,"[, boy, dats, coldtyga, dwn, bad, for, cuffin,..."
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,!!!!!!! Dawg!!!! You ever fuck a bitch and...,Dawg You ever fuck a bitch and she start ...,"[, Dawg, You, ever, fuck, a, bitch, and, she, ..."
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,!!!!!!!!! she look like a tranny,she look like a tranny,"[, she, look, like, a, tranny]"
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,!!!!!!!!!!!!! The shit you hear about me mig...,The shit you hear about me might be true or...,"[, The, shit, you, hear, about, me, might, be,..."


# Creating the Bag of Words

In [10]:
tweets = dt_transformed[['tweet']]
type(tweets)

pandas.core.frame.DataFrame

In [11]:
# Sliced this output down to 4 items
[tweet for tweet in dt_transformed['tweet_wo_RT_username_punct_split'][:4]]

[['',
  'As',
  'a',
  'woman',
  'you',
  'shouldnt',
  'complain',
  'about',
  'cleaning',
  'up',
  'your',
  'house',
  'amp',
  'as',
  'a',
  'man',
  'you',
  'should',
  'always',
  'take',
  'the',
  'trash',
  'out'],
 ['',
  'boy',
  'dats',
  'coldtyga',
  'dwn',
  'bad',
  'for',
  'cuffin',
  'dat',
  'hoe',
  'in',
  'the',
  '1st',
  'place'],
 ['',
  'Dawg',
  'You',
  'ever',
  'fuck',
  'a',
  'bitch',
  'and',
  'she',
  'start',
  'to',
  'cry',
  'You',
  'be',
  'confused',
  'as',
  'shit'],
 ['', 'she', 'look', 'like', 'a', 'tranny']]

In [12]:

# importing the CountVectorizer to "vectorize" sentences by creating a collection of unique words and assigning an index to each one 

tweets = dt_transformed['tweet_wo_RT_username_punct_split']

# `explode()` produces the same as `tweet_list = [item for sublist in tweets for item in sublist]`
tweet_list = tweets.explode()

vectorizer = CountVectorizer()
#Sliced this output down to 50 items
vectorizer.fit(tweet_list[:50])
vectorizer.vocabulary_


{'as': 5,
 'woman': 36,
 'you': 37,
 'shouldnt': 29,
 'complain': 11,
 'about': 1,
 'cleaning': 9,
 'up': 35,
 'your': 38,
 'house': 22,
 'amp': 3,
 'man': 24,
 'should': 28,
 'always': 2,
 'take': 31,
 'the': 32,
 'trash': 34,
 'out': 25,
 'boy': 8,
 'dats': 15,
 'coldtyga': 10,
 'dwn': 17,
 'bad': 6,
 'for': 19,
 'cuffin': 13,
 'dat': 14,
 'hoe': 21,
 'in': 23,
 '1st': 0,
 'place': 26,
 'dawg': 16,
 'ever': 18,
 'fuck': 20,
 'bitch': 7,
 'and': 4,
 'she': 27,
 'start': 30,
 'to': 33,
 'cry': 12}

In [13]:
# transforming into feature vectors for the learning model

# sliced this output down to 10 items 
vectorizer.transform(tweet_list[:10]).toarray()


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [14]:

# Split the data into testing and training sets

tweet_text = tweets.values
print(tweet_text) # checking to make sure this is the value I want
y = dt_transformed['class'].values
print(y) # checking to make sure this is the value I want

tweet_text_train, tweet_text_test, y_train, y_test = train_test_split(tweet_text, y, test_size=0.25, random_state=1000)
# random_state shuffles the data so that we don't accidently end up with biased data


[list(['', 'As', 'a', 'woman', 'you', 'shouldnt', 'complain', 'about', 'cleaning', 'up', 'your', 'house', 'amp', 'as', 'a', 'man', 'you', 'should', 'always', 'take', 'the', 'trash', 'out'])
 list(['', 'boy', 'dats', 'coldtyga', 'dwn', 'bad', 'for', 'cuffin', 'dat', 'hoe', 'in', 'the', '1st', 'place'])
 list(['', 'Dawg', 'You', 'ever', 'fuck', 'a', 'bitch', 'and', 'she', 'start', 'to', 'cry', 'You', 'be', 'confused', 'as', 'shit'])
 ...
 list(['young', 'buck', 'wanna', 'eat', 'dat', 'nigguh', 'like', 'I', 'aint', 'fuckin', 'dis', 'up', 'again'])
 list(['youu', 'got', 'wild', 'bitches', 'tellin', 'you', 'lies'])
 list(['Ruffled', 'Ntac', 'Eileen', 'Dahlia', 'Beautiful', 'color', 'combination', 'of', 'pink', 'orange', 'yellow', 'amp', 'white', 'A', 'Coll', 'tcoH0dYEBvnZB'])]
[2 1 1 ... 1 1 2]


In [15]:
# creating  the feature vectors in the training set and testing set.

X_train = [item for sublist in tweet_text_train for item in sublist]
X_test = [item for sublist in tweet_text_test for item in sublist]

vectorizer = CountVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test  = vectorizer.transform(X_test)
X_train

<247880x19848 sparse matrix of type '<class 'numpy.int64'>'
	with 219266 stored elements in Compressed Sparse Row format>

In [16]:

# LogisticRegression gives our training model a grade based off it's performance on the testing set

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

ValueError: Found input variables with inconsistent numbers of samples: [247880, 18587]