In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

### In the `read_csv` function, we have passed a parameter for *encoding*, because our data set contains non-english words that's not supported by the default pandas `read_csv` function. 

In [2]:
dataset = pd.read_csv('./labeled_data.csv', encoding='ISO-8859-1')
dataset.head()


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [4]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,24783.0,12681.192027,7299.553863,0.0,6372.5,12703.0,18995.5,25296.0
count,24783.0,3.243473,0.88306,3.0,3.0,3.0,3.0,9.0
hate_speech,24783.0,0.280515,0.631851,0.0,0.0,0.0,0.0,7.0
offensive_language,24783.0,2.413711,1.399459,0.0,2.0,3.0,3.0,9.0
neither,24783.0,0.549247,1.113299,0.0,0.0,0.0,0.0,9.0
class,24783.0,1.110277,0.462089,0.0,1.0,1.0,1.0,2.0


In [5]:
dt_transformed = dataset[['class', 'tweet']]
dt_transformed.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# Cleaning the labeled data

In [6]:
#remove user names by pulling all the characters inbetween "@" and ":"
#removes hashtags and their text
#removes text starting with http
#removes the "RT"

import re

def remove_RT_user(text):
    tweet = re.sub("@[^\s]+", "", text)
    # hashtag = re.sub("#[^\s]+", "", tweet)
    hashtag = re.sub("#[\w|\d]+", "", tweet)
    remove_http = re.sub("(https?[a-zA-Z0-9]+)|(http?[a-zA-Z0-9]+)", "", hashtag)
    no_rt = re.sub("RT", "", remove_http)
    return no_rt

dt_transformed['tweet_wo_RT_username'] = dt_transformed['tweet'].apply(lambda x: remove_RT_user(x))
dt_transformed.head()

Unnamed: 0,class,tweet,tweet_wo_RT_username
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! As a woman you shouldn't complain about ...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! boy dats cold...tyga dwn bad for cuffi...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,!!!!!!! Dawg!!!! You ever fuck a bitch and...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,!!!!!!!!! she look like a tranny
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,!!!!!!!!!!!!! The shit you hear about me mig...


In [7]:
# removing punctuation

import string
print(string.punctuation)

def remove_punctuation(text):
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct

dt_transformed['tweet_wo_RT_username_punct'] = dt_transformed['tweet_wo_RT_username'].apply(lambda x: remove_punctuation(x))
dt_transformed.head()


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Unnamed: 0,class,tweet,tweet_wo_RT_username,tweet_wo_RT_username_punct
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! As a woman you shouldn't complain about ...,As a woman you shouldnt complain about clea...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! boy dats cold...tyga dwn bad for cuffi...,boy dats coldtyga dwn bad for cuffin dat ho...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,!!!!!!! Dawg!!!! You ever fuck a bitch and...,Dawg You ever fuck a bitch and she start ...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,!!!!!!!!! she look like a tranny,she look like a tranny
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,!!!!!!!!!!!!! The shit you hear about me mig...,The shit you hear about me might be true or...


In [8]:
# stopword = nltk.corpus.stopwords.words('english')
# print(stopword[:11])

# def remove_stopwords(text):
#     text = [word for word in text if word not in stopword]
#     return text

# dt_transformed['tweet_wo_RT_username_punct_stopwords'] = dt_transformed['tweet_wo_RT_username_punct'].apply(lambda x: remove_stopwords(x))
# dt_transformed.head()

In [9]:
# Tokenization = splitting strings into words

def tokenize(text):
    split = re.split("\W+", text)
    return split

dt_transformed['tweet_wo_RT_username_punct_split'] = dt_transformed['tweet_wo_RT_username_punct'].apply(lambda x: tokenize(x))
dt_transformed.head()

Unnamed: 0,class,tweet,tweet_wo_RT_username,tweet_wo_RT_username_punct,tweet_wo_RT_username_punct_split
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! As a woman you shouldn't complain about ...,As a woman you shouldnt complain about clea...,"[, As, a, woman, you, shouldnt, complain, abou..."
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! boy dats cold...tyga dwn bad for cuffi...,boy dats coldtyga dwn bad for cuffin dat ho...,"[, boy, dats, coldtyga, dwn, bad, for, cuffin,..."
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,!!!!!!! Dawg!!!! You ever fuck a bitch and...,Dawg You ever fuck a bitch and she start ...,"[, Dawg, You, ever, fuck, a, bitch, and, she, ..."
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,!!!!!!!!! she look like a tranny,she look like a tranny,"[, she, look, like, a, tranny]"
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,!!!!!!!!!!!!! The shit you hear about me mig...,The shit you hear about me might be true or...,"[, The, shit, you, hear, about, me, might, be,..."


In [10]:
# Bag of Words
tweets = dt_transformed['tweet_wo_RT_username_punct_split']

# tweet_list = [item for sublist in tweets for item in sublist] 
tweet_list = tweets.explode()

vectorizer = CountVectorizer()
vectorizer.fit(tweet_list)
vectorizer.vocabulary_


{'as': 1349,
 'woman': 23287,
 'you': 23649,
 'shouldnt': 16410,
 'complain': 4122,
 'about': 574,
 'cleaning': 3855,
 'up': 22288,
 'your': 23668,
 'house': 8950,
 'amp': 1028,
 'man': 11297,
 'should': 16406,
 'always': 976,
 'take': 17961,
 'the': 21200,
 'trash': 21747,
 'out': 13189,
 'boy': 2657,
 'dats': 4873,
 'coldtyga': 4008,
 'dwn': 5857,
 'bad': 1666,
 'for': 7033,
 'cuffin': 4657,
 'dat': 4865,
 'hoe': 8715,
 'in': 9263,
 '1st': 158,
 'place': 13866,
 'dawg': 4885,
 'ever': 6298,
 'fuck': 7267,
 'bitch': 2243,
 'and': 1073,
 'she': 16252,
 'start': 17357,
 'to': 21523,
 'cry': 4624,
 'be': 1915,
 'confused': 4205,
 'shit': 16319,
 'look': 10971,
 'like': 10738,
 'tranny': 21717,
 'hear': 8433,
 'me': 11527,
 'might': 11765,
 'true': 21879,
 'or': 13109,
 'it': 9591,
 'faker': 6517,
 'than': 21177,
 'who': 23084,
 'told': 21551,
 'ya': 23467,
 'just': 9966,
 'blows': 2452,
 'meclaim': 11554,
 'so': 16911,
 'faithful': 6511,
 'down': 5613,
 'somebody': 16979,
 'but': 3043,
 

In [11]:
tweets = dt_transformed[['tweet']]
type(tweets)

pandas.core.frame.DataFrame

In [12]:
[tweet for tweet in dt_transformed['tweet_wo_RT_username_punct_split']]

,
  'it'],
 ['',
  'I',
  'went',
  'to',
  'Golds',
  'with',
  'Cruz',
  'in',
  '2010',
  'when',
  'he',
  'was',
  'a',
  'Sound',
  'No',
  'upper',
  'lower',
  'body',
  'strength',
  'Hmm',
  'now',
  '40',
  'HR',
  'hitter',
  'Wont',
  'say',
  'the',
  'S',
  'word'],
 ['',
  'If',
  'Royals',
  'have',
  'a',
  'fire',
  'sale',
  'I',
  'want',
  'the',
  'whole',
  'team',
  'Leave',
  'Gardner',
  'and',
  'Tanaka',
  'The',
  'rest',
  'can',
  'please',
  'go',
  'Along',
  'with',
  'Joe',
  'and',
  'Cash'],
 ['', 'Nice', 'bounce', 'back', 'Kay', 'Gotta', 'keep', 'this', 'going'],
 ['', 'Pineda', 'needed', 'that', '67', 'Great', 'play'],
 ['',
  'Primo',
  'Just',
  'got',
  'home',
  'and',
  'the',
  'Tanks',
  'are',
  'on',
  'MLB',
  'Finally',
  'And',
  'they',
  'are',
  'winning',
  'Even',
  'better',
  'Time',
  'to',
  'get',
  'serious',
  'Sept',
  'call',
  'up',
  'time'],
 ['', 'got', 'beaten', 'by', 'a', 'thug'],
 ['',
  'take',
  'all',
  'three'