In [16]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

### In the `read_csv` function, we have passed a parameter for *encoding*, because our data set contains non-english words that's not supported by the default pandas `read_csv` function. 

In [17]:
dataset = pd.read_csv('./labeled_data.csv', encoding='ISO-8859-1')
dataset.head()


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [18]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


In [19]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,24783.0,12681.192027,7299.553863,0.0,6372.5,12703.0,18995.5,25296.0
count,24783.0,3.243473,0.88306,3.0,3.0,3.0,3.0,9.0
hate_speech,24783.0,0.280515,0.631851,0.0,0.0,0.0,0.0,7.0
offensive_language,24783.0,2.413711,1.399459,0.0,2.0,3.0,3.0,9.0
neither,24783.0,0.549247,1.113299,0.0,0.0,0.0,0.0,9.0
class,24783.0,1.110277,0.462089,0.0,1.0,1.0,1.0,2.0


In [20]:
dt_transformed = dataset[['class', 'tweet']]
dt_transformed.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# Cleaning the labeled data

In [21]:
#remove user names by pulling all the characters inbetween "@" and ":"

import re

def remove_RT_user(text):
    tweet = re.sub("@[^\s]+", "", text)
    hashtag = re.sub("#[^\s]+", "", tweet)
    no_rt = re.sub("RT", "", hashtag)
    return no_rt

dt_transformed['tweet_wo_RT_username'] = dt_transformed['tweet'].apply(lambda x: remove_RT_user(x))
dt_transformed.head()

Unnamed: 0,class,tweet,tweet_wo_RT_username
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! As a woman you shouldn't complain about ...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! boy dats cold...tyga dwn bad for cuffi...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,!!!!!!! Dawg!!!! You ever fuck a bitch and...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,!!!!!!!!! she look like a tranny
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,!!!!!!!!!!!!! The shit you hear about me mig...


In [22]:
# removing punctuation

import string
print(string.punctuation)

def remove_punctuation(text):
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct

dt_transformed['tweet_wo_RT_username_punct'] = dt_transformed['tweet_wo_RT_username'].apply(lambda x: remove_punctuation(x))
dt_transformed.head()


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Unnamed: 0,class,tweet,tweet_wo_RT_username,tweet_wo_RT_username_punct
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! As a woman you shouldn't complain about ...,As a woman you shouldnt complain about clea...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! boy dats cold...tyga dwn bad for cuffi...,boy dats coldtyga dwn bad for cuffin dat ho...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,!!!!!!! Dawg!!!! You ever fuck a bitch and...,Dawg You ever fuck a bitch and she start ...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,!!!!!!!!! she look like a tranny,she look like a tranny
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,!!!!!!!!!!!!! The shit you hear about me mig...,The shit you hear about me might be true or...


In [23]:
# Tokenization = splitting strings into words

def tokenize(text):
    split = re.split("\W+", text)
    return split

dt_transformed['tweet_wo_RT_username_punct_split'] = dt_transformed['tweet_wo_RT_username_punct'].apply(lambda x: tokenize(x.lower()))
dt_transformed.head()

Unnamed: 0,class,tweet,tweet_wo_RT_username,tweet_wo_RT_username_punct,tweet_wo_RT_username_punct_split
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! As a woman you shouldn't complain about ...,As a woman you shouldnt complain about clea...,"[, as, a, woman, you, shouldnt, complain, abou..."
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! boy dats cold...tyga dwn bad for cuffi...,boy dats coldtyga dwn bad for cuffin dat ho...,"[, boy, dats, coldtyga, dwn, bad, for, cuffin,..."
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,!!!!!!! Dawg!!!! You ever fuck a bitch and...,Dawg You ever fuck a bitch and she start ...,"[, dawg, you, ever, fuck, a, bitch, and, she, ..."
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,!!!!!!!!! she look like a tranny,she look like a tranny,"[, she, look, like, a, tranny]"
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,!!!!!!!!!!!!! The shit you hear about me mig...,The shit you hear about me might be true or...,"[, the, shit, you, hear, about, me, might, be,..."


In [24]:
stopword = nltk.corpus.stopwords.words('english')
print(stopword[:11])

def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text

dt_transformed['tweet_wo_RT_username_punct_split_stopwords'] = dt_transformed['tweet_wo_RT_username_punct_split'].apply(lambda x: remove_stopwords(x))
dt_transformed.head()

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've"]


Unnamed: 0,class,tweet,tweet_wo_RT_username,tweet_wo_RT_username_punct,tweet_wo_RT_username_punct_split,tweet_wo_RT_username_punct_split_stopwords
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,!!! As a woman you shouldn't complain about ...,As a woman you shouldnt complain about clea...,"[, as, a, woman, you, shouldnt, complain, abou...","[, woman, shouldnt, complain, cleaning, house,..."
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,!!!!! boy dats cold...tyga dwn bad for cuffi...,boy dats coldtyga dwn bad for cuffin dat ho...,"[, boy, dats, coldtyga, dwn, bad, for, cuffin,...","[, boy, dats, coldtyga, dwn, bad, cuffin, dat,..."
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,!!!!!!! Dawg!!!! You ever fuck a bitch and...,Dawg You ever fuck a bitch and she start ...,"[, dawg, you, ever, fuck, a, bitch, and, she, ...","[, dawg, ever, fuck, bitch, start, cry, confus..."
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,!!!!!!!!! she look like a tranny,she look like a tranny,"[, she, look, like, a, tranny]","[, look, like, tranny]"
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,!!!!!!!!!!!!! The shit you hear about me mig...,The shit you hear about me might be true or...,"[, the, shit, you, hear, about, me, might, be,...","[, shit, hear, might, true, might, faker, bitc..."


In [29]:

# Bag of Words

tweets = dt_transformed[['tweet_wo_RT_username_punct_split_stopwords']]
tweet_list = [tweet for tweet in dt_transformed['tweets']]

vectorizer = CountVectorizer()
vectorizer.fit(tweet_list[:100])
vectorizer.vocabulary_

KeyError: 'tweets'

In [26]:
tweets = dt_transformed[['tweet']]
type(tweets)

pandas.core.frame.DataFrame

In [27]:
[tweet for tweet in dt_transformed['tweet_wo_RT_username_punct_split_stopwords']]

ter',
  'dash',
  'niqqa',
  'whos',
  'actually',
  'taking',
  'time',
  'walking',
  'amp',
  'u',
  'still',
  'get',
  'killed'],
 ['',
  'november',
  '10th',
  'get',
  'early',
  'bird',
  'tickets',
  'httptco7xcl4kyiuv',
  'httptcozcnp1ljoow'],
 ['', '44', 'road', 'perfect', 'way', 'continue', 'finished', ''],
 ['', 'valerie', 'jarrett', 'control', 'reflex', 'bitch', 'slap'],
 ['',
  'little',
  'brother',
  'hate',
  'admit',
  'knows',
  'hes',
  'still',
  'lil',
  'bitch',
  'though',
  'fuck'],
 ['',
  'yall',
  'bitches',
  'gon',
  'breast',
  'feed',
  'wit',
  'nipple',
  'rings',
  ''],
 ['',
  'slapping',
  'hoe',
  'tuh',
  'solve',
  'something',
  'ima',
  'leave',
  'alonei',
  'never',
  'usually',
  'listen'],
 ['',
  '218',
  'covington',
  'reads',
  'responses',
  'amp',
  'mocks',
  'fbi',
  'andy',
  'real',
  'politics',
  'liv',
  'awakening',
  'gretchen',
  'httptcot9jyhancpr'],
 ['', 'bitch', ''],
 ['', 'still', 'bitch', 'got', 'one', 'wrong', 'wome