In [37]:
import re
import pandas as pd
import numpy as np
import pickle
import nltk
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

## Forming the dataframe

In [38]:
df = pd.read_json('../src/utility/final.json')
df = pd.DataFrame(df.data.values.tolist())
df.head()

Unnamed: 0,id,text,label,withheld
0,850490912954351616,Alex Brosas another idiot #ALDUBKSGoesToUS ht...,abusive,
1,848791766853668864,"RT @ItIzBiz: as Nancy Reagan would say, 'just ...",abusive,
2,850010509969465344,RT @MailOnline: The Nazi death gas so horrific...,normal,
3,850433664890544128,I hate er chase because if the Bitch that work...,hateful,
4,849282894682050564,But he still with the shits so he started smok...,abusive,


In [39]:
df = df.drop(['id', 'withheld'], axis=1)
df.head()

Unnamed: 0,text,label
0,Alex Brosas another idiot #ALDUBKSGoesToUS ht...,abusive
1,"RT @ItIzBiz: as Nancy Reagan would say, 'just ...",abusive
2,RT @MailOnline: The Nazi death gas so horrific...,normal
3,I hate er chase because if the Bitch that work...,hateful
4,But he still with the shits so he started smok...,abusive


In [80]:
lb = LabelEncoder()
label = lb.fit_transform(df["label"])
lb.classes_

array([0, 1, 2, 3])

In [81]:
df.head(20)

Unnamed: 0,text,label,clean_tweet
0,alex brosas another idiot aldubksgoestous,0,alex brosa anoth idiot aldubksgoest
1,as nancy reagan would say just say fucking...,0,nanci reagan would say say fuck someth like
2,the nazi death gas so horrific even hitler f...,2,nazi death ga horrif even hitler fear use
3,i hate er chase because if the bitch that work...,1,hate er chase bitch work liter evil
4,but he still with the shits so he started smok...,0,still shit start smoke drink bad combo probabl...
5,april fools fucking dope if you ain t feelin...,0,april fool fuck dope feel rigor morti dummi hi...
6,not having access to my money is fucking pissi...,0,access money fuck piss
7,niggas keep talking about women wearing weave ...,1,nigga keep talk women wear weav sick bitch fro...
8,god you re fucking pathetic,0,god fuck pathet
9,you worried about somebody bein ugly bitch ...,0,worri somebodi bein ugli bitch ugli


In [41]:
df = df.drop(["label"], axis='columns')
df.head()

Unnamed: 0,text
0,Alex Brosas another idiot #ALDUBKSGoesToUS ht...
1,"RT @ItIzBiz: as Nancy Reagan would say, 'just ..."
2,RT @MailOnline: The Nazi death gas so horrific...
3,I hate er chase because if the Bitch that work...
4,But he still with the shits so he started smok...


In [42]:
df["label"]=label
df.head()

Unnamed: 0,text,label
0,Alex Brosas another idiot #ALDUBKSGoesToUS ht...,0
1,"RT @ItIzBiz: as Nancy Reagan would say, 'just ...",0
2,RT @MailOnline: The Nazi death gas so horrific...,2
3,I hate er chase because if the Bitch that work...,1
4,But he still with the shits so he started smok...,0


## Preprocessing the data

In [43]:
def cleanTweet(text):
    text = re.sub(r'@[A-Za-z0-9]+','',text) # Removing @mentions
    text = re.sub(r'#','',text) # Removing the '#' symbol
    text = re.sub(r'RT[\s]+','',text) # Removing RT
    text = re.sub(r'https?:\/\/\S+','',text) # Removing hyperlinks
    text = re.sub(r'[^a-zA-Z ]',' ', text) # Removing all the punctuations and numbers
    text = text.lower()
    return text
df['text'] = df['text'].apply(cleanTweet)
df.head()

Unnamed: 0,text,label
0,alex brosas another idiot aldubksgoestous,0
1,as nancy reagan would say just say fucking...,0
2,the nazi death gas so horrific even hitler f...,2
3,i hate er chase because if the bitch that work...,1
4,but he still with the shits so he started smok...,0


In [44]:
# Removing the stop words and tokeinzing the sentences
stop_words = set(stopwords.words('english'))
def removeStopWords(text):
    words = word_tokenize(text)
    filtered_sentence = [w for w in words if not w in stop_words]
    return filtered_sentence
tokenized_tweet = df['text'].apply(removeStopWords)
tokenized_tweet.head()

0      [alex, brosas, another, idiot, aldubksgoestous]
1    [nancy, reagan, would, say, say, fucking, some...
2    [nazi, death, gas, horrific, even, hitler, fea...
3     [hate, er, chase, bitch, works, literally, evil]
4    [still, shits, started, smoking, drinking, bad...
Name: text, dtype: object

In [45]:
# Stemming
stemmer = PorterStemmer()
def stemTweet(text):
    text = [stemmer.stem(word) for word in text]
    return text
tokenized_tweet = tokenized_tweet.apply(stemTweet)
tokenized_tweet.head()

0            [alex, brosa, anoth, idiot, aldubksgoest]
1    [nanci, reagan, would, say, say, fuck, someth,...
2    [nazi, death, ga, horrif, even, hitler, fear, ...
3          [hate, er, chase, bitch, work, liter, evil]
4    [still, shit, start, smoke, drink, bad, combo,...
Name: text, dtype: object

In [46]:
tweet=[]
for i in range(len(tokenized_tweet)):
    s = tokenized_tweet[i]
    sent = ' '.join([str(elem) for elem in s])
    tweet.append(sent)
df['clean_tweet'] = tweet
df['clean_tweet'][1]

'nanci reagan would say say fuck someth like'

## Feature Extraction

In [47]:
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(df['clean_tweet'])
print(type(df['clean_tweet']))

<class 'pandas.core.series.Series'>


In [48]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(vector, df['label'], random_state=42, test_size=0.25)

## Model Training

In [49]:
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(bootstrap=True)
# model.fit(x_train, y_train)

In [50]:
# # model.score(x_test, y_test)
# from sklearn.metrics import f1_score, accuracy_score
# pred = model.predict(x_test)
# f1_score(y_test, pred, average=None)
# accuracy_score(y_test, pred)

In [51]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty = 'elasticnet', warm_start = True, max_iter = 1000,  C=1.3, solver='saga', l1_ratio=0.9)
model.fit(x_train, y_train)

LogisticRegression(C=1.3, l1_ratio=0.9, max_iter=1000, penalty='elasticnet',
                   solver='saga', warm_start=True)

In [52]:
from sklearn.metrics import f1_score, accuracy_score
pred = model.predict(x_test)
f1_score(y_test, pred, average=None)
accuracy_score(y_test, pred)

0.7964855314678636

In [53]:
# fname = 'logistic_regression_model.sav'
# pickle.dump(model, open(fname, 'wb'))

In [99]:
inp = 'you are a nigga'
inp = cleanTweet(inp)
inp = removeStopWords(inp)
inp = stemTweet(inp)
inp = ' '.join([str(elem) for elem in inp])
data = pd.Series([inp])
new_data = df['clean_tweet'].append(data)
vector_new = vectorizer.fit_transform(new_data)
vector_new

<41882x43033 sparse matrix of type '<class 'numpy.float64'>'
	with 395286 stored elements in Compressed Sparse Row format>

In [100]:
model.predict(vector_new)

array([0, 0, 2, ..., 2, 2, 1])