In [1]:
import re
import pandas as pd
import numpy as np
import pickle
import nltk
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('labeled_data.csv')
df = df.drop(["Unnamed: 0","count","hate_speech","offensive_language","neither"], axis=1)
df.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [4]:
def cleanTweet(text):
    text = re.sub(r'@[A-Za-z0-9]+','',text) # Removing @mentions
    text = re.sub(r'#','',text) # Removing the '#' symbol
    text = re.sub(r'RT[\s]+','',text) # Removing RT
    text = re.sub(r'https?:\/\/\S+','',text) # Removing hyperlinks
    text = re.sub(r'[^a-zA-Z ]',' ', text) # Removing all the punctuations and numbers
    text = text.lower()
    return text
df['tweet'] = df['tweet'].apply(cleanTweet)
df.head()

Unnamed: 0,class,tweet
0,2,as a woman you shouldn t complain about ...
1,1,boy dats cold tyga dwn bad for cuffi...
2,1,dawg you ever fuck a bitch and s...
3,1,g anderson based she look like a ...
4,1,the shit you hear about me mig...


In [5]:
stop_words = set(stopwords.words('english'))
def removeStopWords(text):
    words = word_tokenize(text)
    filtered_sentence = [w for w in words if not w in stop_words]
    return filtered_sentence
tokenized_tweet = df['tweet'].apply(removeStopWords)
tokenized_tweet.head()

0    [woman, complain, cleaning, house, amp, man, a...
1    [boy, dats, cold, tyga, dwn, bad, cuffin, dat,...
2    [dawg, ever, fuck, bitch, start, cry, confused...
3             [g, anderson, based, look, like, tranny]
4    [shit, hear, might, true, might, faker, bitch,...
Name: tweet, dtype: object

In [6]:
stemmer = PorterStemmer()
def stemTweet(text):
    text = [stemmer.stem(word) for word in text]
    return text
tokenized_tweet = tokenized_tweet.apply(stemTweet)
tokenized_tweet.head()

0    [woman, complain, clean, hous, amp, man, alway...
1    [boy, dat, cold, tyga, dwn, bad, cuffin, dat, ...
2    [dawg, ever, fuck, bitch, start, cri, confus, ...
3              [g, anderson, base, look, like, tranni]
4    [shit, hear, might, true, might, faker, bitch,...
Name: tweet, dtype: object

In [7]:
tweet=[]
for i in range(len(tokenized_tweet)):
    s = tokenized_tweet[i]
    sent = ' '.join([str(elem) for elem in s])
    tweet.append(sent)
df['clean_tweet'] = tweet
df['clean_tweet'][1]

'boy dat cold tyga dwn bad cuffin dat hoe st place'

In [8]:
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(df['clean_tweet'])
print(type(df['clean_tweet']))

<class 'pandas.core.series.Series'>


In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(vector, df['class'], random_state=42, test_size=0.25)

In [10]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty = 'elasticnet', warm_start = True, max_iter = 1000,  C=1.3, solver='saga', l1_ratio=0.9)
model.fit(x_train, y_train)

LogisticRegression(C=1.3, l1_ratio=0.9, max_iter=1000, penalty='elasticnet',
                   solver='saga', warm_start=True)

In [11]:
from sklearn.metrics import f1_score, accuracy_score
pred = model.predict(x_test)
f1_score(y_test, pred, average=None)
accuracy_score(y_test, pred)

0.9067140090380891

In [38]:
def cleanInput(text):    
    text = cleanTweet(text)
    text = removeStopWords(text)
    text = stemTweet(text)
    text = ' '.join([str(elem) for elem in text])
    text = [text]
    return text
inp = 'White people are Slave'
inp = cleanInput(inp)
data = vectorizer.transform(inp)
data

<1x16825 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [39]:
model.predict(data)

array([0], dtype=int64)