## Hate Speech Detection Model

In [None]:
from google.colab import files
uploaded = files.upload()

In [1]:
import pandas as pd

train = pd.read_csv('train.csv')
print("Training Set:"% train.columns, train.shape, len(train))

test = pd.read_csv('test.csv')
print("Test Set:"% test.columns, test.shape, len(test))

Training Set: (31962, 3) 31962
Test Set: (17197, 2) 17197


In [5]:
train.head(5)

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for lyft credit i cant use cause they...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in u...
4,5,0,factsguide society now motivation


In [21]:
test.head(5)

Unnamed: 0,id,tweet
0,31963,studiolife aislife requires passion dedication...
1,31964,white supremacists want everyone to see the ...
2,31965,safe ways to heal your acne altwaystoheal h...
3,31966,is the hp and the cursed child book up for res...
4,31967,3rd bihday to my amazing hilarious nephew el...


#### The label column denote if the tweet is hate speech or not so lets analyze it

In [22]:
print(train['label'].value_counts())

label
0    29720
1     2242
Name: count, dtype: int64


### This is a clear case on Imbalanced Data

#### Exploratory Data Analysis on a text column

In [None]:
train["tweet"].head(15)

In [3]:
import re

# using helper function to remove characters like '#', '()', '/', '!' etc and all puntuation marks from words, remove words with characters like '@' attached to them, remove https words etc
def  clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    return df

test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")

In [4]:
train_clean["tweet"].head(15)

0       when a father is dysfunctional and is so sel...
1       thanks for lyft credit i cant use cause they...
2                                   bihday your majesty
3     model   i love u take with u all the time in u...
4                  factsguide society now    motivation
5     22 huge fan fare and big talking before they l...
6                         camping tomorrow        danny
7     the next school year is the year for exams can...
8     we won love the land allin cavs champions clev...
9                      welcome here   im   its so gr8  
10      ireland consumer price index mom climbed fro...
11    we are so selfish orlando standwithorlando pul...
12      i get to see my daddy today   80days gettingfed
13     cnn calls michigan middle school build the wa...
14    no comment  in australia   opkillingbay seashe...
Name: tweet, dtype: object

In [9]:
# creating new dataframes based on the stated column values for resampling
train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]

In [24]:
print(train_majority['label'].value_counts())
print()
print(train_minority['label'].value_counts())

label
0    29720
Name: count, dtype: int64

label
1    2242
Name: count, dtype: int64


#### So we need to do Oversampling on the lesser class which is train_minority object

In [None]:
train_minority.head(5)

In [11]:
# Resample the data to balance it
from sklearn.utils import resample

train_minority_upsampled = resample(train_minority, 
                                 replace=True,    
                                 n_samples=len(train_majority),   
                                 random_state=123)

In [None]:
train_minority_upsampled.head(5)

In [13]:
train_upsampled = pd.concat([train_minority_upsampled, train_majority])  # joining/combining both dataframes

train_upsampled['label'].value_counts()

label
1    29720
0    29720
Name: count, dtype: int64

#### We can see that the unique values in label column (which denotes hate speach or not) is now balanced
#### This is our new balanced data we can model with

### SGD Classifier for modeling

In [14]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier


#### Text Conversion and Analysis

In [15]:
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier()),])

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'],
                                                    train_upsampled['label'],random_state = 0)

In [17]:
model = pipeline_sgd.fit(X_train, y_train)

#### Test data has no label column so we predict the test data to get its label column

In [None]:
# Evaluate/Predict on testing data
y_predict = model.predict(X_test)

In [27]:
y_predict

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [18]:
from sklearn.metrics import f1_score

f1_score(y_test, y_predict)

0.9697899299766589