In [2]:
#importing library and datasets
import pandas as pd
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.shape, test.shape

((31962, 3), (17197, 2))

In [3]:
train.head(10)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [4]:
test.head(10)

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."
5,31968,choose to be :) #momtips
6,31969,something inside me dies ð¦ð¿â¨ eyes nes...
7,31970,#finished#tattoo#inked#ink#loveitâ¤ï¸ #â¤ï¸...
8,31971,@user @user @user i will never understand why...
9,31972,#delicious #food #lovelife #capetown mannaep...


In [6]:
#data preprocessing

import re
def clean(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda w: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])(\w+:\/\/\S+)|^rt|http.+?", "", w))
    return df

train_clean = clean(train, "tweet")
test_clean = clean(test, "tweet")

In [7]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so sel...
1,2,0,thanks for #lyft credit i can't use cause th...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [9]:
#checking whether dataset is balanced or imbalanced
train['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

Tweets regarding hate speeches are compartively lesser than others. So it is an imbalanced dataset

In [11]:
#performing upsampling to handle imbalanced dataset
from sklearn.utils import resample
train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]
train_minority_upsampled = resample(train_minority,
                                  replace = True,
                                  n_samples = len(train_majority),
                                  random_state = 123)

train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

1    29720
0    29720
Name: label, dtype: int64

In [13]:
#creating a pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('nb', SGDClassifier()),
])


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'], train_upsampled['label'],random_state=0)

In [16]:
model = pipeline_sgd.fit(X_train, y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import f1_score
f1_score(y_test, y_pred)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  indices = (scores > 0).astype(np.int)


0.9693734569960631

1095       bihday #brother #cool #thebomb #åææ #ã...
9766     gf asked me to make an account. i told her i d...
26909     #sunday   withyou #happiness #family @ vinallop 
1386     been feeling low for ages and when the one per...
4616     chaplin - the dictator speech  via   #theresis...
Name: tweet, dtype: object