### Text Classification

In [3]:
import pandas as pd
dataset = pd.read_csv('hate_speech.csv')
dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
dataset.shape

(5242, 3)

In [5]:
dataset.label.value_counts()

0    3000
1    2242
Name: label, dtype: int64

In [6]:
for index, tweet in enumerate(dataset["tweet"][10:15]):
    print(index+1,"-",tweet)

1 -  â #ireland consumer price index (mom) climbed from previous 0.2% to 0.5% in may   #blog #silver #gold #forex
2 - we are so selfish. #orlando #standwithorlando #pulseshooting #orlandoshooting #biggerproblems #selfish #heabreaking   #values #love #
3 - i get to see my daddy today!!   #80days #gettingfed
4 - ouch...junior is angryð#got7 #junior #yugyoem   #omg 
5 - i am thankful for having a paner. #thankful #positive     


In [7]:
import re
def clean_text(text):
    text=re.sub(r"[^a-zA-Z\']","",text)
    text=re.sub(r"[^\x00-\x7F]+"," ",text)
    text=text.lower()
    return text

In [8]:
dataset['clean_text']=dataset.tweet.apply(lambda x:clean_text(x))

In [9]:
dataset.head(10)

Unnamed: 0,id,label,tweet,clean_text
0,1,0,@user when a father is dysfunctional and is s...,userwhenafatherisdysfunctionalandissoselfishhe...
1,2,0,@user @user thanks for #lyft credit i can't us...,useruserthanksforlyftcreditican'tusecausetheyd...
2,3,0,bihday your majesty,bihdayyourmajesty
3,4,0,#model i love u take with u all the time in ...,modeliloveutakewithuallthetimeinur
4,5,0,factsguide: society now #motivation,factsguidesocietynowmotivation
5,6,0,[2/2] huge fan fare and big talking before the...,hugefanfareandbigtalkingbeforetheyleavechaosan...
6,7,0,@user camping tomorrow @user @user @user @use...,usercampingtomorrowuseruseruseruseruseruseruse...
7,8,0,the next school year is the year for exams.ð...,thenextschoolyearistheyearforexamscan'tthinkab...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,wewonlovethelandallincavschampionsclevelandcle...
9,10,0,@user @user welcome here ! i'm it's so #gr...,useruserwelcomeherei'mit'ssogr


In [10]:
from nltk.corpus import stopwords
len(stopwords.words('english'))


179

In [11]:
stop = stopwords.words('english')

In [12]:
def gen_freq(text):
    word_list = []
    for tw_words in text.split():
        word_list.extend(tw_words)
    word_freq = pd.Series(word_list).value_counts()
    word_freq = word_freq.drop(stop, errors='ignore')
    return word_freq

In [13]:
def any_neg(words):
    for word in words:
        if word in ['n', 'no', 'not'] or re.search(r"\wn't", word):
            return 1
        else:
            return 0

In [14]:
def any_rare(words,rare_100):
    for word in words:
        if word in rare_100:
            return 1
        else:
            return 0

In [15]:
def is_question(words):
    for word in words:
        if word in ['when', 'what', 'how', 'why', 'who', 'where']:
            return 1
        else:
            return 0

In [16]:
word_freq=gen_freq(dataset.clean_text.str)
rare_100=word_freq[-100:]
dataset['word_count']=dataset.clean_text.str.split().apply(lambda x:len(x))
dataset['any_neg']=dataset.clean_text.str.split().apply(lambda x:any_neg(x))
dataset['any_rare']=dataset.clean_text.str.split().apply(lambda x:any_rare(x,rare_100))
dataset['is_question']=dataset.clean_text.str.split().apply(lambda x:is_question(x))
dataset['char_count']=dataset.clean_text.apply(lambda x:len(x))

### 06-02-2025

In [17]:
dataset.head(10)

Unnamed: 0,id,label,tweet,clean_text,word_count,any_neg,any_rare,is_question,char_count
0,1,0,@user when a father is dysfunctional and is s...,userwhenafatherisdysfunctionalandissoselfishhe...,1,0,0,0,79
1,2,0,@user @user thanks for #lyft credit i can't us...,useruserthanksforlyftcreditican'tusecausetheyd...,1,1,0,0,95
2,3,0,bihday your majesty,bihdayyourmajesty,1,0,0,0,17
3,4,0,#model i love u take with u all the time in ...,modeliloveutakewithuallthetimeinur,1,0,0,0,34
4,5,0,factsguide: society now #motivation,factsguidesocietynowmotivation,1,0,0,0,30
5,6,0,[2/2] huge fan fare and big talking before the...,hugefanfareandbigtalkingbeforetheyleavechaosan...,1,0,0,0,88
6,7,0,@user camping tomorrow @user @user @user @use...,usercampingtomorrowuseruseruseruseruseruseruse...,1,0,0,0,52
7,8,0,the next school year is the year for exams.ð...,thenextschoolyearistheyearforexamscan'tthinkab...,1,1,0,0,105
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,wewonlovethelandallincavschampionsclevelandcle...,1,0,0,0,61
9,10,0,@user @user welcome here ! i'm it's so #gr...,useruserwelcomeherei'mit'ssogr,1,0,0,0,30


In [28]:
from sklearn.model_selection import train_test_split
X=dataset[['word_count','any_neg','any_rare','is_question']]
y=dataset.label
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

### Train an ML Model using Tc

In [31]:
from sklearn.naive_bayes import GaussianNB
model=GaussianNB()
model=model.fit(X_train,y_train)
pred=model.predict(X_test)

In [32]:
model.predict(X_test[5:10])

array([1, 1, 1, 1, 1], dtype=int64)

### Evaluate the ML model

In [33]:
from sklearn.metrics import accuracy_score

print("Accuracy:", accuracy_score(y_test, pred)*100, "%")

Accuracy: 44.518589132507145 %


In [36]:
from sklearn.ensemble import RandomForestClassifier

clf_rf=RandomForestClassifier()

clf_rf.fit(X_train,y_train)

rf_pred=clf_rf.predict(X_test.astype(int))

In [41]:
from sklearn.metrics import classification_report,confusion_matrix, \
accuracy_score
print(confusion_matrix(y_test,rf_pred))
print(classification_report(y_test,rf_pred))
print("Accuracy:",accuracy_score(y_test, rf_pred))

[[559  40]
 [415  35]]
              precision    recall  f1-score   support

           0       0.57      0.93      0.71       599
           1       0.47      0.08      0.13       450

    accuracy                           0.57      1049
   macro avg       0.52      0.51      0.42      1049
weighted avg       0.53      0.57      0.46      1049

Accuracy: 0.5662535748331744


In [43]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(class_weight='balanced')
logreg.fit(X_train, y_train)

In [44]:
y_pred = logreg.predict(X_test)

In [46]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.93      0.71       599
           1       0.47      0.08      0.13       450

    accuracy                           0.57      1049
   macro avg       0.52      0.51      0.42      1049
weighted avg       0.53      0.57      0.46      1049

