<a href="https://colab.research.google.com/github/Sanjivan28/HateSpeechDetectionML/blob/main/HateSpeechDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**HATE SPEECH DETECTION - MACHINE LEARNING**

In [9]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np

**IMPORT THE DATASET**

In [10]:
# Read the annotated tweets from disk and preview the first five rows
data = pd.read_csv(filepath_or_buffer="twitter.csv")
print(data.head(n=5))

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


In [18]:
import nltk
# Fetch the English stop-word corpus used below
nltk.download('stopwords')
# Initialize the stemmer and stopwords
stemmer = nltk.SnowballStemmer("english")
# Stored as a set: clean() tests every token of every tweet for membership,
# and a set lookup is constant-time whereas a list lookup scans the whole list.
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**MAP THE CLASS CODES TO HATE SPEECH LABELS**

In [13]:
# Translate the numeric class codes into human-readable label strings
label_names = {
    0: "Hate Speech",
    1: "Offensive Language",
    2: "Non-offensive and normal Language",
}
data["labels"] = data["class"].map(label_names)
print(data.head(n=15))

    Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0            0      3            0                   0        3      2   
1            1      3            0                   3        0      1   
2            2      3            0                   3        0      1   
3            3      3            0                   2        1      1   
4            4      6            0                   6        0      1   
5            5      3            1                   2        0      1   
6            6      3            0                   3        0      1   
7            7      3            0                   3        0      1   
8            8      3            0                   3        0      1   
9            9      3            1                   2        0      1   
10          10      3            0                   3        0      1   
11          11      3            0                   3        0      1   
12          12      3            0    

In [14]:
# Select the relevant columns
# .copy() makes the result an independent frame, so the columns assigned to it
# in later cells do not trigger pandas' SettingWithCopyWarning.
data = data[["tweet", "labels"]].copy()
print(data.head())

                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   

                              labels  
0  Non-offensive and normal Language  
1                 Offensive Language  
2                 Offensive Language  
3                 Offensive Language  
4                 Offensive Language  


**CLEANING THE TWEETS IN THE DATASET**

In [15]:
# Define the cleaning function
def clean(text: str) -> str:
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. drop HTML-style tags, square-bracketed spans and URLs;
      3. turn newlines into spaces;
      4. drop remaining punctuation;
      5. drop any word that contains a digit;
      6. drop stop words and stem what is left.

    Relies on the module-level ``stemmer`` (object with a ``stem`` method) and
    ``stopword`` (collection of words to discard).

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned tokens joined by single spaces (empty string if nothing
        survives the filtering).
    """
    text = text.lower()

    # Structured patterns have to be removed while their delimiters are still
    # present. Stripping punctuation first would turn "https://t.co/x" into
    # "httpstcox" and the URL/tag/bracket patterns would never match.
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Replace newlines with a space (not an empty string) so the last word of
    # one line is not glued to the first word of the next.
    text = re.sub(r'\n', ' ', text)

    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words with digits

    tokens = [stemmer.stem(word) for word in text.split() if word not in stopword]
    return " ".join(tokens)

In [19]:
# Run every raw tweet through clean() and store the result in a new column
data["cleaned_tweet"] = data["tweet"].map(clean)
print(data.head(n=5))

                                               tweet  \
0  !!! RT @mayasolovely: As a woman you shouldn't...   
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   

                              labels  \
0  Non-offensive and normal Language   
1                 Offensive Language   
2                 Offensive Language   
3                 Offensive Language   
4                 Offensive Language   

                                       cleaned_tweet  
0  rt mayasolov woman shouldnt complain clean hou...  
1   rt boy dat coldtyga dwn bad cuffin dat hoe place  
2  rt urkindofbrand dawg rt ever fuck bitch start...  
3          rt c_g_anderson viva_bas look like tranni  
4  rt shenikarobert shit hear might true might fa...  


In [20]:
# Materialise the model inputs (cleaned text) and targets (labels) as numpy arrays
x, y = (np.array(data[column]) for column in ("cleaned_tweet", "labels"))

In [21]:
# Turn the cleaned text into a matrix of token counts
# NOTE(review): the vocabulary is learned from every row of x, including the
# rows that are later held out for testing.
cv = CountVectorizer()
X = cv.fit_transform(raw_documents=x)

In [22]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [23]:
# Train the classifier
# A fixed random_state makes the fitted tree reproducible from run to run;
# without it the tree can differ between executions on the same data.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [24]:
# Report accuracy on the held-out split first: X_test holds the only rows the
# classifier was not fitted on.
print("Held-out accuracy:", clf.score(X_test, y_test))

# Predict labels for all cleaned tweets
# NOTE: X also contains the training rows, so these predictions are not an
# unbiased measure of model quality.
predictions = clf.predict(X)

In [25]:
# Add predictions to the dataframe
# `predictions` was computed from X, which was built from data["cleaned_tweet"],
# so it holds one entry per row of `data`.
data["predictions"] = predictions

In [26]:
# Show the first 20 rows, now including the predicted label column
print(data.head(n=20))

                                                tweet  \
0   !!! RT @mayasolovely: As a woman you shouldn't...   
1   !!!!! RT @mleew17: boy dats cold...tyga dwn ba...   
2   !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...   
3   !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...   
4   !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...   
5   !!!!!!!!!!!!!!!!!!"@T_Madison_x: The shit just...   
6   !!!!!!"@__BrighterDays: I can not just sit up ...   
7   !!!!&#8220;@selfiequeenbri: cause I'm tired of...   
8   " &amp; you might not get ya bitch back &amp; ...   
9   " @rhythmixx_ :hobbies include: fighting Maria...   
10  " Keeks is a bitch she curves everyone " lol I...   
11                 " Murda Gang bitch its Gang Land "   
12  " So hoes that smoke are losers ? " yea ... go...   
13      " bad bitches is the only thing that i like "   
14                            " bitch get up off me "   
15                    " bitch nigga miss me with it "   
16                             

In [27]:
# Optionally persist the frame, including the predictions, as CSV (row index omitted)
data.to_csv(path_or_buf="twitter_with_predictions.csv", index=False)