In [None]:
##importing necessary libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

In [None]:
## Loading train data
# NOTE(review): this is a machine-specific absolute path; adjust it to wherever
# your copy of train.csv lives before running the notebook.
TRAIN_CSV_PATH = "C:/Users/vanat/Downloads/train.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)
train_df.head()

In [None]:
## Loading test data
# NOTE(review): this is a machine-specific absolute path; adjust it to wherever
# your copy of test.csv lives before running the notebook.
TEST_CSV_PATH = "C:/Users/vanat/Downloads/test.csv"
test_df = pd.read_csv(TEST_CSV_PATH)
test_df.head()

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train_df.info()

In [None]:
# Drop the 'id' column from both frames.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: once 'id' is gone, a second run is a no-op rather than a KeyError.
train_df = train_df.drop(columns=['id'], errors='ignore')
test_df = test_df.drop(columns=['id'], errors='ignore')

In [None]:
# Show the first two test rows to confirm the 'id' column is gone.
test_df.head(2)

# Text preprocessing

In [None]:
# Number of training tweets per label value (shows the class balance).
train_df['label'].value_counts()

In [None]:
# Hate speech: the tweet text of every training row whose label is 1
is_hate = train_df['label'] == 1
hate_tweet = train_df.loc[is_hate, 'tweet']
hate_tweet

In [None]:
# Non-hate speech: the tweet text of every training row whose label is 0
is_non_hate = train_df['label'] == 0
non_hate_tweet = train_df.loc[is_non_hate, 'tweet']
non_hate_tweet

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud
# Build one corpus per class by joining all of its tweets with single spaces.
hate_speech_text = " ".join(hate_tweet)
non_hate_speech_text = " ".join(non_hate_tweet)

# Generate one word cloud per class on an 800x400 canvas.
hate_wordcloud = WordCloud(
    width=800, height=400, background_color='black', colormap='Reds'
).generate(hate_speech_text)
non_hate_wordcloud = WordCloud(
    width=800, height=400, background_color='white', colormap='Blues'
).generate(non_hate_speech_text)

# Draw the two clouds side by side: hate speech on the left, non-hate on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 8))
panels = [
    (hate_wordcloud, "Word Cloud for Hate Speech Tweets"),
    (non_hate_wordcloud, "Word Cloud for Non-Hate Speech Tweets"),
]
for ax, (cloud, title) in zip(axes, panels):
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16)

fig.tight_layout()
plt.show()


In [None]:
## The "@user" token does not carry any useful information here: it is widely
## spread across both hate and non-hate speech, so we remove it.

def remove_user_word(tweet):
    """Return ``tweet`` without its standalone "@user" tokens.

    The tweet is split on whitespace and every token that equals "@user"
    (compared case-insensitively) is dropped. The remaining tokens are joined
    with single spaces, so runs of whitespace are collapsed as a side effect.
    Tokens that merely contain "@user" (e.g. "@user!") are kept unchanged.
    """
    kept_tokens = []
    for token in tweet.split():
        if token.lower() == "@user":
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Strip the "@user" tokens from the 'tweet' column of both datasets
for frame in (train_df, test_df):
    frame['tweet'] = frame['tweet'].apply(remove_user_word)

# Print the first few rows of each dataframe to verify the result
print("Updated Train DataFrame:", train_df.head(), sep="\n")
print("\nUpdated Test DataFrame:", test_df.head(), sep="\n")


In [None]:
## Removal of punctuation

import string

# Build the deletion table once: it depends only on the constant
# string.punctuation, so recreating it for every tweet is wasted work.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(tweet):
    """Return ``tweet`` with every character in ``string.punctuation`` deleted.

    Only the ASCII punctuation characters listed in ``string.punctuation`` are
    removed; all other characters (including non-ASCII symbols) are left as is.

    Parameters
    ----------
    tweet : str
        Text to clean.

    Returns
    -------
    str
        The text without ASCII punctuation.
    """
    return tweet.translate(_PUNCTUATION_TABLE)

In [None]:
# Strip punctuation from every tweet in both frames.
train_df['tweet'] = train_df['tweet'].apply(remove_punctuations)
test_df['tweet'] = test_df['tweet'].apply(remove_punctuations)

In [None]:
## Removing special characters

import re

# Raw-string patterns: a plain '\d+' literal relies on an invalid escape
# sequence, which newer Python versions warn about.
_NON_WORD_RE = re.compile(r'[^\w ]+')
_NON_ALNUM_RE = re.compile('[^a-zA-Z0-9]')
_DIGITS_RE = re.compile(r'\d+')


def clean_special_characters(text):
    """Return ``text`` reduced to ASCII letters separated by single spaces.

    The steps run in this order:
    1. delete every run of characters that are neither word characters nor spaces;
    2. replace each remaining character that is not an ASCII letter or digit
       with a space;
    3. replace every run of digits with a space;
    4. collapse runs of whitespace and trim both ends.
    """
    text = _NON_WORD_RE.sub("", text)
    text = _NON_ALNUM_RE.sub(' ', text)
    text = _DIGITS_RE.sub(' ', text)
    # Removing extra spaces
    return ' '.join(text.split())


train_df['tweet'] = train_df['tweet'].apply(clean_special_characters)
test_df['tweet'] = test_df['tweet'].apply(clean_special_characters)

In [None]:
## Converting to lowercase
# Series.str.lower() returns a new Series and leaves the column untouched, so
# the result has to be assigned back for the lowercasing to take effect.
train_df['tweet'] = train_df['tweet'].str.lower()
test_df['tweet'] = test_df['tweet'].str.lower()

In [None]:
## Rare words removal for train data

from collections import Counter

# Frequency of every whitespace-separated token across the training tweets.
word_count = Counter(
    word for text in train_df['tweet'] for word in text.split()
)

# most_common() is sorted by descending count, so this reversed tail slice
# selects the 99 least frequent entries.
# Only the word itself is stored: remove_rare_words tests plain strings for
# membership, so a set of (word, count) tuples would never match any token
# and no word would ever be removed.
rare_words = {word for word, _count in word_count.most_common()[:-100:-1]}

def remove_rare_words(tweet):
    """Return ``tweet`` without the tokens that are members of ``rare_words``.

    The tweet is split on whitespace, each token is checked for membership in
    the module-level ``rare_words`` collection, and the tokens that are not in
    it are joined back together with single spaces.
    """
    kept_words = filter(lambda word: word not in rare_words, tweet.split())
    return " ".join(kept_words)

In [None]:
# Filter rare words out of the tweet text, using the same column-wise apply as
# the other cleaning steps. DataFrame.applymap is deprecated in recent pandas.
# NOTE(review): this assumes 'tweet' is the only text column in train_df; confirm.
train_df['tweet'] = train_df['tweet'].apply(remove_rare_words)
train_df.head(5)

In [None]:
## Rare words removal for test data
# NOTE(review): the rare-word list here is derived from the test tweets
# themselves, so train and test are filtered with different vocabularies;
# consider reusing the list computed from the training data instead.

from collections import Counter

# Frequency of every whitespace-separated token across the test tweets.
word_count_1 = Counter(
    word for text in test_df['tweet'] for word in text.split()
)

# most_common() is sorted by descending count, so this reversed tail slice
# selects the 99 least frequent entries.
# Only the word itself is stored: remove_rare_words_1 tests plain strings for
# membership, so a set of (word, count) tuples would never match any token
# and no word would ever be removed.
rare_words_1 = {word for word, _count in word_count_1.most_common()[:-100:-1]}

def remove_rare_words_1(tweet):
    """Return ``tweet`` without the tokens that are members of ``rare_words_1``.

    The tweet is split on whitespace, each token is checked for membership in
    the module-level ``rare_words_1`` collection, and the tokens that are not
    in it are joined back together with single spaces.
    """
    kept_words = filter(lambda word: word not in rare_words_1, tweet.split())
    return " ".join(kept_words)

In [None]:
# Filter rare words out of the tweet text, using the same column-wise apply as
# the other cleaning steps. DataFrame.applymap is deprecated in recent pandas.
# NOTE(review): this assumes 'tweet' is the only text column in test_df; confirm.
test_df['tweet'] = test_df['tweet'].apply(remove_rare_words_1)
test_df.head(5)

In [None]:
# Removal of stopwords
import nltk
# Download the NLTK stopword corpus used below.
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE(review): this expression builds a comma-separated string of the English
# stopwords and discards it; it is not the last expression in the cell, so its
# value is never displayed.
", ".join(stopwords.words('english'))
# NOTE(review): second import of the same corpus reader under an alias; the
# stopword set below is built through this alias.
from nltk.corpus import stopwords as nltk_stopwords
# English stopwords held in a set so that membership checks are fast
stopwords_set = set(nltk_stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without its English stopwords.

    The text is split on whitespace; a token is dropped when its lowercased
    form is in ``stopwords_set``. Surviving tokens keep their original casing
    and are joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stopwords_set:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
# Drop English stopwords from every tweet in both frames.
train_df['tweet'] = train_df['tweet'].map(remove_stopwords)
test_df['tweet'] = test_df['tweet'].map(remove_stopwords)

In [None]:
# Resources needed by the tokenizer and the lemmatizer used below.
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' tables in word_tokenize instead of
# the pickled 'punkt' models, so fetch both to work across NLTK versions.
nltk.download('punkt_tab')
nltk.download('wordnet')

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Initializing stemmer and lemmatizer
# Both are created once at module level; preprocess_text calls stemmer.stem or
# lemmatizer.lemmatize on each token depending on the selected method.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


In [None]:
def preprocess_text(text, method='lemmatization'):
    """Tokenize ``text`` and normalise each token with the chosen method.

    Parameters
    ----------
    text : str
        Text to process; it is tokenized with ``word_tokenize``.
    method : str, default 'lemmatization'
        'stemming' applies ``stemmer.stem`` to each token, 'lemmatization'
        applies ``lemmatizer.lemmatize``. Any other value leaves the tokens
        unchanged.

    Returns
    -------
    str
        The processed tokens joined with single spaces.
    """
    words = word_tokenize(text)

    # Select the per-token transform; None means "leave tokens as they are".
    if method == 'stemming':
        transform = stemmer.stem
    elif method == 'lemmatization':
        transform = lemmatizer.lemmatize
    else:
        transform = None

    if transform is not None:
        words = [transform(word) for word in words]

    return ' '.join(words)


In [None]:
def _lemmatize_tweet(tweet):
    """Run preprocess_text on one tweet with method='lemmatization'."""
    return preprocess_text(tweet, method='lemmatization')

# Lemmatize every tweet in both frames.
train_df['tweet'] = train_df['tweet'].apply(_lemmatize_tweet)
test_df['tweet'] = test_df['tweet'].apply(_lemmatize_tweet)

In [None]:
# Preview the first ten fully preprocessed training tweets.
train_df['tweet'].head(10)

In [None]:
# Now that the preprocessing step is complete, the next step is to create features for the model.
# Because the labels are imbalanced, we use the SMOTE technique here to balance the data.