In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("labeled_data.csv")

In [2]:
df.describe(), df.isnull().sum(), df.shape, df.info(), df.duplicated().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


(         Unnamed: 0         count   hate_speech  offensive_language  \
 count  24783.000000  24783.000000  24783.000000        24783.000000   
 mean   12681.192027      3.243473      0.280515            2.413711   
 std     7299.553863      0.883060      0.631851            1.399459   
 min        0.000000      3.000000      0.000000            0.000000   
 25%     6372.500000      3.000000      0.000000            2.000000   
 50%    12703.000000      3.000000      0.000000            3.000000   
 75%    18995.500000      3.000000      0.000000            3.000000   
 max    25296.000000      9.000000      7.000000            9.000000   
 
             neither         class  
 count  24783.000000  24783.000000  
 mean       0.549247      1.110277  
 std        1.113299      0.462089  
 min        0.000000      0.000000  
 25%        0.000000      1.000000  
 50%        0.000000      1.000000  
 75%        0.000000      1.000000  
 max        9.000000      2.000000  ,
 Unnamed: 0     

In [3]:
# drop unnecessary columns
df.drop(['Unnamed: 0'],
        axis=1, inplace=True)

# df.head()

In [4]:
import nltk
import re

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords


In [5]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# define a function to clean the tweet column
def clean_text(text):
    """
    Function to clean a tweet by removing URLs, mentions, hashtags, punctuation,
    converting text to lowercase, and removing stopwords.
    """
    if isinstance (text, str):# to check if the text is a string
        text = re.sub(r"http\S+|www\S+|http\S+", "", text, flags=re.MULTILINE) #remove urls
        text = re.sub(r"\@\w+|\#", "", text) # remove any hashtags or mentions that could be present 
        text = re.sub(r"[^\w\s]", "", text) # remove anu punctuations
        text = text.lower() # convert characters to lowercases
        text = " " .join([word for word in text.split() if word not in stop_words]) # to remove stopwords
    return text


# check if the function works
df['cleaned_tweet'] = df['tweet'].apply(clean_text)
# df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nuell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from sklearn.decomposition import PCA
from nltk.stem import WordNetLemmatizer


# Download required NLTK resources
nltk.download('punkt')
# nltk.download('stopwords')
nltk.download('wordnet')

# Text Preprocessing Function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    # stop_words = set(stopwords.words('english'))
    # tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)

# Apply preprocessing to text column
# df['processed_text'] = df['tweet'].apply(preprocess_text)

df['cleaned_text'] = df['cleaned_tweet'].apply(preprocess_text)
# df.head()





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nuell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nuell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_df=0.8, max_features=100, stop_words='english')
X_vect = vectorizer.fit_transform(df['cleaned_text'])

# Apply PCA for dimensionality reduction
pca = PCA(n_components=15) 
X_dense = X_vect.toarray()
reduced_features = pca.fit_transform(X_dense)

In [8]:
# Handle outliers using IQR method
def remove_outliers(X):
    Q1 = np.percentile(X, 25, axis=0)
    Q3 = np.percentile(X, 75, axis=0)
    IQR = Q3 - Q1
    outlier_mask = ~((X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))).any(axis=1)
    return X[outlier_mask], outlier_mask

# Remove outliers
X_clean, outlier_mask = remove_outliers(reduced_features)
y_clean = df['class'][outlier_mask]

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


# Split data
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

# Scale data for Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))




Logistic Regression Accuracy: 0.7607726597325408
              precision    recall  f1-score   support

           0       0.50      0.02      0.03       118
           1       0.85      0.82      0.84       862
           2       0.61      0.86      0.72       366

    accuracy                           0.76      1346
   macro avg       0.66      0.57      0.53      1346
weighted avg       0.76      0.76      0.73      1346



In [30]:
# trying with random forest

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.7726597325408618
              precision    recall  f1-score   support

           0       0.47      0.08      0.13       118
           1       0.86      0.84      0.85       862
           2       0.64      0.83      0.72       366

    accuracy                           0.77      1346
   macro avg       0.66      0.58      0.57      1346
weighted avg       0.76      0.77      0.75      1346



In [38]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for Random Forest
rf_param_grid = {
    'n_estimators': [150, 200, 250, 300], 
    'max_depth': [None, 10, 20, 30],  
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 2, 4],  
    'max_features': ['sqrt', 'log2']  
}
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5, scoring='accuracy')
rf_grid.fit(X_train, y_train)

# Best Random Forest model
best_rf_model = rf_grid.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)

print("Best Random Forest Parameters:", rf_grid.best_params_)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Best Random Forest Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 250}
Random Forest Accuracy: 0.7652303120356612
              precision    recall  f1-score   support

           0       0.33      0.02      0.03       118
           1       0.85      0.84      0.84       862
           2       0.63      0.84      0.72       366

    accuracy                           0.77      1346
   macro avg       0.60      0.56      0.53      1346
weighted avg       0.74      0.77      0.74      1346

