In [1]:
import spacy
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from textblob import TextBlob
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline


In [2]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")


In [3]:
# Define a custom spaCy tokenizer with POS tagging
def spacy_tokenizer_with_pos(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    pos_tags = [token.pos_ for token in doc if token.is_alpha and not token.is_stop]
    return tokens, pos_tags

In [4]:
# Function to perform sentiment analysis
def sentiment_analysis(text):
    blob = TextBlob(text)
    sentiment = blob.sentiment.polarity
    return sentiment

In [5]:
# Function to read text from files based on file_id
def read_text_from_file(file_id):
    file_path = os.path.join(text_folder, f'{file_id}.txt')
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        return ''

In [6]:
# Load data
data = pd.read_csv('data\Annotations_Metadata.csv')

In [7]:
data.head()

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label
0,12834217_1,572066,1346,0,noHate
1,12834217_2,572066,1346,0,noHate
2,12834217_3,572066,1346,0,noHate
3,12834217_4,572066,1346,0,hate
4,12834217_5,572066,1346,0,noHate


In [8]:
data.shape

(10944, 5)

In [9]:
# Data Quality Check: Check for missing values
missing_values = data.isnull().sum()
if missing_values.any():
    print("Missing values detected. Please handle them before proceeding.")
    print("Missing Value Counts:")
else:
    print(missing_values)


file_id         0
user_id         0
subforum_id     0
num_contexts    0
label           0
dtype: int64


In [10]:
# Filter out labels other than "hate" and "noHate"
filtered_data = data[data['label'].isin(['hate', 'noHate'])]

In [11]:
# Reset index
filtered_data.reset_index(drop=True, inplace=True)

In [12]:
filtered_data.head()

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label
0,12834217_1,572066,1346,0,noHate
1,12834217_2,572066,1346,0,noHate
2,12834217_3,572066,1346,0,noHate
3,12834217_4,572066,1346,0,hate
4,12834217_5,572066,1346,0,noHate


In [13]:
filtered_data.shape

(10703, 5)

In [14]:
text_folder="data\Text file"

In [15]:
# Preprocess and tokenize the text with POS tagging and sentiment analysis
filtered_data['actual_text'] = filtered_data['file_id'].apply(lambda file_id: read_text_from_file(file_id))
filtered_data['tokens'], filtered_data['pos_tags'] = zip(*filtered_data['actual_text'].apply(spacy_tokenizer_with_pos))
filtered_data['sentiment'] = filtered_data['actual_text'].apply(sentiment_analysis)
filtered_data['preprocessed_text'] = filtered_data['tokens'].apply(lambda tokens: ' '.join(tokens))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['actual_text'] = filtered_data['file_id'].apply(lambda file_id: read_text_from_file(file_id))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['tokens'], filtered_data['pos_tags'] = zip(*filtered_data['actual_text'].apply(spacy_tokenizer_with_pos))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-

In [16]:
filtered_data.head()

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label,actual_text,tokens,pos_tags,sentiment,preprocessed_text
0,12834217_1,572066,1346,0,noHate,"As of March 13th , 2014 , the booklet had been...","[March, booklet, download, time, counting]","[PROPN, NOUN, VERB, NOUN, NOUN]",0.0,March booklet download time counting
1,12834217_2,572066,1346,0,noHate,In order to help increase the booklets downloa...,"[order, help, increase, booklet, download, gre...","[NOUN, VERB, VERB, NOUN, NOUN, ADJ, NOUN, PROP...",0.4,order help increase booklet download great sto...
2,12834217_3,572066,1346,0,noHate,( Simply copy and paste the following text int...,"[simply, copy, paste, following, text, YouTube...","[ADV, VERB, VERB, ADJ, NOUN, PROPN, NOUN, NOUN...",0.0,simply copy paste following text YouTube video...
3,12834217_4,572066,1346,0,hate,Click below for a FREE download of a colorfull...,"[click, free, download, colorfully, illustrate...","[VERB, ADJ, NOUN, ADV, VERB, NOUN, NOUN, NOUN,...",0.233333,click free download colorfully illustrate page...
4,12834217_5,572066,1346,0,noHate,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,"[click, DOWNLOAD, mb, green, banner, link]","[VERB, PROPN, NOUN, PROPN, PROPN, PROPN]",-0.2,click DOWNLOAD mb green banner link


In [17]:
filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10703 entries, 0 to 10702
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   file_id            10703 non-null  object 
 1   user_id            10703 non-null  int64  
 2   subforum_id        10703 non-null  int64  
 3   num_contexts       10703 non-null  int64  
 4   label              10703 non-null  object 
 5   actual_text        10703 non-null  object 
 6   tokens             10703 non-null  object 
 7   pos_tags           10703 non-null  object 
 8   sentiment          10703 non-null  float64
 9   preprocessed_text  10703 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 836.3+ KB


In [18]:
# Data Quality Check: Check for empty preprocessed text
empty_texts_index = filtered_data[filtered_data['preprocessed_text'] == ''].index
empty_texts_count = len(empty_texts_index)
if empty_texts_count > 0:
    print("Empty preprocessed text detected. Please check and preprocess the text accordingly.")
    print("Number of Rows with Empty Preprocessed Text:", empty_texts_count)
    print("Indices with Empty Preprocessed Text:")
    print(empty_texts_index)
else:
    print("No Empty Texts Found")


Empty preprocessed text detected. Please check and preprocess the text accordingly.
Number of Rows with Empty Preprocessed Text: 217
Indices with Empty Preprocessed Text:
Index([   80,    90,   106,   160,   205,   233,   244,   321,   325,   337,
       ...
       10075, 10190, 10321, 10395, 10419, 10501, 10529, 10580, 10636, 10702],
      dtype='int64', length=217)


In [19]:
# Investigate rows with empty preprocessed text
empty_texts_indices = [str(index) for index in empty_texts_index]
print("Indices with Empty Preprocessed Text:", ", ".join(empty_texts_indices))

# Print the corresponding actual texts for these rows
print("\nCorresponding Actual Texts:")
for index in empty_texts_index:
    print(filtered_data.loc[index, 'actual_text'])


Indices with Empty Preprocessed Text: 80, 90, 106, 160, 205, 233, 244, 321, 325, 337, 380, 428, 481, 513, 760, 847, 857, 883, 925, 965, 976, 1090, 1127, 1156, 1205, 1213, 1214, 1216, 1318, 1320, 1402, 1416, 1420, 1460, 1611, 1614, 1623, 1698, 1750, 1779, 1853, 2006, 2110, 2208, 2261, 2317, 2370, 2386, 2401, 2408, 2433, 2441, 2487, 2488, 2502, 2510, 2516, 2581, 2622, 2732, 2738, 2764, 2778, 2783, 2789, 2791, 2933, 3000, 3116, 3160, 3197, 3220, 3327, 3371, 3446, 3505, 3519, 3545, 3549, 3558, 3581, 3607, 3624, 3626, 3662, 3681, 3781, 3806, 3869, 3887, 3890, 3893, 3934, 3966, 4251, 4305, 4415, 4494, 4546, 4548, 4574, 4576, 4657, 4750, 4779, 4800, 4842, 4853, 4935, 4970, 5112, 5248, 5326, 5328, 5416, 5482, 5541, 5701, 5708, 5782, 5787, 5794, 5874, 5906, 5955, 5971, 5999, 6033, 6042, 6051, 6146, 6189, 6197, 6478, 6494, 6586, 6720, 6748, 6791, 6843, 6875, 6894, 6974, 6988, 6996, 6997, 7000, 7019, 7069, 7077, 7362, 7365, 7403, 7405, 7458, 7499, 7508, 7511, 7532, 7563, 7568, 7773, 7780, 7816, 7

In [20]:
# Remove rows with empty preprocessed text
filtered_data.drop(empty_texts_index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.drop(empty_texts_index, inplace=True)


In [21]:
filtered_data.shape

(10486, 10)

In [22]:
filtered_data.head()

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label,actual_text,tokens,pos_tags,sentiment,preprocessed_text
0,12834217_1,572066,1346,0,noHate,"As of March 13th , 2014 , the booklet had been...","[March, booklet, download, time, counting]","[PROPN, NOUN, VERB, NOUN, NOUN]",0.0,March booklet download time counting
1,12834217_2,572066,1346,0,noHate,In order to help increase the booklets downloa...,"[order, help, increase, booklet, download, gre...","[NOUN, VERB, VERB, NOUN, NOUN, ADJ, NOUN, PROP...",0.4,order help increase booklet download great sto...
2,12834217_3,572066,1346,0,noHate,( Simply copy and paste the following text int...,"[simply, copy, paste, following, text, YouTube...","[ADV, VERB, VERB, ADJ, NOUN, PROPN, NOUN, NOUN...",0.0,simply copy paste following text YouTube video...
3,12834217_4,572066,1346,0,hate,Click below for a FREE download of a colorfull...,"[click, free, download, colorfully, illustrate...","[VERB, ADJ, NOUN, ADV, VERB, NOUN, NOUN, NOUN,...",0.233333,click free download colorfully illustrate page...
4,12834217_5,572066,1346,0,noHate,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,"[click, DOWNLOAD, mb, green, banner, link]","[VERB, PROPN, NOUN, PROPN, PROPN, PROPN]",-0.2,click DOWNLOAD mb green banner link


In [23]:
# Count occurrences of each label
label_counts = data['label'].value_counts()

# Get the count of 'hate' and 'noHate' labels
hate_count = label_counts.get('hate', 0)
nohate_count = label_counts.get('noHate', 0)

print(f"Number of 'hate' labels: {hate_count}")
print(f"Number of 'noHate' labels: {nohate_count}")


Number of 'hate' labels: 1196
Number of 'noHate' labels: 9507


In [24]:
# Encode labels using LabelEncoder for multi-class classification
label_encoder = LabelEncoder()
filtered_data['label_encoded'] = label_encoder.fit_transform(filtered_data['label'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['label_encoded'] = label_encoder.fit_transform(filtered_data['label'])


In [25]:
filtered_data.head()

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label,actual_text,tokens,pos_tags,sentiment,preprocessed_text,label_encoded
0,12834217_1,572066,1346,0,noHate,"As of March 13th , 2014 , the booklet had been...","[March, booklet, download, time, counting]","[PROPN, NOUN, VERB, NOUN, NOUN]",0.0,March booklet download time counting,1
1,12834217_2,572066,1346,0,noHate,In order to help increase the booklets downloa...,"[order, help, increase, booklet, download, gre...","[NOUN, VERB, VERB, NOUN, NOUN, ADJ, NOUN, PROP...",0.4,order help increase booklet download great sto...,1
2,12834217_3,572066,1346,0,noHate,( Simply copy and paste the following text int...,"[simply, copy, paste, following, text, YouTube...","[ADV, VERB, VERB, ADJ, NOUN, PROPN, NOUN, NOUN...",0.0,simply copy paste following text YouTube video...,1
3,12834217_4,572066,1346,0,hate,Click below for a FREE download of a colorfull...,"[click, free, download, colorfully, illustrate...","[VERB, ADJ, NOUN, ADV, VERB, NOUN, NOUN, NOUN,...",0.233333,click free download colorfully illustrate page...,0
4,12834217_5,572066,1346,0,noHate,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,"[click, DOWNLOAD, mb, green, banner, link]","[VERB, PROPN, NOUN, PROPN, PROPN, PROPN]",-0.2,click DOWNLOAD mb green banner link,1


In [26]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(filtered_data['preprocessed_text'], filtered_data['label_encoded'], test_size=0.2, random_state=42)

In [27]:
# Apply TF-IDF vectorization with n-grams
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)



In [28]:
# Apply BorderlineSMOTE to oversample the minority class
smote = BorderlineSMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

In [29]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],  # Adjust this range as needed
    'min_samples_leaf': [1, 2, 4]
}

In [30]:
# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

In [31]:
# Initialize GridSearchCV
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')

In [32]:
# Perform grid search
grid_search.fit(X_train_resampled, y_train_resampled)

In [33]:
# Get best parameters
best_params = grid_search.best_params_
print(best_params)

{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [34]:
# Train Random Forest Classifier with best parameters
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params)
best_rf_classifier.fit(X_train_resampled, y_train_resampled)

In [35]:
# Fine-tune class weights
weighted_rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_leaf=1, min_samples_split=2, class_weight={0: 15, 1: 1})

In [36]:
# Initialize ensemble classifiers
bagging_classifier = BaggingClassifier(weighted_rf_classifier, n_estimators=10, random_state=42)
adaboost_classifier = AdaBoostClassifier(weighted_rf_classifier, n_estimators=50, random_state=42)
gradientboost_classifier = GradientBoostingClassifier(n_estimators=350, random_state=42)


In [37]:
# Train ensemble classifiers
bagging_classifier.fit(X_train_resampled, y_train_resampled)
adaboost_classifier.fit(X_train_resampled, y_train_resampled)
gradientboost_classifier.fit(X_train_resampled, y_train_resampled)



In [38]:
# Predictions
bagging_y_pred = bagging_classifier.predict(X_test_tfidf)
adaboost_y_pred = adaboost_classifier.predict(X_test_tfidf)
gradientboost_y_pred = gradientboost_classifier.predict(X_test_tfidf)

In [39]:
# Evaluate model performance for Bagging Classifier
bagging_accuracy = accuracy_score(y_test, bagging_y_pred)
bagging_report = classification_report(y_test, bagging_y_pred)
bagging_conf_mat = confusion_matrix(y_test, bagging_y_pred)

In [40]:
# Evaluate model performance for AdaBoost Classifier
adaboost_accuracy = accuracy_score(y_test, adaboost_y_pred)
adaboost_report = classification_report(y_test, adaboost_y_pred)
adaboost_conf_mat = confusion_matrix(y_test, adaboost_y_pred)

In [41]:
# Evaluate model performance for Gradient Boosting Classifier
gradientboost_accuracy = accuracy_score(y_test, gradientboost_y_pred)
gradientboost_report = classification_report(y_test, gradientboost_y_pred)
gradientboost_conf_mat = confusion_matrix(y_test, gradientboost_y_pred)

In [42]:
print("\nBagging Classifier Results:")
print(f'Accuracy: {bagging_accuracy}')
print('Classification Report:\n', bagging_report)
print('Confusion Matrix:\n', bagging_conf_mat)

print("\nadaboost Boosting Classifier Results:")
print(f'Accuracy: {adaboost_accuracy}')
print('Classification Report:\n', adaboost_report)
print('Confusion Matrix:\n', adaboost_conf_mat)

print("\nGradient Boosting Classifier Results:")
print(f'Accuracy: {gradientboost_accuracy}')
print('Classification Report:\n', gradientboost_report)
print('Confusion Matrix:\n', gradientboost_conf_mat)


Bagging Classifier Results:
Accuracy: 0.8708293612964728
Classification Report:
               precision    recall  f1-score   support

           0       0.35      0.24      0.28       225
           1       0.91      0.95      0.93      1873

    accuracy                           0.87      2098
   macro avg       0.63      0.59      0.61      2098
weighted avg       0.85      0.87      0.86      2098

Confusion Matrix:
 [[  53  172]
 [  99 1774]]

adaboost Boosting Classifier Results:
Accuracy: 0.882745471877979
Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.14      0.21       225
           1       0.90      0.97      0.94      1873

    accuracy                           0.88      2098
   macro avg       0.64      0.56      0.57      2098
weighted avg       0.85      0.88      0.86      2098

Confusion Matrix:
 [[  32  193]
 [  53 1820]]

Gradient Boosting Classifier Results:
Accuracy: 0.8832221163012393
Classificatio

In [43]:
# Find false negatives and false positives
false_negatives_count = len(np.where((y_test == 1) & (gradientboost_y_pred == 0))[0])
false_positives_count = len(np.where((y_test == 0) & (gradientboost_y_pred == 1))[0])

# Print counts of false negatives and false positives
print("False Negatives Count:", false_negatives_count)
print("False Positives Count:", false_positives_count)


False Negatives Count: 88
False Positives Count: 157
