In [18]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score





In [10]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    stop_words = set(stopwords.words('english'))
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

In [12]:
df['clean_text'] = df['text'].apply(clean_text)

In [14]:
def extract_features(vectorizer, train_texts, test_texts):
    train_features = vectorizer.fit_transform(train_texts)
    test_features = vectorizer.transform(test_texts)
    return train_features, test_features

In [16]:
def train_models(X_train, X_test, y_train, y_test):
    models = {
        "Naive Bayes": MultinomialNB(),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    }
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"{name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    
    # Ensemble Model
    ensemble = VotingClassifier(estimators=[('NB', models['Naive Bayes']), ('RF', models['Random Forest']), ('XGB', models['XGBoost'])], voting='hard')
    ensemble.fit(X_train, y_train)
    y_pred = ensemble.predict(X_test)
    print(f"Ensemble Model Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Train with and without cleaning
for feature_type, vectorizer in zip(["BoW", "TF-IDF"], [CountVectorizer(), TfidfVectorizer()]):
    print(f"\nUsing {feature_type} Features Without Cleaning")
    X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
    X_train_vec, X_test_vec = extract_features(vectorizer, X_train, X_test)
    train_models(X_train_vec, X_test_vec, y_train, y_test)
    
    print(f"\nUsing {feature_type} Features With Cleaning")
    X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)
    X_train_vec, X_test_vec = extract_features(vectorizer, X_train, X_test)
    train_models(X_train_vec, X_test_vec, y_train, y_test)



Using BoW Features Without Cleaning
Naive Bayes Accuracy: 0.9839
Random Forest Accuracy: 0.9758
XGBoost Accuracy: 0.9776


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Ensemble Model Accuracy: 0.9830

Using BoW Features With Cleaning
Naive Bayes Accuracy: 0.9812
Random Forest Accuracy: 0.9731
XGBoost Accuracy: 0.9731


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Ensemble Model Accuracy: 0.9767

Using TF-IDF Features Without Cleaning
Naive Bayes Accuracy: 0.9623
Random Forest Accuracy: 0.9749


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9767


Parameters: { "use_label_encoder" } are not used.



Ensemble Model Accuracy: 0.9749

Using TF-IDF Features With Cleaning
Naive Bayes Accuracy: 0.9686
Random Forest Accuracy: 0.9749


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9704


Parameters: { "use_label_encoder" } are not used.



Ensemble Model Accuracy: 0.9749
