This notebook runs Naive Bayes Classifier with Faizan's preprocessing tools without the augmented text function

Accuracy: 0.678386275803563

In [50]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [51]:
# Load the data
path = './kaggle_sentiment_data.csv'
df = pd.read_csv(path)

In [52]:
# Handle NaN values in the statement column
df['statement'] = df['statement'].fillna('')

In [53]:
# Data Preprocessing
def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove links
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\n', '', text)  # Remove newlines
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    return text

In [54]:
# Tokenization and Stopwords Removal
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [55]:
# Preprocess the text data
df['cleaned_statement'] = df['statement'].apply(preprocess_text).apply(remove_stopwords)

# Ensure no NaN values
df['cleaned_statement'] = df['cleaned_statement'].fillna('')

# Splitting the data
X = df['cleaned_statement']
y = df['status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
# Vectorization
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [57]:
# Define parameter grid for MultinomialNB
param_grid = {
    'alpha': [0, 0.0000001, 0.1, 0.5, 1.0, 2.0, 5.0],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [58]:
# GridSearchCV with MultinomialNB
grid_search = GridSearchCV(
    MultinomialNB(),
    param_grid,
    scoring='accuracy',
    cv=cv,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


In [59]:
# Get best model and hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

Best Parameters: {'alpha': 0.1}


In [60]:
# Make predictions
y_pred = best_model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.678386275803563
                      precision    recall  f1-score   support

             Anxiety       0.77      0.66      0.71       779
             Bipolar       0.82      0.52      0.64       580
          Depression       0.54      0.78      0.64      3100
              Normal       0.85      0.80      0.83      3327
Personality disorder       0.86      0.22      0.35       248
              Stress       0.74      0.21      0.32       557
            Suicidal       0.66      0.56      0.60      2018

            accuracy                           0.68     10609
           macro avg       0.75      0.54      0.58     10609
        weighted avg       0.71      0.68      0.67     10609

