This notebook runs the Naive Bayes Classifier with lemmatization as the preprocessing tool.

Accuracy: 0.6617633102401063

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from nltk.tokenize import word_tokenize
import contractions
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob

nltk.download('punkt_tab')

url = './kaggle_sentiment_data.csv'
data = pd.read_csv(url)

# Remove the first column
data = data.drop(data.columns[0], axis=1)
data = data.dropna(subset=['statement', 'status'])

print(data.head())
processed_data = data[["statement", "status"]]

print(processed_data.head())

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/vallipaladugu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                           statement   status
0                                         oh my gosh  Anxiety
1  trouble sleeping, confused mind, restless hear...  Anxiety
2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3  I've shifted my focus to something else but I'...  Anxiety
4  I'm restless and restless, it's been a month n...  Anxiety
                                           statement   status
0                                         oh my gosh  Anxiety
1  trouble sleeping, confused mind, restless hear...  Anxiety
2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3  I've shifted my focus to something else but I'...  Anxiety
4  I'm restless and restless, it's been a month n...  Anxiety


In [2]:
# Download necessary NLTK resources
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Ensure all entries in "statement" are strings
processed_data["statement"] = processed_data["statement"].astype(str)

# Define a regex pattern to match URLs
url_pattern = re.compile(r'https?://\S+')

# Define a function to clean text
def clean_text(text):
    # Expand contractions
    text = contractions.fix(text)
    # Remove URLs
    text = url_pattern.sub('', text)
    # Remove non-word and non-whitespace characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d', '', text)
    return text.lower()

# Define function to lemmatize tokens
def lemmatize_tokens(tokens):
    # Convert POS tag to WordNet format
    def get_wordnet_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)
    
    # Lemmatize tokens
    lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    return lemmas

# Apply the cleaning function
processed_data["statement"] = processed_data["statement"].apply(clean_text)

# Tokenize and lemmatize
processed_data["statement"] = processed_data["statement"].apply(
    lambda text: " ".join(lemmatize_tokens(nltk.word_tokenize(text)))
)

print(processed_data.head())

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vallipaladugu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vallipaladugu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vallipaladugu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/vallipaladugu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


                                           statement   status
0                                         oh my gosh  Anxiety
1  trouble sleep confuse mind restless heart all ...  Anxiety
2  all wrong back off dear forward doubt stay in ...  Anxiety
3  i have shift my focus to something else but i ...  Anxiety
4  i be restless and restless it be be a month no...  Anxiety


In [3]:
# Splitting the data before augmentation
X = processed_data['statement']
y = processed_data['status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Vectorization
vectorizer = TfidfVectorizer(max_features=10000)  
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [5]:
# Define the parameter grid for MultinomialNB
param_grid = {
    'alpha': [0, 0.0000001, 0.1, 0.5, 1.0, 2.0, 5.0],  # Smoothing parameter
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the GridSearchCV with MultinomialNB and the parameter grid
grid_search = GridSearchCV(
    MultinomialNB(),
    param_grid,
    scoring='accuracy',  # Metric for evaluation
    cv=cv,                # 5-fold cross-validation
    verbose=1,           # Display progress
    n_jobs=-1            # Use all available processors
)

# Perform grid search
grid_search.fit(X_train_tfidf, y_train)

# Get the best model and hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


Best Parameters: {'alpha': 0.1}


In [6]:
# Make predictions
y_pred = best_model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6617633102401063
                      precision    recall  f1-score   support

             Anxiety       0.81      0.65      0.72       755
             Bipolar       0.84      0.45      0.59       527
          Depression       0.49      0.80      0.61      3016
              Normal       0.88      0.78      0.83      3308
Personality disorder       0.92      0.15      0.26       237
              Stress       0.77      0.16      0.27       536
            Suicidal       0.67      0.53      0.59      2158

            accuracy                           0.66     10537
           macro avg       0.77      0.50      0.55     10537
        weighted avg       0.72      0.66      0.66     10537

