This notebook run a Support Vector Machine model with GridSearchCV to find the best parameters with Faizan's preprocessing tools excluding the augmented text function.

Best Parameters: {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'rbf', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}

Accuracy: 0.77603921198982

In [2]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score


In [3]:
# Load the data
path = './kaggle_sentiment_data.csv'
df = pd.read_csv(path)

In [4]:
# Handle NaN values in the statement column
df['statement'] = df['statement'].fillna('')

In [5]:
# Data Preprocessing
def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove links
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\n', '', text)  # Remove newlines
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    return text

In [6]:
# Tokenization and Stopwords Removal
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [7]:
# Preprocess the text data
df['cleaned_statement'] = df['statement'].apply(preprocess_text).apply(remove_stopwords)

# Ensure no NaN values
df['cleaned_statement'] = df['cleaned_statement'].fillna('')

# Splitting the data
X = df['cleaned_statement']
y = df['status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Create a Pipeline with TF-IDF and SVM
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Convert text to numerical features
    ('svm', SVC())                # SVM classifier
])

In [9]:
# Define the GridSearchCV Parameters
param_grid = {
    'tfidf__max_features': [5000, 10000, None],
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams and Bigrams
    'svm__C': [0.1, 1, 10],  # Regularization parameter
    'svm__kernel': ['linear', 'rbf'],  # Kernel types
    'svm__gamma': ['scale', 'auto']    # Kernel coefficient
}

In [10]:
# Perform Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 6.2min
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 6.3min
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 6.3min
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 6.3min
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 1); total time= 6.3min
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time= 6.8min
[CV] END svm__C=0.1, svm__gamma=scale, svm__kernel=linear, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time= 6.9min
[CV] END svm__C=0.1, 

In [14]:
# Best Parameters and Score
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'rbf', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 1)}


In [15]:
# Evaluate on Test Data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.77603921198982
Classification Report:
                       precision    recall  f1-score   support

             Anxiety       0.82      0.81      0.82       779
             Bipolar       0.91      0.69      0.78       580
          Depression       0.71      0.75      0.73      3100
              Normal       0.85      0.96      0.90      3327
Personality disorder       0.91      0.48      0.63       248
              Stress       0.77      0.50      0.61       557
            Suicidal       0.68      0.64      0.66      2018

            accuracy                           0.78     10609
           macro avg       0.81      0.69      0.73     10609
        weighted avg       0.78      0.78      0.77     10609

