Logistic regression with lemmatization and TfidfVectorizer, no augmented text function

Accuracy: 0.75211160671918

Best C value: 10

In [None]:
## normal logistic regression
## preprocessed data import
import pandas as pd
import re
import nltk
import contractions
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split, GridSearchCV

In [5]:
url = './kaggle_sentiment_data.csv'
data = pd.read_csv(url)

# Remove the first column
data = data.drop(data.columns[0], axis=1)
data = data.dropna(subset=['statement', 'status'])

processed_data = data[["statement", "status"]]

In [6]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Ensure all entries in "statement" are strings
processed_data["statement"] = processed_data["statement"].astype(str)

# Define a regex pattern to match URLs
url_pattern = re.compile(r'https?://\S+')

# Define a function to clean text
def clean_text(text):
    # Expand contractions
    text = contractions.fix(text)
    # Remove URLs
    text = url_pattern.sub('', text)
    # Remove non-word and non-whitespace characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d', '', text)
    return text.lower()

# Define function to lemmatize tokens
def lemmatize_tokens(tokens):
    # Convert POS tag to WordNet format
    def get_wordnet_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)
    
    # Lemmatize tokens
    lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    return lemmas

# Apply the cleaning function
processed_data["statement"] = processed_data["statement"].apply(clean_text)

# Tokenize and lemmatize
processed_data["statement"] = processed_data["statement"].apply(
    lambda text: " ".join(lemmatize_tokens(nltk.word_tokenize(text)))
)

In [7]:
X = processed_data['statement']
y = processed_data['status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
# Vectorization
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))  
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [9]:
# Logistic Model Training with Hyperparameter Tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000, 10000]
}

LogisticModel = LogisticRegression(max_iter=20000, class_weight='balanced')
grid_search = GridSearchCV(LogisticModel, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)

# Best Model
best_model = grid_search.best_estimator_

# Predictions
y_pred = best_model.predict(X_test_tfidf)

# Evaluation
print("Best Parameters:")
print(grid_search.best_params_)

print("Accuracy Score:")
print(accuracy_score(y_test, y_pred))

print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Parameters:
{'C': 10}
Accuracy Score:
0.75211160671918
Classification Report:
                      precision    recall  f1-score   support

             Anxiety       0.76      0.79      0.77       768
             Bipolar       0.73      0.77      0.75       556
          Depression       0.73      0.64      0.69      3081
              Normal       0.90      0.92      0.91      3269
Personality disorder       0.60      0.66      0.63       215
              Stress       0.50      0.63      0.56       517
            Suicidal       0.64      0.67      0.66      2131

            accuracy                           0.75     10537
           macro avg       0.70      0.73      0.71     10537
        weighted avg       0.75      0.75      0.75     10537

