In [30]:
import pandas as pd
import numpy as np

import re 
import random
from imblearn.over_sampling import RandomOverSampler
from scipy.sparse import hstack

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings 
warnings.filterwarnings("ignore")


In [31]:
df = pd.read_csv('data.csv')

In [32]:
df.dropna(inplace = True)

In [33]:
status_counts = df['status'].value_counts()

In [41]:
random_statements = df.groupby('status')['statement'].apply(lambda x: x.sample(n=1).iloc[0])

In [42]:
df['num_of_characters'] = df['statement'].str.len()
df['num_of_sentences'] = df['statement'].apply(lambda x: len(nltk.sent_tokenize(x)))

description = df[['num_of_characters', 'num_of_sentences']].describe()

In [35]:
df.rename(columns={'statement': 'original_statement'}, inplace=True)

In [36]:
df['statement']=df['original_statement'].str.lower()


In [37]:
def remove_patterns(text):

    '''
    This function removes all URLs, markdown-style links,
    handels, and punctuation/other special characters
    '''
    
    text = re.sub(r'http[s]?://\S+', '', text)

    text = re.sub(r'\[.*?\]\(.*?\)', '', text)

    text = re.sub(r'@\w+', '', text)

    text = re.sub(r'[^\w\s]', '', text)
    
    return text.strip()

df['statement'] = df['statement'].apply(remove_patterns)

In [38]:
df['tokens'] = df['statement'].apply(word_tokenize)

In [None]:
stemmer = PorterStemmer()

def stem_tokens(tokens):
    return ' '.join(stemmer.stem(str(token)) for token in tokens)

df['tokens_stemmed'] = df['tokens'].apply(stem_tokens)

Unnamed: 0.1,Unnamed: 0,original_statement,status,statement,tokens,tokens_stemmed
0,0,oh my gosh,Anxiety,oh my gosh,"[oh, my, gosh]",oh my gosh
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleeping confused mind restless heart ...,"[trouble, sleeping, confused, mind, restless, ...",troubl sleep confus mind restless heart all ou...
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,all wrong back off dear forward doubt stay in ...,"[all, wrong, back, off, dear, forward, doubt, ...",all wrong back off dear forward doubt stay in ...
3,3,I've shifted my focus to something else but I'...,Anxiety,ive shifted my focus to something else but im ...,"[ive, shifted, my, focus, to, something, else,...",ive shift my focu to someth els but im still w...
4,4,"I'm restless and restless, it's been a month n...",Anxiety,im restless and restless its been a month now ...,"[im, restless, and, restless, its, been, a, mo...",im restless and restless it been a month now b...


In [47]:
X = df[['tokens_stemmed', 'num_of_characters', 'num_of_sentences']]
y = df['status']

In [None]:
lbl_enc = LabelEncoder()
y = lbl_enc.fit_transform(y.values)

# could use one hot

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

Below we could use Word2Vec

In [50]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=50000)
X_train_tfidf = vectorizer.fit_transform(X_train['tokens_stemmed'])
X_test_tfidf = vectorizer.transform(X_test['tokens_stemmed'])

# 2. Extract numerical features
X_train_num = X_train[['num_of_characters', 'num_of_sentences']].values
X_test_num = X_test[['num_of_characters', 'num_of_sentences']].values

# 3. Combine TF-IDF features with numerical features
X_train_combined = hstack([X_train_tfidf, X_train_num])
X_test_combined = hstack([X_test_tfidf, X_test_num])

print('Number of feature words: ', len(vectorizer.get_feature_names_out()))

Number of feature words:  50000


In [51]:
X_train_combined.shape

(42144, 50002)

Oversampling

In [52]:
ros = RandomOverSampler(random_state=101)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train_combined, y_train)

In [53]:
classifiers = {
    'XGB': XGBClassifier(learning_rate=0.2, max_depth=7, n_estimators=500, random_state=101, tree_method='gpu_hist')
}

In [None]:
accuracy_scores = []

for name, clf in classifiers.items():
    clf.fit(X_train_resampled, y_train_resampled)
    y_pred = clf.predict(X_test_combined)
    accuracy = accuracy_score(y_test, y_pred)
    
    print("For", name)
    print("Accuracy:", accuracy)



For XGB
Accuracy: 0.7978551769953497
