In [23]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

In [24]:
import warnings

# To ignore all warnings globally
warnings.filterwarnings("ignore")

In [25]:
df = pd.read_csv("./processed_data.csv")
df['text'] = df['text'].fillna('')
df.head()

Unnamed: 0,text,afraid,angry,anxious,ashamed,awkward,bored,calm,confused,disgusted,excited,frustrated,happy,jealous,nostalgic,proud,sad,satisfied,surprised
0,my family be the most salient part of my day ...,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
1,yoga keep me focus i be able to take some tim...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,yesterday my family and i play a bunch of boa...,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0
3,yesterday i visit my parent and have dinner w...,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
4,yesterday i really felt the importance of my ...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [26]:
df.shape

(1473, 19)

In [27]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Extract features (text) and labels
X_train = train_data['text']
X_test = test_data['text']
y_train = train_data.iloc[:, 1:]
y_test = test_data.iloc[:, 1:]

In [28]:
# Create a pipeline with CountVectorizer and Logistic Regression
text_clf = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('classifier', OneVsRestClassifier(LogisticRegression(max_iter=100)))
])

# Fit the pipeline on the training data
text_clf.fit(X_train, y_train)

# Predictions
y_pred = text_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
roc_auc_scores = roc_auc_score(y_test, text_clf.predict_proba(X_test), average='micro')

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)
print(f"ROC AUC: {roc_auc_scores}")

Accuracy: 0.10508474576271186
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         5
           2       0.88      0.30      0.45        23
           3       1.00      0.25      0.40         4
           4       0.00      0.00      0.00         4
           5       1.00      0.13      0.24        15
           6       0.59      0.28      0.37        87
           7       0.00      0.00      0.00         7
           8       0.00      0.00      0.00         3
           9       0.31      0.09      0.14        46
          10       0.80      0.29      0.42        28
          11       0.70      0.58      0.64       153
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         6
          14       0.55      0.33      0.42        66
          15       0.00      0.00      0.00         6
          16       0.57    

In [29]:
# Create a pipeline with CountVectorizer and Random Forst
text_clf = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('classifier', OneVsRestClassifier(RandomForestClassifier()))
])

# Fit the pipeline on the training data
text_clf.fit(X_train, y_train)

# Predictions
y_pred = text_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
roc_auc_scores = roc_auc_score(y_test, text_clf.predict_proba(X_test), average='micro')

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)
print(f"ROC AUC: {roc_auc_scores}")

Accuracy: 0.10847457627118644
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00        23
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00        15
           6       0.75      0.14      0.23        87
           7       0.00      0.00      0.00         7
           8       0.00      0.00      0.00         3
           9       0.50      0.09      0.15        46
          10       0.88      0.25      0.39        28
          11       0.69      0.60      0.64       153
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         6
          14       0.68      0.29      0.40        66
          15       0.00      0.00      0.00         6
          16       0.65    

In [30]:
entry = "Today has been a long day. A little tiring but full of surprises. I got my first acceptance today. I was on cloud nine!!! But I did not get place to sit in the train. but thats only a minor inconvienence. Im less stressed now."
prediction = text_clf.predict([entry])

print("Predicted labels:", prediction)

Predicted labels: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [33]:
entry = "happy happy sad surprised"
prediction = text_clf.predict([entry])

print("Predicted labels:", prediction)

Predicted labels: [[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]]
