In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report


from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

In [32]:
import warnings

# To ignore all warnings globally
warnings.filterwarnings("ignore")

In [41]:
# Load the pre-processed dataset from the working directory.
csv_path = "./processed_data.csv"
df = pd.read_csv(csv_path)

# Replace missing texts with an empty string so they survive later NaN handling.
text_without_nans = df['text'].fillna('')
df['text'] = text_without_nans

# Preview the first rows.
df.head()

Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,that game hurt,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,sexuality should not be a group category it m...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,you do right if you do not care then fuck them,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,man i love reddit,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,name be nowhere near them he be by the falcon,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [42]:
# Sanity check: (rows, columns) of the frame right after loading.
df.shape

(154132, 29)

In [43]:
# Remove every row that contains a missing value in any column,
# then display the resulting (rows, columns) for comparison.
df = df.dropna(axis=0, how="any")
df.shape

(154132, 29)

In [44]:
# Remove the 'neutral' label column so it is not used as a target.
# errors="ignore" keeps this cell idempotent: because `df` is overwritten,
# re-running the cell would otherwise raise KeyError once the column is gone.
df = df.drop(columns=['neutral'], errors="ignore")

In [45]:
# Split the data into training and testing sets (80/20, fixed seed).
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Pick the label columns by name instead of by position, so that the raw
# text -- and a leftover CSV index column such as 'Unnamed: 0', if present --
# can never end up inside the target matrix.
non_label_columns = ('Unnamed: 0', 'text')
label_columns = [col for col in train_data.columns if col not in non_label_columns]

# Extract features (text) and labels
X_train = train_data['text']
X_test = test_data['text']
y_train = train_data[label_columns]
y_test = test_data[label_columns]

In [10]:
from imblearn.over_sampling import RandomOverSampler

# Seeded so the same rows are duplicated on every run of the notebook.
ros = RandomOverSampler(random_state=42)

# Reshape X_train to be 2D for fit_resample. This does not depend on the
# label being processed, so it is built once instead of on every iteration.
X_train_2d = np.array(X_train).reshape(-1, 1)

# Use oversampling for each label: every label column is balanced
# independently, producing one resampled (X, y) pair per label.
# NOTE(review): X_resampled / y_resampled are not referenced by any later
# cell visible in this notebook -- confirm whether they are still needed.
X_resampled, y_resampled = [], []
for i in range(y_train.shape[1]):
    X_res, y_res = ros.fit_resample(X_train_2d, y_train.iloc[:, i])
    X_resampled.append(X_res)
    y_resampled.append(y_res)

In [53]:
# Inspect the training texts (the Series display is truncated by pandas).
X_train

134653    i have hear some people get kind of homophobic...
121645                        ok cheer  i be ban from there
109938     the makeup be very technically sound you have...
43802                    do not blame me  i vote for kodos 
106336                                         how the hell
                                ...                        
119879    i can not see a way forward  neither the gover...
103694    i would try rub alcohol  high percentage  look...
131932     name  literally do nothing and get a foul cal...
146867                                       what the frick
121958                can not imagine take on that contract
Name: text, Length: 123305, dtype: object

In [62]:
# Create a pipeline with TfidfVectorizer and Logistic Regression.
# OneVsRestClassifier wraps the estimator so that each label column of
# y_train gets its own binary LogisticRegression.
text_clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', OneVsRestClassifier(LogisticRegression(max_iter=100)))
])

# Fit the pipeline on the training data.
# Deliberately not wrapped in try/except: a failed fit must stop the cell
# here, instead of being printed and followed by a confusing error from
# predict() on an unfitted pipeline.
text_clf.fit(X_train, y_train)

# Predictions
y_pred = text_clf.predict(X_test)

# Evaluate the model.
# For multilabel targets, accuracy_score is subset accuracy: a sample only
# counts as correct when every one of its labels is predicted exactly, so
# this number is much stricter than the per-label scores in the report.
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)


Accuracy: 0.25474421773120964
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.15      0.24      2660
           1       0.50      0.17      0.25      1325
           2       0.48      0.06      0.10      1326
           3       0.36      0.01      0.02      2323
           4       0.40      0.01      0.03      2939
           5       0.43      0.03      0.06       983
           6       0.43      0.03      0.06      1185
           7       0.45      0.03      0.06      1496
           8       0.48      0.04      0.08       654
           9       0.40      0.00      0.01      1472
          10       0.28      0.01      0.02      1851
          11       0.61      0.05      0.09       900
          12       0.75      0.02      0.04       409
          13       0.46      0.04      0.07       956
          14       0.59      0.06      0.12       525
          15       0.84      0.59      0.69      1508
          16       0.00    

In [63]:
from sklearn.metrics import roc_auc_score

# Score the predicted probabilities for the held-out texts with a single
# micro-averaged ROC AUC across all labels.
test_probabilities = text_clf.predict_proba(X_test)
roc_auc_scores = roc_auc_score(y_test, test_probabilities, average='micro')
print(f"ROC AUC: {roc_auc_scores}")

ROC AUC: 0.8522450777887539


In [64]:
# Free-text sample to run through the fitted pipeline. The literal is split
# across lines with implicit concatenation; the resulting string is unchanged.
entry = (
    "Today has been a long day. A little tiring but full of surprises. "
    "I got my first acceptance today. I was on cloud nine!!! "
    "But I did not get place to sit in the train. "
    "but thats only a minor inconvienence. Im less stressed now."
)

# predict() expects a collection of documents, so wrap the single entry in a list.
single_entry_batch = [entry]
prediction = text_clf.predict(single_entry_batch)

print("Predicted labels:", prediction)

Predicted labels: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
