In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report


from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

In [4]:
import warnings

# To ignore all warnings globally
warnings.filterwarnings("ignore")

In [5]:
df = pd.read_csv("./processed_data1.csv")
df['text'] = df['text'].fillna('')
df.head()

Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,that game hurt,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,sexuality should not be a group category it m...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,you do right if you do not care then fuck them,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,man i love reddit,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,name be nowhere near them he be by the falcon,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
df.shape

(70000, 29)

In [7]:
df = df.dropna()

In [8]:
df=df.drop(columns=['neutral'])

In [9]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Extract features (text) and labels
X_train = train_data['text']
X_test = test_data['text']
y_train = train_data.iloc[:, 1:]
y_test = test_data.iloc[:, 1:]

In [10]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()

# Use oversampling for each label
X_resampled, y_resampled = [], []
for i in range(y_train.shape[1]):
    # Reshape X_train to be 2D
    X_res, y_res = ros.fit_resample(np.array(X_train).reshape(-1, 1), y_train.iloc[:, i])
    X_resampled.append(X_res)
    y_resampled.append(y_res)

In [11]:
# Create a pipeline with CountVectorizer and Logistic Regression
text_clf = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('classifier', OneVsRestClassifier(LogisticRegression(max_iter=100)))
])

# Fit the pipeline on the training data
text_clf.fit(X_train, y_train)

# Predictions
y_pred = text_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)


Accuracy: 0.344
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.28      0.38      1132
           1       0.56      0.37      0.44       621
           2       0.46      0.10      0.16       519
           3       0.28      0.03      0.06       946
           4       0.41      0.05      0.09      1132
           5       0.36      0.05      0.09       397
           6       0.54      0.05      0.10       500
           7       0.53      0.05      0.09       680
           8       0.29      0.06      0.10       242
           9       0.31      0.03      0.05       574
          10       0.41      0.03      0.05       754
          11       0.45      0.08      0.13       328
          12       0.50      0.06      0.10       158
          13       0.51      0.06      0.11       378
          14       0.61      0.20      0.31       210
          15       0.88      0.76      0.82       816
          16       0.00      0.00      0.

In [12]:
from sklearn.model_selection import GridSearchCV

# Define the pipeline
text_clf = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('classifier', OneVsRestClassifier(LogisticRegression(max_iter=100)))
])

# Define the parameter grid
param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],  # You can adjust the n-gram range
    'vectorizer__max_features': [1000, 5000, 10000],  # You can adjust the max_features
    'classifier__estimator__C': [0.1, 1, 10],  # Adjust regularization parameter for Logistic Regression
}

# Perform GridSearchCV
grid_search = GridSearchCV(text_clf, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters from the grid search
best_params = grid_search.best_params_

# Print the best parameters
print("Best Parameters:", best_params)

# Predictions using the best estimator
y_pred_grid = grid_search.best_estimator_.predict(X_test)

# Evaluate the model with the best parameters
accuracy_grid = accuracy_score(y_test, y_pred_grid)
classification_rep_grid = classification_report(y_test, y_pred_grid)

print(f"Accuracy (GridSearchCV): {accuracy_grid}")
print("Classification Report (GridSearchCV):\n", classification_rep_grid)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'classifier__estimator__C': 1, 'vectorizer__max_features': 1000, 'vectorizer__ngram_range': (1, 2)}
Accuracy (GridSearchCV): 0.3445714285714286
Classification Report (GridSearchCV):
               precision    recall  f1-score   support

           0       0.63      0.25      0.36      1132
           1       0.59      0.37      0.45       621
           2       0.42      0.06      0.11       519
           3       0.28      0.01      0.02       946
           4       0.47      0.03      0.06      1132
           5       0.41      0.04      0.07       397
           6       0.59      0.04      0.07       500
           7       0.72      0.04      0.07       680
           8       0.30      0.05      0.09       242
           9       0.44      0.03      0.05       574
          10       0.58      0.01      0.03       754
          11       0.48      0.07      0.12       328
          12       0.47      0.05  

In [17]:
# Create a pipeline with CountVectorizer and Random Forst
text_clf = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('classifier', OneVsRestClassifier(RandomForestClassifier()))
])

# Fit the pipeline on the training data
text_clf.fit(X_train[:2000], y_train[:2000])

# Predictions
y_pred = text_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)


Accuracy: 0.3172857142857143
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.11      0.19      1132
           1       0.54      0.16      0.25       621
           2       0.42      0.11      0.18       519
           3       0.26      0.03      0.05       946
           4       0.16      0.03      0.05      1132
           5       0.22      0.03      0.05       397
           6       0.16      0.02      0.03       500
           7       0.06      0.01      0.02       680
           8       0.20      0.00      0.01       242
           9       0.18      0.00      0.01       574
          10       0.08      0.00      0.00       754
          11       0.44      0.01      0.02       328
          12       0.12      0.01      0.01       158
          13       0.08      0.01      0.02       378
          14       0.43      0.03      0.05       210
          15       0.87      0.75      0.80       816
          16       0.00     

In [15]:
from sklearn.metrics import roc_auc_score

roc_auc_scores = roc_auc_score(y_test, grid_search.predict_proba(X_test), average='micro')
print(f"ROC AUC: {roc_auc_scores}")

ROC AUC: 0.8306831242679411


In [None]:
entry = "Today has been a long day. A little tiring but full of surprises. I got my first acceptance today. I was on cloud nine!!! But I did not get place to sit in the train. but thats only a minor inconvienence. Im less stressed now."
prediction = text_clf.predict([entry])

print("Predicted labels:", prediction)

Predicted labels: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
