Dataset Balancing

In [None]:
import pandas as pd

df = pd.read_csv('/content/MisogynisticAttitudeDetection.csv')

print("SubTask1 class distribution:\n", df['SubTask1'].value_counts(), "\n")
print("SubTask2 class distribution:\n", df['SubTask2'].value_counts(), "\n")

df['Combined'] = df['SubTask1'] + ' + ' + df['SubTask2']
combined_counts = df['Combined'].value_counts()

print("Combined SubTask1 + SubTask2 class distribution:\n", combined_counts)


SubTask1 class distribution:
 SubTask1
Neutral        7140
Pessimistic    3929
Optimistic     1629
Name: count, dtype: int64 

SubTask2 class distribution:
 SubTask2
Nothing         7725
Criticism       2436
Offensive       1493
Suggestion       639
Appreciation     405
Name: count, dtype: int64 

Combined SubTask1 + SubTask2 class distribution:
 Combined
Neutral + Nothing            7140
Pessimistic + Criticism      2436
Pessimistic + Offensive      1493
Optimistic + Suggestion       639
Optimistic + Nothing          585
Optimistic + Appreciation     405
Name: count, dtype: int64


In [None]:
import pandas as pd
from sklearn.utils import resample

df = pd.read_csv('/content/MisogynisticAttitudeDetection.csv')
df['Combined'] = df['SubTask1'] + ' + ' + df['SubTask2']

upsample_targets = {
    'Pessimistic + Criticism': 2436 + 1605,
    'Pessimistic + Offensive': 1493 + 1606,
    'Optimistic + Suggestion': 639 + 1837,
    'Optimistic + Nothing': 585 + 1837,
    'Optimistic + Appreciation': 405 + 1837,
}

upsampled_dfs = []

for label, target_count in upsample_targets.items():
    subset = df[df['Combined'] == label]
    upsampled = resample(subset, replace=True, n_samples=target_count, random_state=42)
    upsampled_dfs.append(upsampled)

non_upsample_df = df[~df['Combined'].isin(upsample_targets.keys())]
final_df = pd.concat([non_upsample_df] + upsampled_dfs)
final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)
final_df.to_csv('/content/balanced_dataset.csv', index=False)

final_df['Combined'].value_counts()


Unnamed: 0_level_0,count
Combined,Unnamed: 1_level_1
Neutral + Nothing,7140
Pessimistic + Criticism,4041
Pessimistic + Offensive,3099
Optimistic + Suggestion,2476
Optimistic + Nothing,2422
Optimistic + Appreciation,2242


In [None]:
import pandas as pd

df = pd.read_csv('/content/balanced_dataset.csv')

print("SubTask1 class distribution:\n", df['SubTask1'].value_counts(), "\n")
print("SubTask2 class distribution:\n", df['SubTask2'].value_counts(), "\n")

df['Combined'] = df['SubTask1'] + ' + ' + df['SubTask2']
combined_counts = df['Combined'].value_counts()

print("Combined SubTask1 + SubTask2 class distribution:\n", combined_counts)


SubTask1 class distribution:
 SubTask1
Neutral        7140
Pessimistic    7140
Optimistic     7140
Name: count, dtype: int64 

SubTask2 class distribution:
 SubTask2
Nothing         9562
Criticism       4041
Offensive       3099
Suggestion      2476
Appreciation    2242
Name: count, dtype: int64 

Combined SubTask1 + SubTask2 class distribution:
 Combined
Neutral + Nothing            7140
Pessimistic + Criticism      4041
Pessimistic + Offensive      3099
Optimistic + Suggestion      2476
Optimistic + Nothing         2422
Optimistic + Appreciation    2242
Name: count, dtype: int64


Preprocessing

In [None]:
!pip install emoji


Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m583.7/590.6 kB[0m [31m17.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [None]:
import pandas as pd
import re
import emoji

df = pd.read_csv('balanced_dataset.csv')

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['Cleaned_Comments'] = df['Comments'].apply(clean_text)
df.to_csv('cleaned_balanced_dataset.csv', index=False)


KNN

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

dataset = pd.read_csv('/content/cleaned_balanced_dataset.csv')

subtask1_mapping = {'Optimistic': 0, 'Pessimistic': 1, 'Neutral': 2}
subtask2_mapping = {'Appreciation': 0, 'Suggestion': 1, 'Criticism': 2, 'Offensive': 3, 'Nothing': 4}

dataset['SubTask1_Num'] = dataset['SubTask1'].map(subtask1_mapping)
dataset['SubTask2_Num'] = dataset['SubTask2'].map(subtask2_mapping)

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
X = model.encode(dataset['Comments'].tolist(), show_progress_bar=True)

y1 = dataset['SubTask1_Num'].values
y2 = dataset['SubTask2_Num'].values

X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=42)
_, _, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=42)

knn1 = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn1.fit(X_train, y1_train)

subtask2_models = {}
for label in subtask1_mapping.values():
    subset_indices = y1_train == label
    if np.any(subset_indices):
        knn2 = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
        knn2.fit(X_train[subset_indices], y2_train[subset_indices])
        subtask2_models[label] = knn2

subtask1_preds = knn1.predict(X_test)

subtask2_preds = []
for i, subtask1_pred in enumerate(subtask1_preds):
    if subtask1_pred in subtask2_models:
        pred = subtask2_models[subtask1_pred].predict(X_test[i].reshape(1, -1))[0]
        subtask2_preds.append(pred)
    else:
        subtask2_preds.append(None)

print("SubTask1 Classification Report:\n", classification_report(y1_test, subtask1_preds))
print("SubTask2 Classification Report:\n", classification_report(y2_test, subtask2_preds))


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/670 [00:00<?, ?it/s]

SubTask1 Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.89      0.74      1408
           1       0.62      0.69      0.66      1422
           2       0.70      0.35      0.46      1454

    accuracy                           0.64      4284
   macro avg       0.65      0.64      0.62      4284
weighted avg       0.65      0.64      0.62      4284

SubTask2 Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.90      0.72       415
           1       0.58      0.76      0.66       514
           2       0.49      0.52      0.50       817
           3       0.46      0.54      0.50       605
           4       0.77      0.55      0.64      1933

    accuracy                           0.60      4284
   macro avg       0.58      0.66      0.60      4284
weighted avg       0.63      0.60      0.60      4284



Logistic Regresssion

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

file_path = '/content/cleaned_balanced_dataset.csv'
dataset = pd.read_csv(file_path)

subtask1_mapping = {'Optimistic': 0, 'Pessimistic': 1, 'Neutral': 2}
subtask2_mapping = {'Appreciation': 0, 'Suggestion': 1, 'Criticism': 2, 'Offensive': 3, 'Nothing': 4}
default_subtask2_label = subtask2_mapping['Nothing']

dataset['SubTask1_Num'] = dataset['SubTask1'].map(subtask1_mapping)
dataset['SubTask2_Num'] = dataset['SubTask2'].map(subtask2_mapping)

embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
X = embedder.encode(dataset['Comments'].tolist(), show_progress_bar=True)

y1 = dataset['SubTask1_Num'].values
y2 = dataset['SubTask2_Num'].values

X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=42)
_, _, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=42)

lr1 = LogisticRegression(C=1.0, solver='lbfgs', max_iter=100, penalty='l2')
lr1.fit(X_train, y1_train)

subtask2_models = {}
for label in subtask1_mapping.values():
    subset_indices = y1_train == label
    y2_subset = y2_train[subset_indices]
    if len(np.unique(y2_subset)) > 1:
        lr2 = LogisticRegression(C=1.0, solver='lbfgs', max_iter=100, penalty='l2')
        lr2.fit(X_train[subset_indices], y2_subset)
        subtask2_models[label] = lr2

subtask1_preds = lr1.predict(X_test)

subtask2_preds = []
for i, subtask1_pred in enumerate(subtask1_preds):
    if subtask1_pred in subtask2_models:
        pred = subtask2_models[subtask1_pred].predict(X_test[i].reshape(1, -1))[0]
        subtask2_preds.append(pred)
    else:
        subtask2_preds.append(default_subtask2_label)

print("SubTask1 Classification Report:\n", classification_report(y1_test, subtask1_preds))
print("SubTask2 Classification Report:\n", classification_report(y2_test, subtask2_preds))


Batches:   0%|          | 0/670 [00:00<?, ?it/s]

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SubTask1 Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.64      0.63      1408
           1       0.58      0.57      0.58      1422
           2       0.58      0.56      0.57      1454

    accuracy                           0.59      4284
   macro avg       0.59      0.59      0.59      4284
weighted avg       0.59      0.59      0.59      4284

SubTask2 Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.54      0.52       415
           1       0.46      0.53      0.49       514
           2       0.43      0.44      0.43       817
           3       0.41      0.38      0.39       605
           4       0.66      0.63      0.65      1933

    accuracy                           0.54      4284
   macro avg       0.49      0.50      0.50      4284
weighted avg       0.54      0.54      0.54      4284



SVM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer

file_path = '/content/cleaned_balanced_dataset.csv'
dataset = pd.read_csv(file_path)

subtask1_mapping = {'Optimistic': 0, 'Pessimistic': 1, 'Neutral': 2}
subtask2_mapping = {'Appreciation': 0, 'Suggestion': 1, 'Criticism': 2, 'Offensive': 3, 'Nothing': 4}
default_subtask2_label = subtask2_mapping['Nothing']

dataset['SubTask1_Num'] = dataset['SubTask1'].map(subtask1_mapping)
dataset['SubTask2_Num'] = dataset['SubTask2'].map(subtask2_mapping)

print("Loading SentenceTransformer model...")
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

print("Encoding comments into embeddings...")
X = model.encode(dataset['Comments'].tolist(), show_progress_bar=True)

y1 = dataset['SubTask1_Num']
y2 = dataset['SubTask2_Num']

X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=42)
_, _, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=42)

print("Training SVM for SubTask1...")
svm1 = SVC(kernel='linear', C=0.1)
svm1.fit(X_train, y1_train)

subtask2_models = {}
for label in subtask1_mapping.values():
    subset_indices = y1_train == label
    y2_subset = y2_train[subset_indices]
    if len(np.unique(y2_subset)) > 1:
        svm2 = SVC(kernel='linear', C=0.1)
        svm2.fit(X_train[subset_indices], y2_subset)
        subtask2_models[label] = svm2

subtask1_preds = svm1.predict(X_test)

subtask2_preds = []
for i, subtask1_pred in enumerate(subtask1_preds):
    if subtask1_pred in subtask2_models:
        subtask2_preds.append(subtask2_models[subtask1_pred].predict(X_test[i].reshape(1, -1))[0])
    else:
        subtask2_preds.append(default_subtask2_label)

print("\nSubTask1 Classification Report:\n", classification_report(y1_test, subtask1_preds))
print("\nSubTask2 Classification Report:\n", classification_report(y2_test, subtask2_preds))


Loading SentenceTransformer model...
Encoding comments into embeddings...


Batches:   0%|          | 0/670 [00:00<?, ?it/s]

Training SVM for SubTask1...

SubTask1 Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.61      0.60      1408
           1       0.56      0.59      0.58      1422
           2       0.58      0.54      0.56      1454

    accuracy                           0.58      4284
   macro avg       0.58      0.58      0.58      4284
weighted avg       0.58      0.58      0.58      4284


SubTask2 Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.53      0.51       415
           1       0.46      0.50      0.48       514
           2       0.39      0.48      0.43       817
           3       0.40      0.34      0.37       605
           4       0.66      0.62      0.64      1933

    accuracy                           0.53      4284
   macro avg       0.48      0.49      0.49      4284
weighted avg       0.54      0.53      0.53      4284



Decision Tree

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

file_path = '/content/cleaned_balanced_dataset.csv'
dataset = pd.read_csv(file_path)

subtask1_mapping = {'Optimistic': 0, 'Pessimistic': 1, 'Neutral': 2}
subtask2_mapping = {'Appreciation': 0, 'Suggestion': 1, 'Criticism': 2, 'Offensive': 3, 'Nothing': 4}
default_subtask2_label = subtask2_mapping['Nothing']

dataset['SubTask1_Num'] = dataset['SubTask1'].map(subtask1_mapping)
dataset['SubTask2_Num'] = dataset['SubTask2'].map(subtask2_mapping)

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
X = model.encode(dataset['Comments'], show_progress_bar=True)

y1 = dataset['SubTask1_Num']
y2 = dataset['SubTask2_Num']

X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=42)
_, _, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=42)

dt1 = DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_split=10)
dt1.fit(X_train, y1_train)

train_acc_y1 = accuracy_score(y1_train, dt1.predict(X_train))
test_acc_y1 = accuracy_score(y1_test, dt1.predict(X_test))
print(f"SubTask1 - Train Accuracy: {train_acc_y1:.4f}")
print(f"SubTask1 - Test Accuracy: {test_acc_y1:.4f}")

subtask2_models = {}
for label in subtask1_mapping.values():
    subset_indices = y1_train == label
    y2_subset = y2_train[subset_indices]
    if len(np.unique(y2_subset)) > 1:
        dt2 = DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=10)
        dt2.fit(X_train[subset_indices], y2_subset)
        subtask2_models[label] = dt2

subtask1_preds = dt1.predict(X_test)

subtask2_preds = []
subtask2_train_preds = []
for i, subtask1_pred in enumerate(subtask1_preds):
    if subtask1_pred in subtask2_models:
        pred = subtask2_models[subtask1_pred].predict(X_test[i].reshape(1, -1))[0]
        subtask2_preds.append(pred)
    else:
        subtask2_preds.append(default_subtask2_label)

for i in range(len(X_train)):
    label = y1_train.iloc[i]
    if label in subtask2_models:
        pred = subtask2_models[label].predict(X_train[i].reshape(1, -1))[0]
        subtask2_train_preds.append(pred)
    else:
        subtask2_train_preds.append(default_subtask2_label)

train_acc_y2 = accuracy_score(y2_train, subtask2_train_preds)
test_acc_y2 = accuracy_score(y2_test, subtask2_preds)
print(f"SubTask2 - Train Accuracy: {train_acc_y2:.4f}")
print(f"SubTask2 - Test Accuracy: {test_acc_y2:.4f}")

print("SubTask1 Classification Report:\n", classification_report(y1_test, subtask1_preds))
print("SubTask2 Classification Report:\n", classification_report(y2_test, subtask2_preds))


Batches:   0%|          | 0/670 [00:00<?, ?it/s]

SubTask1 - Train Accuracy: 0.7615
SubTask1 - Test Accuracy: 0.6424
SubTask2 - Train Accuracy: 0.9455
SubTask2 - Test Accuracy: 0.6324
SubTask1 Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.76      0.74      1408
           1       0.59      0.68      0.63      1422
           2       0.61      0.50      0.55      1454

    accuracy                           0.64      4284
   macro avg       0.64      0.64      0.64      4284
weighted avg       0.64      0.64      0.64      4284

SubTask2 Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.79      0.74       415
           1       0.69      0.68      0.69       514
           2       0.52      0.62      0.57       817
           3       0.48      0.51      0.50       605
           4       0.72      0.63      0.67      1933

    accuracy                           0.63      4284
   macro avg       0.62      0.65      

MNB

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

file_path = '/content/cleaned_balanced_dataset.csv'
dataset = pd.read_csv(file_path)

subtask1_mapping = {'Optimistic': 0, 'Pessimistic': 1, 'Neutral': 2}
subtask2_mapping = {'Appreciation': 0, 'Suggestion': 1, 'Criticism': 2, 'Offensive': 3, 'Nothing': 4}
default_subtask2_label = subtask2_mapping['Nothing']

dataset['SubTask1_Num'] = dataset['SubTask1'].map(subtask1_mapping)
dataset['SubTask2_Num'] = dataset['SubTask2'].map(subtask2_mapping)

embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
X = embedder.encode(dataset['Comments'].tolist(), show_progress_bar=True)

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y1_train, y1_test = train_test_split(X_scaled, dataset['SubTask1_Num'].values, test_size=0.2, random_state=42)
_, _, y2_train, y2_test = train_test_split(X_scaled, dataset['SubTask2_Num'].values, test_size=0.2, random_state=42)

mnb1 = MultinomialNB()
mnb1.fit(X_train, y1_train)

subtask2_models = {}
for label in subtask1_mapping.values():
    indices = y1_train == label
    y2_subset = y2_train[indices]
    if len(np.unique(y2_subset)) > 1:
        mnb2 = MultinomialNB()
        mnb2.fit(X_train[indices], y2_subset)
        subtask2_models[label] = mnb2

subtask1_preds = mnb1.predict(X_test)

subtask2_preds = []
for i, subtask1_pred in enumerate(subtask1_preds):
    if subtask1_pred in subtask2_models:
        pred = subtask2_models[subtask1_pred].predict(X_test[i].reshape(1, -1))[0]
        subtask2_preds.append(pred)
    else:
        subtask2_preds.append(default_subtask2_label)

print("\nSubTask1 Classification Report:\n", classification_report(y1_test, subtask1_preds))
print("\nSubTask2 Classification Report:\n", classification_report(y2_test, subtask2_preds))


Generating sentence embeddings...


Batches:   0%|          | 0/670 [00:00<?, ?it/s]

Scaling features for MultinomialNB...
Training MNB for SubTask1...
Training separate MNBs for SubTask2 by SubTask1 category...

SubTask1 Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.35      0.41      1408
           1       0.48      0.46      0.47      1422
           2       0.45      0.61      0.52      1454

    accuracy                           0.47      4284
   macro avg       0.48      0.47      0.47      4284
weighted avg       0.48      0.47      0.47      4284


SubTask2 Classification Report:
               precision    recall  f1-score   support

           0       0.37      0.31      0.34       415
           1       0.28      0.23      0.26       514
           2       0.33      0.54      0.41       817
           3       0.21      0.01      0.03       605
           4       0.58      0.64      0.61      1933

    accuracy                           0.45      4284
   macro avg       0.35      0.35      0.33 

MLP

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

dataset = pd.read_csv('/content/cleaned_balanced_dataset.csv')

le1 = LabelEncoder()
dataset['SubTask1_Num'] = le1.fit_transform(dataset['SubTask1'])

le2 = LabelEncoder()
dataset['SubTask2_Num'] = le2.fit_transform(dataset['SubTask2'])

model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
sbert = SentenceTransformer(model_name)

X = sbert.encode(dataset['Comments'].tolist(), show_progress_bar=True)

y1 = np.array(dataset['SubTask1_Num'].values)
y2 = np.array(dataset['SubTask2_Num'].values)

X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(X, y1, y2, test_size=0.2, random_state=42)

input_dim = X.shape[1]
inputs = Input(shape=(input_dim,))
x = Dense(256, activation='relu')(inputs)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)

subtask1_output = Dense(3, activation='softmax', name='subtask1_output')(x)
subtask2_output = Dense(5, activation='softmax', name='subtask2_output')(x)

model = Model(inputs=inputs, outputs=[subtask1_output, subtask2_output])
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss={'subtask1_output': 'sparse_categorical_crossentropy',
                    'subtask2_output': 'sparse_categorical_crossentropy'},
              metrics={'subtask1_output': 'accuracy',
                       'subtask2_output': 'accuracy'})

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model.fit(X_train,
          {'subtask1_output': y1_train, 'subtask2_output': y2_train},
          validation_split=0.2,
          epochs=50,
          batch_size=32,
          callbacks=[early_stop])

results = model.evaluate(X_test, {'subtask1_output': y1_test, 'subtask2_output': y2_test})
print(f'Subtask 1 - Loss: {results[1]}, Accuracy: {results[2]}')
print(f'Subtask 2 - Loss: {results[3]}, Accuracy: {results[4]}')


Generating sentence embeddings...


Batches:   0%|          | 0/670 [00:00<?, ?it/s]

Epoch 1/50
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - loss: 2.5664 - subtask1_output_accuracy: 0.3602 - subtask1_output_loss: 1.0985 - subtask2_output_accuracy: 0.4247 - subtask2_output_loss: 1.4678 - val_loss: 2.3654 - val_subtask1_output_accuracy: 0.5018 - val_subtask1_output_loss: 1.0317 - val_subtask2_output_accuracy: 0.4825 - val_subtask2_output_loss: 1.3312
Epoch 2/50
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 2.4084 - subtask1_output_accuracy: 0.4649 - subtask1_output_loss: 1.0377 - subtask2_output_accuracy: 0.4554 - subtask2_output_loss: 1.3707 - val_loss: 2.2658 - val_subtask1_output_accuracy: 0.5315 - val_subtask1_output_loss: 0.9862 - val_subtask2_output_accuracy: 0.5032 - val_subtask2_output_loss: 1.2772
Epoch 3/50
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 2.3042 - subtask1_output_accuracy: 0.5105 - subtask1_output_loss: 0.9958 - subtask2_output_accuracy: 0.482

In [None]:
from sklearn.metrics import classification_report

y1_pred = np.argmax(model.predict(X_test)[0], axis=1)
y2_pred = np.argmax(model.predict(X_test)[1], axis=1)


print("Classification Report for SubTask 1:")
print(classification_report(y1_test, y1_pred, target_names=le1.classes_))

print("\nClassification Report for SubTask 2:")
print(classification_report(y2_test, y2_pred, target_names=le2.classes_))


[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Classification Report for SubTask 1:
              precision    recall  f1-score   support

     Neutral       0.69      0.67      0.68      1454
  Optimistic       0.79      0.88      0.84      1408
 Pessimistic       0.76      0.70      0.73      1422

    accuracy                           0.75      4284
   macro avg       0.75      0.75      0.75      4284
weighted avg       0.75      0.75      0.75      4284


Classification Report for SubTask 2:
              precision    recall  f1-score   support

Appreciation       0.90      0.84      0.87       415
   Criticism       0.63      0.54      0.58       817
     Nothing       0.71      0.84      0.77      1933
   Offensive       0.74      0.50      0.59       605
  Suggestion       0.80      0.79      0.79       514

    accuracy                           0.73      4284
   macro avg  

Bilstm

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

dataset = pd.read_csv('/content/cleaned_balanced_dataset.csv')

le1 = LabelEncoder()
dataset['SubTask1_Num'] = le1.fit_transform(dataset['SubTask1'])

le2 = LabelEncoder()
dataset['SubTask2_Num'] = le2.fit_transform(dataset['SubTask2'])

default_subtask2_label = le2.transform(['Nothing'])[0]

model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
sbert = SentenceTransformer(model_name)

max_len = 50
embed_dim = 384

tokenized_comments = [word_tokenize(comment.lower()) for comment in dataset['Comments']]
X_sequences = [sbert.encode(tokens, show_progress_bar=False, batch_size=32) for tokens in tokenized_comments]
X_padded = pad_sequences(X_sequences, maxlen=max_len, dtype='float32', padding='post', truncating='post')
X = np.array(X_padded)

y1 = np.array(dataset['SubTask1_Num'].values)
y2 = np.array(dataset['SubTask2_Num'].values)

X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(
    X, y1, y2, test_size=0.2, random_state=42
)

class_weights1 = compute_class_weight('balanced', classes=np.unique(y1_train), y=y1_train)
class_weight_dict1 = dict(enumerate(class_weights1))

class_weights2 = {}
for label in np.unique(y1_train):
    subset_indices = y1_train == label
    y2_subset = y2_train[subset_indices]
    if len(np.unique(y2_subset)) > 1:
        weights = compute_class_weight('balanced', classes=np.unique(y2_subset), y=y2_subset)
        class_weights2[label] = dict(enumerate(weights, start=0))

def create_bilstm_model(input_shape, num_classes):
    inputs = Input(shape=input_shape)
    x = Bidirectional(LSTM(64, return_sequences=True, kernel_regularizer=l2(0.01)))(inputs)
    x = Dropout(0.4)(x)
    x = Bidirectional(LSTM(64, return_sequences=False, kernel_regularizer=l2(0.01)))(x)
    x = Dropout(0.4)(x)
    x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
    x = Dropout(0.4)(x)
    outputs = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

subtask1_model = create_bilstm_model(input_shape=(max_len, embed_dim), num_classes=3)
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

subtask1_model.fit(
    X_train, y1_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    class_weight=class_weight_dict1,
    verbose=1
)

subtask1_train_preds = np.argmax(subtask1_model.predict(X_train), axis=1)
subtask1_train_accuracy = accuracy_score(y1_train, subtask1_train_preds)
print(f"SubTask1 Training Accuracy: {subtask1_train_accuracy:.4f}")

subtask2_models = {}
subtask2_train_accuracies = {}
for label in np.unique(y1_train):
    subset_indices = y1_train == label
    y2_subset = y2_train[subset_indices]
    if len(np.unique(y2_subset)) > 1:
        subtask2_model = create_bilstm_model(input_shape=(max_len, embed_dim), num_classes=5)
        subtask2_model.fit(
            X_train[subset_indices], y2_subset,
            validation_split=0.2,
            epochs=50,
            batch_size=32,
            callbacks=[early_stop],
            class_weight=class_weights2.get(label, None),
            verbose=1
        )
        subtask2_models[label] = subtask2_model
        subtask2_train_preds = np.argmax(subtask2_model.predict(X_train[subset_indices]), axis=1)
        subtask2_train_accuracies[label] = accuracy_score(y2_subset, subtask2_train_preds)
    else:
        subtask2_train_accuracies[label] = None

for label, acc in subtask2_train_accuracies.items():
    if acc is not None:
        print(f"SubTask2 Training Accuracy (SubTask1 Label {label}): {acc:.4f}")

subtask1_preds = np.argmax(subtask1_model.predict(X_test), axis=1)

subtask2_preds = []
for i, subtask1_pred in enumerate(subtask1_preds):
    if subtask1_pred in subtask2_models:
        subtask2_pred = np.argmax(subtask2_models[subtask1_pred].predict(X_test[i:i+1]), axis=1)[0]
        subtask2_preds.append(subtask2_pred)
    else:
        subtask2_preds.append(default_subtask2_label)

subtask1_test_accuracy = accuracy_score(y1_test, subtask1_preds)
print(f"\nSubTask1 Test Accuracy: {subtask1_test_accuracy:.4f}")
print("SubTask1 Classification Report:")
print(classification_report(y1_test, subtask1_preds, target_names=le1.classes_))

subtask2_test_accuracy = accuracy_score(y2_test, subtask2_preds)
print(f"\nSubTask2 Test Accuracy: {subtask2_test_accuracy:.4f}")
print("SubTask2 Classification Report:")
print(classification_report(y2_test, subtask2_preds, target_names=le2.classes_))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Tokenizing comments and generating embeddings...
Training BiLSTM for SubTask1...
Epoch 1/50
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 21ms/step - accuracy: 0.3340 - loss: 9.2015 - val_accuracy: 0.4195 - val_loss: 4.4257
Epoch 2/50
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.3850 - loss: 3.6522 - val_accuracy: 0.3988 - val_loss: 2.0983
Epoch 3/50
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - accuracy: 0.3962 - loss: 1.8645 - val_accuracy: 0.3959 - val_loss: 1.4048
Epoch 4/50
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - accuracy: 0.4049 - loss: 1.3368 - val_accuracy: 0.3985 - val_loss: 1.2043
Epoch 5/50
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - accuracy: 0.3969 - loss: 1.1840 - val_accuracy: 0.3991 - val_loss: 1.1398
Epoch 6/50
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.