In [None]:
pip install transformers



In [None]:
pip install SentencePiece



In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_recall_fscore_support
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from transformers import RobertaTokenizer, RobertaModel
import torch


In [None]:
url = 'https://drive.google.com/uc?id=12M_H4oziPEU5V0ee46wMbLvIPuEHj1HK'
data = pd.read_csv(url)
data = data.rename(columns={'human_label1': 'feelLonely'}).drop('human_label2', axis=1)
X = data['text']
y = data['feelLonely']

In [None]:
# Initialize the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Initialize the RoBERTa model
model = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True)

X_embeddings = []

max_length = 128

for text in X:
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    embeddings = outputs.hidden_states[-1].detach().numpy()
    X_embeddings.append(embeddings.mean(axis=1))

X_embeddings = np.vstack(X_embeddings)

# Apply Latent Semantic Analysis (LSA) for dimensionality reduction
n_components = 100  # Adjust the number of components as needed
lsa = TruncatedSVD(n_components=n_components, random_state=42)
X_lsa = lsa.fit_transform(X_embeddings)  # Corrected variable name

# Data Augmentation using SMOTE
smote = SMOTE(random_state=42)
X_augmented, y_augmented = smote.fit_resample(X_lsa, y)

# Split the augmented data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_augmented, y_augmented, test_size=0.2, random_state=42)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
param_grid = {
    'hidden_layer_sizes': [(100,), (200,), (100, 50), (150, 75)],
    'alpha': [0.0001, 0.001, 0.01],
}

mlp_classifier = MLPClassifier(max_iter=500, random_state=42)

grid_search = GridSearchCV(mlp_classifier, param_grid, cv=3)
grid_search.fit(X_train, y_train)

best_hidden_layer_sizes = grid_search.best_params_['hidden_layer_sizes']
best_alpha = grid_search.best_params_['alpha']

mlp_classifier = MLPClassifier(hidden_layer_sizes=best_hidden_layer_sizes, alpha=best_alpha, max_iter=500, random_state=42)

voting_classifier = VotingClassifier(estimators=[('mlp', mlp_classifier)], voting='hard')

voting_classifier.fit(X_train, y_train)

y_pred = voting_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
precision, recall, _, _ = precision_recall_fscore_support(y_test, y_pred)

print("Test Set Accuracy:", accuracy)
print("Test Set F1 Score:", f1)
print("Test Set Precision:", precision[1])
print("Test Set Recall:", recall[1])
print("Classification Report:\n", classification_report(y_test, y_pred))




Test Set Accuracy: 0.8361650485436893
Test Set F1 Score: 0.8371531966224367
Test Set Precision: 0.8164705882352942
Test Set Recall: 0.8589108910891089
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.81      0.84       840
         1.0       0.82      0.86      0.84       808

    accuracy                           0.84      1648
   macro avg       0.84      0.84      0.84      1648
weighted avg       0.84      0.84      0.84      1648

