<a href="https://colab.research.google.com/github/UBGidado/My_Research/blob/main/SocialIQa_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers torch pandas numpy



In [None]:
# Attach Google Drive to the Colab filesystem so outputs can be written to it.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Loading the Dataset

In [None]:
from datasets import load_dataset

# Fetch Social IQa by its namespaced identifier; the result is kept in
# `social_iqa` and indexed by split name in the cells that follow.
dataset_id = "allenai/social_i_qa"
social_iqa = load_dataset(dataset_id)

In [None]:
# Show the column schema (feature names and their types) of the training split
train_split = social_iqa["train"]
print(train_split.features)

{'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answerA': Value(dtype='string', id=None), 'answerB': Value(dtype='string', id=None), 'answerC': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None)}


In [None]:
import pandas as pd  # used further down to build the DataFrame from the training split
# Bare last expression: the notebook displays the column names of each split.
social_iqa.column_names

{'train': ['context', 'question', 'answerA', 'answerB', 'answerC', 'label'],
 'validation': ['context',
  'question',
  'answerA',
  'answerB',
  'answerC',
  'label']}

In [None]:
import os

# Set the save path in your Drive
save_path = '/content/drive/MyDrive/social_iqa_data'  # Change 'social_iqa_data' to your desired folder name
os.makedirs(save_path, exist_ok=True)  # Create the folder if it doesn't exist

# (Re)load the Social IQa dataset, using the same identifier as the first load
# earlier in this notebook so both cells always refer to the same dataset.
try:
    social_iqa = load_dataset("allenai/social_i_qa")
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Please check the dataset name and your internet connection.")
    # Re-raise instead of calling exit(): exit() ends the kernel session in a
    # notebook, whereas re-raising stops execution here with the full traceback
    # and leaves the kernel (and every variable defined so far) intact.
    raise

### Feature Extraction

In [None]:
# Step 4 dependencies: tokenizer/encoder classes plus tensor and array libraries.
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Social IQa pairs a social-scenario context and question with three candidate
# answers. Keep only those text columns from the training split (the `label`
# column is not selected).
text_columns = ['context', 'question', 'answerA', 'answerB', 'answerC']
df = pd.DataFrame(social_iqa["train"])[text_columns]

# Step 4: Convert Text to Embeddings (BERT/RoBERTa)
# The tokenizer and the encoder are loaded from the same checkpoint name.
encoder_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(encoder_name)
model = AutoModel.from_pretrained(encoder_name)

# Function to extract sentence embeddings
def get_embedding(text):
    """Return a mean-pooled embedding of ``text`` as a NumPy array.

    The text is tokenized (truncated to at most 128 tokens), passed through the
    module-level ``model``, and the last hidden state is averaged over the
    token dimension (``dim=1``). Size-1 dimensions are then squeezed out.

    Args:
        text: Input passed straight to the module-level ``tokenizer``
            (assumed to be a single string — the caller applies this per row).

    Returns:
        numpy.ndarray holding the pooled embedding.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    # Inference only: disable autograd so no computation graph is built and
    # held for every call; this cuts memory use and speeds up the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    # Tensors produced under no_grad do not require grad, so they convert to
    # NumPy directly without an explicit detach().
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Embed every context string; each cell of the new column holds one vector.
df["context_embedding"] = df["context"].map(get_embedding)

# Stack the per-row vectors into a single 2-D array (one row per example) so
# they can be used in the graph, and write that array to the working directory.
context_embeddings = np.vstack(df["context_embedding"].tolist())
np.save("socialiqa_embeddings.npy", context_embeddings)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Persist the results under `save_path`: the text table as CSV (without the
# embedding column) and the stacked embedding matrix as a .npy file.
csv_path = os.path.join(save_path, "socialiqa_cleaned.csv")
npy_path = os.path.join(save_path, "socialiqa_embeddings.npy")

text_only = df.drop(columns=["context_embedding"])
text_only.to_csv(csv_path, index=False)
np.save(npy_path, context_embeddings)

print(f"✅ Preprocessing Complete: 'socialiqa_cleaned.csv' and 'socialiqa_embeddings.npy' saved to {save_path}!")

✅ Preprocessing Complete: 'socialiqa_cleaned.csv' and 'socialiqa_embeddings.npy' saved to /content/drive/MyDrive/social_iqa_data!
