In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the train and test CSV files into DataFrames.
df_train = pd.read_csv(filepath_or_buffer='/content/df_train_full.csv')
df_test = pd.read_csv(filepath_or_buffer='/content/df_test_clean.csv')

In [None]:
# Show (rows, columns) for the train frame, then the test frame, one per line.
print(df_train.shape, df_test.shape, sep='\n')


In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# Frequency of each distinct value in the 'hasImage' column.
df_train.loc[:, 'hasImage'].value_counts()

In [None]:
# Keep only the numeric-dtype columns of the training frame.
df_train_num = df_train.select_dtypes(include=['number'])


In [None]:
# Pairwise Pearson correlation between the numeric training columns.
df_train_num.corr(method='pearson')

In [None]:
from transformers import RobertaTokenizer, RobertaModel
import torch
import numpy as np
from tqdm import tqdm

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the pretrained tokenizer/encoder pair, move the encoder to the chosen
# device and put it in evaluation mode (it is only used for inference below).
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base').to(device).eval()

def extract_embeddings(df, text_column='clean_title', batch_size=16):
    """Embed one text column of ``df`` with the module-level RoBERTa encoder.

    Texts are tokenized in batches (padded to the longest item of the batch,
    truncated to the tokenizer's maximum length), passed through ``model``
    without gradient tracking, and the hidden state at sequence position 0 is
    kept as the embedding of each text.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        df: DataFrame that contains ``text_column``.
        text_column: Name of the column to embed. Defaults to 'clean_title'.
        batch_size: Number of texts per forward pass. Must be positive.

    Returns:
        2-D NumPy array with one row per row of ``df``, in the same order.
        For an empty ``df`` an array of shape ``(0, hidden_size)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # The tokenizer only accepts str inputs; a single NaN (missing title) or
    # other non-string value would otherwise abort the whole run mid-way.
    # Missing values are embedded as the empty string so row alignment with
    # ``df`` is preserved.
    texts = ['' if pd.isna(text) else str(text) for text in df[text_column].tolist()]

    if not texts:
        # np.vstack raises on an empty list; return a correctly shaped empty
        # matrix instead so downstream code can still concatenate/save it.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    features = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]
        encoded = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**encoded)
        # Position 0 holds the leading special token the tokenizer prepends;
        # its hidden state is used as the sentence-level representation.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        features.append(cls_embeddings)

    return np.vstack(features)

# Embed the training titles first, then the test titles.
X_train_bert, X_test_bert = map(extract_embeddings, (df_train, df_test))

In [None]:
import numpy as np
import os
from google.colab import drive

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define the folder path and make sure it exists.
#    np.save does not create missing directories, so without this a missing
#    folder would raise FileNotFoundError after the embeddings were computed.
folder_path = '/content/drive/MyDrive/multimodel_dataset_extracted'
os.makedirs(folder_path, exist_ok=True)

# 3. Define the full file paths
train_save_path = os.path.join(folder_path, 'X_text_emb_train.npy')
test_save_path = os.path.join(folder_path, 'X_text_emb_test.npy')

# 4. Save the NumPy arrays produced by extract_embeddings
np.save(train_save_path, X_train_bert)
np.save(test_save_path, X_test_bert)

print(f"✅ Training embeddings saved successfully to: {train_save_path}")
print(f"✅ Testing embeddings saved successfully to: {test_save_path}")

In [None]:
# Columns removed from both frames before the numeric preprocessing below.
cols_delete = [
    'clean_title', 'author', 'subreddit', 'domain', 'created_utc',
    'hasImage', 'id', 'image_url', 'linked_submission_id', 'title',
]
df_train_mod = df_train.drop(cols_delete, axis=1)
df_test_mod = df_test.drop(cols_delete, axis=1)

In [None]:
# Rebuild the reduced test frame from df_test (drops the cols_delete columns
# again) and preview the first five rows of the reduced training frame.
df_test_mod = df_test.drop(cols_delete, axis=1)
df_train_mod.head(n=5)

In [None]:
# Display all rows of the first three columns of the reduced test frame.
df_test_mod.iloc[:, 0:3]

In [None]:
from scipy.stats import skew

# Sample skewness of the non-missing training 'num_comments' values.
print(
    "Skewness:",
    skew(df_train_mod.loc[:, 'num_comments'].dropna()),
)


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Step 1: log(1 + x) transform of the training 'num_comments' to reduce skewness.
df_train_mod['num_comments_log'] = np.log1p(df_train_mod.loc[:, 'num_comments'])

# Step 2: standardize the log-transformed column. The fitted `scaler` is kept
# so the test set can be transformed with the training statistics.
scaler = StandardScaler().fit(df_train_mod[['num_comments_log']])
df_train_mod['num_comments_scaled'] = scaler.transform(df_train_mod[['num_comments_log']])



In [None]:
# Mirror the training pipeline on the test set: log(1 + x), then scale with the
# already-fitted `scaler` (transform only, no re-fit on test data).
df_test_mod['num_comments_log'] = np.log1p(df_test_mod.loc[:, 'num_comments'])
df_test_mod['num_comments_scaled'] = scaler.transform(
    df_test_mod[['num_comments_log']]
)

In [None]:
from scipy.stats import skew

# Sample skewness of the non-missing training 'score' values.
print(
    "Skewness:",
    skew(df_train_mod.loc[:, 'score'].dropna()),
)


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Step 1: log(1 + x) transform of the training 'score' to reduce skewness.
# NOTE(review): log1p is NaN/-inf for values <= -1 — confirm 'score' is never negative.
df_train_mod['score_log'] = np.log1p(df_train_mod.loc[:, 'score'])

# Step 2: standardize the log-transformed column. This rebinds `scaler`, so from
# here on `scaler` holds the statistics of 'score_log', not 'num_comments_log'.
scaler = StandardScaler().fit(df_train_mod[['score_log']])
df_train_mod['score_scaled'] = scaler.transform(df_train_mod[['score_log']])



In [None]:
# Apply the training 'score' pipeline to the test set.
# 1. Same log(1 + x) transform as used for the training 'score'.
# NOTE(review): log1p is NaN/-inf for values <= -1 — confirm 'score' is never negative.
df_test_mod['score_log'] = np.log1p(df_test_mod['score'])

# 2. Scale with the scaler that was fitted on the TRAINING 'score_log' column.
#    Only transform here: re-fitting on the test set would leak test statistics
#    and make train/test features inconsistent. The result goes into
#    'score_scaled' so the existing 'num_comments_scaled' column is untouched.
#    `scaler` must be the one fitted on df_train_mod[['score_log']].
df_test_mod['score_scaled'] = scaler.transform(df_test_mod[['score_log']])


In [None]:
# Remove the raw 'score' and 'num_comments' columns from both frames.
# Note: this cell is not re-runnable; a second run raises KeyError because the
# columns are already gone.
df_train_mod = df_train_mod.drop(['score', 'num_comments'], axis=1)
df_test_mod = df_test_mod.drop(['score', 'num_comments'], axis=1)

In [None]:
# Preview the first five rows of the preprocessed training frame.
df_train_mod.head(n=5)