In [3]:
import pandas as pd
# Read the merged dataset CSV from the Kaggle input directory into a DataFrame.
merged_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/dataframe/merged_df.csv',
)

<h2>This takes a very long time to complete because of limited compute resources</h2>

In [2]:
# Fill NaN values in text columns with an empty string, so that string
# concatenation on these columns does not yield NaN.
text_columns = ['title_x', 'text', 'title_y', 'details']
merged_df[text_columns] = merged_df[text_columns].fillna('')

# Replace every remaining NaN (i.e. in the non-text columns) with 0.
# Re-assigning instead of using inplace=True keeps the cell idempotent and
# avoids mutating an object that other names may still refer to.
# NOTE(review): this also turns a missing value in any numeric column into a
# literal 0 -- confirm that 0 is an acceptable placeholder for every column.
merged_df = merged_df.fillna(0)

# Now all columns should have the same number of non-null values.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that produced a stray "None" line after the report).
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484007 entries, 0 to 484006
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   rating             484007 non-null  float64
 1   title_x            484007 non-null  object 
 2   text               484007 non-null  object 
 3   helpful_vote       484007 non-null  int64  
 4   verified_purchase  484007 non-null  bool   
 5   title_y            484007 non-null  object 
 6   average_rating     484007 non-null  float64
 7   rating_number      484007 non-null  int64  
 8   price              484007 non-null  float64
 9   details            484007 non-null  object 
 10  x_length           484007 non-null  int64  
 11  y_length           484007 non-null  int64  
 12  de_length          484007 non-null  int64  
 13  review_length      484007 non-null  int64  
dtypes: bool(1), float64(3), int64(6), object(4)
memory usage: 48.5+ MB
None


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Assuming merged_df is already defined and contains the necessary columns

# Select a subset of the data. The sample size is capped at the frame length so
# that a merged_df with fewer than 10000 rows does not make sample() raise.
# .copy() makes the subset an independent frame before a column is added to it.
subset_df = merged_df.sample(n=min(10000, len(merged_df)), random_state=42).copy()

# Combine text columns into one for BERT embeddings
subset_df['combined_text'] = subset_df['text'] + ' ' + subset_df['title_x'] + ' ' + subset_df['title_y'] + ' ' + subset_df['details']

# Numerical side-features fed to the model next to the text.
numerical_features = subset_df[['price', 'average_rating', 'rating_number', 'helpful_vote', 'x_length', 'y_length', 'de_length', 'review_length']]

# Define the target variable
y = subset_df['rating'].values

# Split the data into training and testing sets BEFORE scaling: the raw
# numerical features are split here and normalised afterwards, so the scaler
# never sees the test rows. The split indices depend only on the number of rows
# and random_state, so text/target partitions are the same as when the
# already-scaled array was split.
text_train, text_test, num_train_raw, num_test_raw, y_train, y_test = train_test_split(
    subset_df['combined_text'].values, numerical_features, y, test_size=0.2, random_state=42
)

# Normalize numerical features. Mean/variance are estimated on the training
# rows only and then applied unchanged to the test rows; fitting on the whole
# subset would leak test-set statistics into training.
scaler = StandardScaler()
num_train = scaler.fit_transform(num_train_raw)
num_test = scaler.transform(num_test_raw)

# Kept so that any code relying on this name still finds it: the full subset
# transformed with the train-fitted scaler.
normalized_numerical_features = scaler.transform(numerical_features)

In [5]:
class ReviewDataset(Dataset):
    """Map-style dataset that pairs a text with numerical features and a target.

    Each item is tokenized on access, padded/truncated to ``max_len`` tokens,
    and returned together with the row's numerical features and target.

    Args:
        texts: Indexable collection of strings; ``texts[i]`` is tokenized.
        numerical_features: Indexable collection; ``numerical_features[i]`` is
            converted to a float tensor.
        targets: Indexable collection; ``targets[i]`` is converted to a float
            tensor.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, numerical_features, targets, tokenizer, max_len):
        self.texts = texts
        self.numerical_features = numerical_features
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples (the length of ``texts``)."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the tokenized text, numerical features and target at ``index``.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` (both
            flattened to 1-D), ``'numerical_features'`` and ``'target'``
            (both float tensors).
        """
        text = self.texts[index]
        numerical_features = self.numerical_features[index]
        target = self.targets[index]

        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # transformers in favour of ``__call__``, which takes the same
        # keyword arguments for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten() drops it
            # so the DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'numerical_features': torch.tensor(numerical_features, dtype=torch.float),
            'target': torch.tensor(target, dtype=torch.float)
        }

In [6]:
class TransformerRegressor(nn.Module):
    """Regression head on top of a pretrained BERT encoder plus numerical features.

    The encoder's pooled output is concatenated with the numerical features,
    passed through dropout and mapped to a single value by a linear layer.

    Args:
        transformer_model_name: Name or path handed to
            ``BertModel.from_pretrained``.
        n_numerical_features: Number of numerical features per example that
            are appended to the encoder output.
    """

    def __init__(self, transformer_model_name, n_numerical_features):
        super().__init__()
        self.transformer = BertModel.from_pretrained(transformer_model_name)
        self.drop = nn.Dropout(p=0.3)
        # The head sees the encoder's hidden vector and the numerical features side by side.
        head_in_features = self.transformer.config.hidden_size + n_numerical_features
        self.out = nn.Linear(head_in_features, 1)

    def forward(self, input_ids, attention_mask, numerical_features):
        """Return one prediction per row, shaped ``(batch, 1)``."""
        encoder_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is the pooled representation.
        pooled = encoder_outputs[1]
        fused = torch.cat((pooled, numerical_features), dim=1)
        # Dropout is applied to the concatenated vector, i.e. to both parts.
        return self.out(self.drop(fused))


In [7]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``numerical_features`` keyword arguments.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'``, ``'numerical_features'``
            and ``'target'``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Not used by this function; kept for call compatibility.

    Returns:
        The arithmetic mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []

    for batch in data_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        extra = batch['numerical_features'].to(device)
        labels = batch['target'].to(device)

        optimizer.zero_grad()
        predictions = model(input_ids=ids, attention_mask=mask, numerical_features=extra)
        # Targets are 1-D per batch; add a trailing axis before computing the loss.
        batch_loss = loss_fn(predictions, labels.unsqueeze(1))
        batch_loss.backward()

        optimizer.step()
        scheduler.step()
        batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``numerical_features`` keyword arguments.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'``, ``'numerical_features'``
            and ``'target'``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.
        n_examples: Not used by this function; kept for call compatibility.

    Returns:
        A tuple ``(mean_loss, predictions)``: the arithmetic mean of the
        per-batch losses, and the model outputs of all batches concatenated
        along axis 0 as a NumPy array.
    """
    model.eval()
    batch_losses, batch_preds = [], []
    tensor_keys = ('input_ids', 'attention_mask', 'numerical_features', 'target')

    with torch.no_grad():
        for batch in data_loader:
            on_device = {key: batch[key].to(device) for key in tensor_keys}
            predictions = model(
                input_ids=on_device['input_ids'],
                attention_mask=on_device['attention_mask'],
                numerical_features=on_device['numerical_features'],
            )
            # Targets are 1-D per batch; add a trailing axis before computing the loss.
            batch_loss = loss_fn(predictions, on_device['target'].unsqueeze(1))
            batch_losses.append(batch_loss.item())
            batch_preds.append(predictions.cpu().numpy())

    mean_loss = np.mean(batch_losses)
    all_preds = np.concatenate(batch_preds, axis=0)
    return mean_loss, all_preds


In [None]:
# Seed torch so that DataLoader shuffling, dropout and the initialisation of the
# regression head are reproducible from one run to the next.
torch.manual_seed(42)

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create PyTorch datasets and dataloaders
train_dataset = ReviewDataset(text_train, num_train, y_train, tokenizer, max_len=512)
test_dataset = ReviewDataset(text_test, num_test, y_test, tokenizer, max_len=512)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize the model, optimizer, and scheduler
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerRegressor(transformer_model_name='bert-base-uncased', n_numerical_features=num_train.shape[1])
model = model.to(device)

# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are passed explicitly so
# the hyper-parameters do not silently shift to torch's defaults (1e-8 / 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# The scheduler is stepped once per batch, so its horizon is
# batches-per-epoch * epochs. Derive it from num_epochs instead of repeating
# the literal, so that changing the epoch count cannot desynchronise the two.
num_epochs = 10
total_steps = len(train_loader) * num_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.MSELoss().to(device)

# Training loop
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device, scheduler, len(train_dataset))
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}')

# Evaluation
test_loss, preds = eval_model(model, test_loader, loss_fn, device, len(test_dataset))
print(f'Test Loss: {test_loss:.4f}')

# Calculate and print the Mean Squared Error.
# test_loss above is the mean of per-batch means, which differs from the
# per-example MSE whenever the last batch is smaller than the others; the value
# below is computed over all test examples at once.
y_pred = preds.flatten()
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

