<a href="https://colab.research.google.com/github/Pearlkakande/machinelearning/blob/main/MLmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries if needed:
!pip install datasets scikit-learn

import pandas as pd, numpy as np, torch, torch.nn as nn, torch.optim as optim, time
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load dataset
dataset = load_dataset("Eitanli/goodreads", split="train")
df = pd.DataFrame(dataset)
print("Dataset shape:", df.shape)

# Preprocess: use book descriptions as text. Missing values are replaced by an
# empty string first, so they contribute no tokens instead of being cast to a
# literal "nan"/"None" word by astype(str).
df['Description'] = df['Description'].fillna("").astype(str)

# Split indices for training and testing *before* fitting any preprocessing,
# so that nothing learned from the test rows can influence the features.
indices = np.arange(len(df))
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

# Compute TF-IDF features: the vocabulary and IDF weights are learned from the
# training descriptions only, then applied to every row (train and test).
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
vectorizer.fit(df['Description'].iloc[train_idx])
tfidf_features = vectorizer.transform(df['Description']).toarray()
features = torch.tensor(tfidf_features, dtype=torch.float32, device=device)

# Use aggregated average rating as target (regression task)
ratings = torch.tensor(df['Avg_Rating'].values, dtype=torch.float32, device=device)

# Keep the split as index tensors on the same device as the data so they can
# be used directly to index `features` and `ratings`.
train_idx = torch.tensor(train_idx, dtype=torch.long, device=device)
test_idx = torch.tensor(test_idx, dtype=torch.long, device=device)

# Define a simple linear regression model
class LinearMF(nn.Module):
    """Single linear layer that maps a feature vector to one scalar prediction.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return one prediction per sample in ``x``.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            The linear layer's output with its trailing size-1 dimension
            removed, e.g. shape ``(batch,)`` for a ``(batch, input_dim)``
            input.
        """
        # Squeeze only the output dimension. A bare ``squeeze()`` would also
        # drop a batch dimension of size 1 and return a 0-d tensor, which
        # cannot be indexed per-sample and mismatches a ``(1,)`` target.
        return self.linear(x).squeeze(-1)

# Build the model, the loss and the optimizer.
model1 = LinearMF(input_dim=features.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model1.parameters(), lr=0.001)

num_epochs = 30
train_losses, test_losses, epoch_times = [], [], []

# The split is fixed for the whole run, so gather each subset once instead of
# running the model over every row each epoch and indexing the output.
train_features, train_targets = features[train_idx], ratings[train_idx]
test_features, test_targets = features[test_idx], ratings[test_idx]

# CUDA kernels are launched asynchronously; without an explicit synchronize
# the wall-clock timer would stop before the queued work has finished.
sync_for_timing = device.type == "cuda"

# Full-batch training: each epoch is a single optimizer step on all training rows.
for epoch in range(num_epochs):
    if sync_for_timing:
        torch.cuda.synchronize()
    start_time = time.perf_counter()
    model1.train()
    optimizer.zero_grad()
    loss = criterion(model1(train_features), train_targets)
    loss.backward()
    optimizer.step()
    if sync_for_timing:
        torch.cuda.synchronize()
    # Only the training step is timed; evaluation below is excluded.
    epoch_time = time.perf_counter() - start_time

    model1.eval()
    with torch.no_grad():
        test_loss = criterion(model1(test_features), test_targets).item()

    # Read the scalar once; this is the training loss measured before this
    # epoch's parameter update was applied.
    train_loss = loss.item()
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    epoch_times.append(epoch_time)
    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss = {train_loss:.4f}, Test Loss = {test_loss:.4f}, Epoch Time = {epoch_time:.2f} sec, LR = {optimizer.param_groups[0]['lr']}")

# Draw the per-epoch training and test loss curves on one set of axes.
fig, ax = plt.subplots()
for curve, curve_label in ((train_losses, 'Train Loss'), (test_losses, 'Test Loss')):
    ax.plot(curve, label=curve_label)
ax.set_title("Linear MF Baseline Loss over Epochs")
ax.set_xlabel("Epoch")
ax.set_ylabel("MSE Loss")
ax.legend()
plt.show()

# Report the test loss recorded after the last epoch.
final_test_mse = test_losses[-1]
print("Final Test MSE:", final_test_mse)
