<a href="https://colab.research.google.com/github/Pearlkakande/machinelearning/blob/main/Model8finale1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary packages (uncomment if needed)
!pip install datasets torch wandb sentence-transformers scikit-learn
!pip install torch-scatter torch-sparse torch-cluster torch-geometric -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
import torch
import torch.nn.functional as F
from torch_geometric.data import HeteroData, DataLoader
from torch_geometric.nn import GCNConv, GATConv  # and other layers as needed
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import wandb
import numpy as np
import pandas as pd


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [None]:
!pip install torch_geometric

GRAPH CONSTRUCTION THEN MODEL

In [4]:
# (If running in Colab, uncomment and install PyG)
# !pip install torch torch_geometric sentence-transformers datasets scikit-learn

import os
import pickle
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

from torch_geometric.data import HeteroData
from torch_geometric.nn import GCNConv, GATConv


In [5]:
from datasets import load_dataset

# Load & preprocess
raw = load_dataset("Eitanli/goodreads")["train"]
df = pd.DataFrame(raw)
df['Description'] = df['Description'].fillna('')
df['Genres']      = df['Genres'].fillna('')
df['combined_features'] = (
    df['Book'] + ' ' + df['Author'] + ' ' +
    df['Description'] + ' ' + df['Genres']
)

# Normalize ratings
df['Num_Ratings'] = (
    df['Num_Ratings'].astype(str)
       .str.replace(',', '')
       .astype(float)
       .fillna(0)
)
df['Avg_Rating'] = pd.to_numeric(df['Avg_Rating'], errors='coerce').fillna(0)

print(f"Dataset ready: {len(df)} books")


Repo card metadata block was not found. Setting CardData to empty.


Dataset ready: 10000 books


In [6]:
# Sentence-Transformer embeddings
st = SentenceTransformer('all-MiniLM-L6-v2')
df['desc_emb']  = list(st.encode(df['Description'].tolist(), show_progress_bar=True))
df['title_emb'] = list(st.encode(df['Book'].tolist(), show_progress_bar=True))
df['genre_emb'] = list(st.encode(df['Genres'].tolist(), show_progress_bar=True))

embeddings = {
    'description': np.stack(df['desc_emb']),
    'title':       np.stack(df['title_emb']),
    'genre':       np.stack(df['genre_emb'])
}
with open('book_embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

# TF–IDF + SVD
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
svd   = TruncatedSVD(n_components=100, random_state=42)
pipe  = Pipeline([('tfidf', tfidf), ('svd', svd)])
tfidf_feats = pipe.fit_transform(df['combined_features'])
with open('tfidf_pipeline.pkl', 'wb') as f:
    pickle.dump(pipe, f)
np.save('tfidf_features.npy', tfidf_feats)

print("Saved embeddings & TF–IDF artifacts.")


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Saved embeddings & TF–IDF artifacts.


In [8]:
def build_hetero_graph(df, sim_thresh=0.8):
    data = HeteroData()
    book_emb = np.stack(df['desc_emb'].values)
    data['book'].x = torch.tensor(book_emb, dtype=torch.float)

    # Authors
    authors = df['Author'].unique().tolist()
    a2i = {a: i for i, a in enumerate(authors)}
    data['author'].num_nodes = len(authors)
    data['author'].x = F.one_hot(
        torch.arange(len(authors)), num_classes=len(authors)
    ).float()

    # Genres
    uniq_g = {
        g for s in df['Genres']
           for g in s.replace(',', ' ').split() if g
    }
    genres = sorted(uniq_g)
    g2i    = {g: i for i, g in enumerate(genres)}
    data['genre'].num_nodes = len(genres)
    data['genre'].x = F.one_hot(
        torch.arange(len(genres)), num_classes=len(genres)
    ).float()

    # book→author edges
    ba = [
        (i, a2i[a]) for i, a in enumerate(df['Author'])
        if a in a2i
    ]
    data['book','written_by','author'].edge_index = torch.tensor(ba, dtype=torch.long).t()

    # book→genre edges
    bg = []
    for i, s in enumerate(df['Genres']):
        for g in s.replace(',', ' ').split():
            if g in g2i:
                bg.append((i, g2i[g]))
    data['book','has_genre','genre'].edge_index = torch.tensor(bg, dtype=torch.long).t()

    # book↔book similarity
    sim = cosine_similarity(book_emb)
    i,j = np.where((sim>sim_thresh)&(np.arange(len(df))[:,None]!=np.arange(len(df))))
    data['book','similar_to','book'].edge_index = torch.tensor([i,j],dtype=torch.long)

    # Ratings
    data['book'].ratings_count = torch.tensor(df['Num_Ratings'].values, dtype=torch.float)
    data['book'].avg_rating     = torch.tensor(df['Avg_Rating'].values, dtype=torch.float)

    return data, df

graph_data, proc_df = build_hetero_graph(df)
proc_df.to_csv('books_data.csv', index=False)
with open('book_graph.pkl','wb') as f:
    pickle.dump(graph_data, f)
print("Graph built and saved.")


Graph built and saved.


In [9]:
class GNNRecommender(torch.nn.Module):
    def __init__(self, in_ch, hidden_ch):
        super().__init__()
        # **Hidden = in_ch (384) so GNN outputs 384 dims**
        self.book_encoder = torch.nn.Sequential(
            GCNConv(in_ch, hidden_ch),
            torch.nn.ReLU(),
            GCNConv(hidden_ch, hidden_ch),
        )
    def encode_books(self, data):
        x  = data['book'].x
        ei = data['book','similar_to','book'].edge_index
        x  = F.relu(self.book_encoder[0](x, ei))
        return self.book_encoder[2](x, ei)

def train_gnn(data, epochs=5):
    ic = data['book'].x.size(1)  # should be 384
    model = GNNRecommender(ic, ic)
    opt   = torch.optim.Adam(model.parameters(), lr=0.01)
    ei    = data['book','similar_to','book'].edge_index

    for e in range(epochs):
        model.train(); opt.zero_grad()
        emb = model.encode_books(data)
        src,dst = ei
        pos = (emb[src]*emb[dst]).sum(dim=1)
        neg_dst = dst[torch.randperm(len(dst))]
        neg = (emb[src]*emb[neg_dst]).sum(dim=1)
        loss = -torch.log(torch.sigmoid(pos)).mean() \
               -torch.log(1-torch.sigmoid(neg)).mean()
        loss.backward(); opt.step()
        print(f"Epoch {e+1}/{epochs} — loss: {loss:.4f}")

    return model

gnn_model = train_gnn(graph_data, epochs=5)
torch.save(gnn_model.state_dict(), 'gnn_model.pt')
print("GNN model saved (384→384).")


Epoch 1/5 — loss: 1.3637
Epoch 2/5 — loss: inf
Epoch 3/5 — loss: nan
Epoch 4/5 — loss: nan
Epoch 5/5 — loss: nan
GNN model saved (384→384).


In [11]:
from torch_geometric.nn import GCNConv  # needed by module import
# Reload everything for inference
df     = pd.read_csv('books_data.csv')
emb    = pickle.load(open('book_embeddings.pkl','rb'))
tfidf_p= pickle.load(open('tfidf_pipeline.pkl','rb'))
tfidf_f= np.load('tfidf_features.npy')
graph  = pickle.load(open('book_graph.pkl','rb'))

# Rebuild model architecture
model = GNNRecommender(graph['book'].x.size(1), graph['book'].x.size(1))
model.load_state_dict(torch.load('gnn_model.pt'))
model.eval()

def recommend_with_gnn(genre, desc, model, data, df, top_n=5):
    st  = SentenceTransformer('all-MiniLM-L6-v2')
    q384 = st.encode([f"{genre} {desc}"])[0]
    be   = model.encode_books(data).detach().numpy()
    # Check for and replace NaN values in 'be' with 0
    be = np.nan_to_num(be)  # Replace NaN with 0

    sims = cosine_similarity([q384], be)[0]
    idx  = sims.argsort()[-top_n:][::-1]
    recs = df.iloc[idx].copy()
    recs['score'] = sims[idx]
    return recs[['Book','Author','Genres','score']]

# Run it:
res = recommend_with_gnn(
    "Science Fiction",
    "space exploration and alien civilizations",
    model, graph, df, top_n=5
)
print(res)


                         Book             Author  \
16            The Kite Runner    Khaled Hosseini   
17  The Giver (The Giver, #1)         Lois Lowry   
18            The Giving Tree   Shel Silverstein   
19            Charlotte's Web         E.B. White   
20               Little Women  Louisa May Alcott   

                                               Genres  score  
16  ['Fiction', 'Historical Fiction', 'Classics', ...    0.0  
17  ['Young Adult', 'Fiction', 'Classics', 'Dystop...    0.0  
18  ['Childrens', 'Fiction', 'Picture Books', 'Cla...    0.0  
19  ['Classics', 'Fiction', 'Childrens', 'Fantasy'...    0.0  
20  ['Classics', 'Fiction', 'Historical Fiction', ...    0.0  


In [12]:
# 1. Make sure Drive is mounted (you only need to run this once per session)
# from google.colab import drive
# drive.mount('/content/drive')

# 2. List all the files you want to preserve
artifacts = [
    'book_embeddings.pkl',
    'tfidf_pipeline.pkl',
    'tfidf_features.npy',
    'books_data.csv',
    'book_graph.pkl',
    'gnn_model.pt',
    'hybrid_recommender.py',  # if you generated it
]

# 3. Destination folder in your Drive
dst_folder = '/content/drive/MyDrive/ColabArtifacts'
import os
os.makedirs(dst_folder, exist_ok=True)

# 4. Copy them over
import shutil
for fname in artifacts:
    if os.path.exists(fname):
        shutil.copy(fname, dst_folder)
    else:
        print(f"⚠️ File not found, skipping: {fname}")

print(f"✅ Artifacts copied to {dst_folder}")


⚠️ File not found, skipping: hybrid_recommender.py
✅ Artifacts copied to /content/drive/MyDrive/ColabArtifacts
