In [None]:
!pip install pandas scikit-learn sentence-transformers




In [None]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Load the 20 Newsgroups posts with headers, footers and quoted text removed.
newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))

# One row per post: a synthetic "Movie_<row number>" title used for lookups,
# plus the post body as the "overview" text.
df = pd.DataFrame({
    "title": [f"Movie_{i}" for i in range(len(newsgroups.data))],
    "overview": newsgroups.data
})

# Preview the frame. A bare `df.head()` used to precede this line; because it
# was not the last expression of the cell its result was discarded, so it was
# removed as dead code.
print(df)

             title                                           overview
0          Movie_0  I was wondering if anyone out there could enli...
1          Movie_1  A fair number of brave souls who upgraded thei...
2          Movie_2  well folks, my mac plus finally gave up the gh...
3          Movie_3  \nDo you have Weitek's address/phone number?  ...
4          Movie_4  From article <C5owCB.n3p@world.std.com>, by to...
...            ...                                                ...
11309  Movie_11309  DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...
11310  Movie_11310  I have a (very old) Mac 512k and a Mac Plus, b...
11311  Movie_11311  I just installed a DX2-66 CPU in a clone mothe...
11312  Movie_11312  \nWouldn't this require a hyper-sphere.  In 3-...
11313  Movie_11313  Stolen from Pasadena between 4:30 and 6:30 pm ...

[11314 rows x 2 columns]


In [None]:
# Replace any missing overview with an empty string so every row holds text
# before it is vectorised.
df["overview"] = df["overview"].fillna("")


In [None]:
# TF-IDF vectoriser: English stop words are dropped and the vocabulary is
# capped at 5000 terms (max_features).
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)

# Learn the vocabulary from the overviews and vectorise them in one call.
# Presumably one row per post and one column per kept term — TODO confirm.
tfidf_matrix = vectorizer.fit_transform(df["overview"])


In [None]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix.
# NOTE(review): with the 11314 posts loaded above this is an 11314 x 11314
# result; if it comes back as a dense float64 array that is roughly 1 GB —
# confirm the kernel has enough memory.
cosine_sim = cosine_similarity(tfidf_matrix)


In [None]:
def recommend_movies(title, top_n=5):
    """Return the titles of the entries most similar to ``title``.

    Similarities are read from the module-level ``cosine_sim`` matrix, whose
    rows and columns follow the row order of ``df``.

    Args:
        title: Value from ``df["title"]`` identifying the query entry.
        top_n: Maximum number of recommendations to return.

    Returns:
        A ``pandas.Series`` of titles ordered from most to least similar, or
        the string ``"Movie not found!"`` when ``title`` is not in ``df``.
    """
    # Map each title to its row *position*: both cosine_sim and .iloc below are
    # positional, so positions (not index labels) are what we need.
    indices = pd.Series(range(len(df)), index=df["title"])

    if title not in indices:
        return "Movie not found!"

    idx = indices[title]

    # Exclude the query row explicitly instead of assuming it sorts first.
    # A duplicate post ties with it at the top, and an all-zero (empty) post
    # has zero similarity even to itself; in both cases skipping "element 0"
    # would recommend the query itself and drop a real neighbour.
    sim_scores = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]

    # list.sort is stable, so ties keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [pos for pos, _ in sim_scores[:max(top_n, 0)]]

    return df["title"].iloc[movie_indices]


In [None]:
# Show the recommendations for Movie_10 using the default top_n of 5.
recommend_movies("Movie_10")


Unnamed: 0,title
4211,Movie_4211
2951,Movie_2951
3543,Movie_3543
5811,Movie_5811
3386,Movie_3386


FULL CODE

In [None]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the 20 Newsgroups dataset through scikit-learn's fetcher, without
# headers, footers or quoted text.
newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))

# Start from the post bodies, then put a synthetic "Movie_<n>" title in front.
df = pd.DataFrame({"overview": newsgroups.data})
df.insert(0, "title", [f"Movie_{i}" for i, _ in enumerate(newsgroups.data)])

# Guarantee every overview is a string.
df["overview"] = df["overview"].fillna("")

# Convert text to TF-IDF (English stop words removed, at most 5000 terms)
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df["overview"])

# Compute the pairwise cosine similarity of all posts
cosine_sim = cosine_similarity(tfidf_matrix)

# Recommendation function
def recommend_movies(title, top_n=5):
    """Return the titles of the entries most similar to ``title``.

    Similarities are read from the module-level ``cosine_sim`` matrix, whose
    rows and columns follow the row order of ``df``.

    Args:
        title: Value from ``df["title"]`` identifying the query entry.
        top_n: Maximum number of recommendations to return.

    Returns:
        A ``pandas.Series`` of titles ordered from most to least similar, or
        the string ``"Movie not found!"`` when ``title`` is not in ``df``.
    """
    # Title -> row position; cosine_sim and .iloc are both positional.
    indices = pd.Series(range(len(df)), index=df["title"])

    if title not in indices:
        return "Movie not found!"

    idx = indices[title]

    # Remove the query row by position rather than assuming it sorts first:
    # duplicate posts tie with it and empty posts score 0 against themselves,
    # so slicing off "element 0" could return the query itself.
    sim_scores = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]
    # Stable sort: ties keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [pos for pos, _ in sim_scores[:max(top_n, 0)]]
    return df["title"].iloc[movie_indices]

# Smoke test: print the default top-5 recommendations for Movie_10.
print(recommend_movies("Movie_10"))


4211    Movie_4211
2951    Movie_2951
3543    Movie_3543
5811    Movie_5811
3386    Movie_3386
Name: title, dtype: object


In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset; this time the newsgroup id of each post is kept as a label.
newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))

# Post body and label first, then a synthetic "Movie_<n>" title as column 0.
df = pd.DataFrame({"overview": newsgroups.data, "label": newsgroups.target})
df.insert(0, "title", [f"Movie_{i}" for i, _ in enumerate(newsgroups.data)])

# Clean text: make sure every overview is a string
df["overview"] = df["overview"].fillna("")

# TF-IDF (English stop words removed, at most 5000 terms)
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df["overview"])

# Similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix)

# Recommendation function
def recommend_movies(index, top_n=5):
    """Return the row positions of the entries most similar to row ``index``.

    Note: this definition takes a row position and replaces the title-based
    ``recommend_movies`` defined in earlier cells of the notebook.

    Args:
        index: Row position of the query entry in ``cosine_sim``.
        top_n: Maximum number of positions to return.

    Returns:
        A list of row positions ordered from most to least similar; the query
        row itself is never included.
    """
    # Filter the query row out by position instead of slicing off the first
    # sorted element: duplicate posts tie with it and empty posts have zero
    # similarity to themselves, so it is not guaranteed to sort first.
    sim_scores = [
        (pos, score) for pos, score in enumerate(cosine_sim[index]) if pos != index
    ]
    # Stable sort: ties keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    return [pos for pos, _ in sim_scores[:max(top_n, 0)]]

# -------------------------------
# EVALUATION
# -------------------------------

def evaluate_model(sample_size=500, top_n=5):
    """Measure how often recommendations share the query's label.

    For each of the first ``sample_size`` rows of ``df`` the function asks
    ``recommend_movies`` for ``top_n`` recommendations and counts how many of
    them carry the same ``label`` as the query row.

    Args:
        sample_size: Number of leading rows to use as queries. Values larger
            than the number of rows are clamped to the number of rows.
        top_n: Number of recommendations requested per query.

    Returns:
        The fraction (0.0 - 1.0) of recommendations whose label matches the
        query's label; ``0.0`` when no recommendation was produced at all.
    """
    # Pull the labels out once; per-row df.iloc[...] lookups inside the loop
    # build a Series for every access.
    labels = df["label"].to_numpy()

    # Never walk past the end of the frame.
    n_queries = min(sample_size, len(labels))

    correct = 0
    total = 0

    for i in range(n_queries):
        recommended = recommend_movies(i, top_n)
        original_label = labels[i]

        correct += sum(1 for rec in recommended if labels[rec] == original_label)
        total += len(recommended)

    # Nothing recommended (e.g. sample_size=0 or top_n=0): avoid dividing by 0.
    if total == 0:
        return 0.0

    return correct / total

# Evaluate with the defaults (first 500 rows as queries, 5 recommendations
# each) and print the share of same-label recommendations as a percentage.
accuracy = evaluate_model()
print("Recommendation Accuracy:", round(accuracy * 100, 2), "%")


Recommendation Accuracy: 47.56 %


WITH FINETUNING


In [None]:
!pip install transformers datasets evaluate accelerate scikit-learn




In [None]:
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import evaluate


In [None]:
# AG News replaces 20 Newsgroups for the fine-tuning part: cleaner & faster.
dataset = load_dataset("ag_news")

# Display the available splits and their sizes.
dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [None]:
# Checkpoint name shared by the tokenizer here and the classifier loaded in a
# later cell, so both come from the same pretrained model.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [None]:
def tokenize_function(example):
    """Tokenise the ``"text"`` field of ``example`` with the global tokenizer.

    Sequences are truncated and padded so that each one is exactly 128 tokens
    long.

    Args:
        example: Mapping with a ``"text"`` entry (a single text or a batch).

    Returns:
        Whatever the tokenizer returns for that text.
    """
    # Fixed-length settings: cut long inputs, pad short ones up to max_length.
    fixed_length_options = dict(
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return tokenizer(example["text"], **fixed_length_options)

# Tokenise every split in batches, then drop the raw "text" column, which is
# no longer needed once the token ids exist.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
)

# Have the datasets hand back PyTorch tensors.
tokenized_datasets.set_format("torch")


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

In [None]:
# Load the checkpoint with a sequence-classification head sized for 4 labels
# (presumably one per AG News class — TODO confirm against the dataset).
# The load report printed below lists the pre_classifier/classifier weights as
# MISSING, i.e. newly initialised, so the head must be trained before use.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=4
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [None]:
# Accuracy metric object from the `evaluate` library; compute_metrics uses it.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` holds one row of
            class scores per example and ``labels`` the reference class ids.

    Returns:
        The dictionary produced by ``metric.compute``, e.g.
        ``{"accuracy": 0.9}``.
    """
    logits, labels = eval_pred
    # The predicted class is the one with the highest score.
    predictions = np.argmax(logits, axis=-1)
    # metric.compute already returns {"accuracy": value}. Wrapping it in a
    # second {"accuracy": ...} produced a nested dict, which is why the
    # training log showed "{'accuracy': 0.896}" instead of a plain number.
    return metric.compute(predictions=predictions, references=labels)


Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# NOTE(review): `training_args` was passed to Trainer but never defined in any
# cell, so this cell raised NameError on a fresh kernel. The values below are
# reconstructed from the recorded training log: 3 epochs and 939 steps over
# 5000 samples imply a batch size of 16, with one evaluation per epoch.
# Everything not visible in the log is left at the library default — confirm
# before relying on exact numbers.
training_args = TrainingArguments(
    output_dir="distilbert-ag-news",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    eval_strategy="epoch",
)

# Train on a fixed, seeded 5000-example subset and evaluate on a seeded
# 2000-example subset of the test split to keep the run time manageable.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(5000)),
    eval_dataset=tokenized_datasets["test"].shuffle(seed=42).select(range(2000)),
    compute_metrics=compute_metrics,
)


In [None]:
# Fine-tune the classifier. The recorded run below took 11685 s (about 3.2 h)
# for 939 steps, so expect this cell to be slow in a comparable environment.
trainer.train()



Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.32651,{'accuracy': 0.896}
2,0.362261,0.312254,{'accuracy': 0.9055}
3,0.362261,0.332209,{'accuracy': 0.907}




Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



TrainOutput(global_step=939, training_loss=0.2811406372200313, metrics={'train_runtime': 11685.7905, 'train_samples_per_second': 1.284, 'train_steps_per_second': 0.08, 'total_flos': 501009570668544.0, 'train_loss': 0.2811406372200313, 'epoch': 3.0})

In [None]:
from torch.nn.functional import normalize

def get_embeddings(text_list, batch_size=64):
    """Embed texts with the fine-tuned DistilBERT encoder.

    Each text is represented by the hidden state of its first token, scaled
    to unit L2 norm so that a dot product between two embeddings is their
    cosine similarity.

    Args:
        text_list: A single string or a sequence of strings to embed.
        batch_size: Number of texts sent through the model per forward pass;
            bounds peak memory when embedding a large corpus.

    Returns:
        A CPU ``torch.Tensor`` with one unit-norm row per input text.

    Raises:
        ValueError: If ``text_list`` is empty.
    """
    # A bare string is a single text, not a sequence of characters.
    texts = [text_list] if isinstance(text_list, str) else list(text_list)
    if not texts:
        raise ValueError("text_list must contain at least one text")

    # Inference mode: disables dropout so embeddings are deterministic.
    model.eval()
    # The Trainer may have moved the model to an accelerator; inputs must
    # live on the same device as the weights.
    device = model.device

    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt"
            )
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

            outputs = model.distilbert(**inputs)
            # First-token (CLS) vector; moved to CPU so results can be
            # concatenated and stored regardless of the model's device.
            chunks.append(outputs.last_hidden_state[:, 0, :].cpu())

    embeddings = torch.cat(chunks, dim=0)

    return normalize(embeddings, p=2, dim=1)


In [None]:
# Candidate pool for recommendations: the first 1000 training texts.
train_texts = dataset["train"]["text"][:1000]

# Embed the pool once up front, so answering a query only needs the query's
# own embedding plus a matrix product against these rows.
embeddings = get_embeddings(train_texts)

def recommend(text, top_n=5):
    """Print the stored texts most similar to ``text``.

    The query is embedded with ``get_embeddings`` and compared against the
    precomputed ``embeddings`` of ``train_texts`` via a dot product.

    Args:
        text: Query text.
        top_n: Maximum number of texts to print; capped at the number of
            stored texts.

    Returns:
        None. Each result is printed after a ``----`` separator line.
    """
    input_embedding = get_embeddings([text])
    similarity = torch.matmul(input_embedding, embeddings.T)

    # torch.topk raises if asked for more entries than exist, so cap k at the
    # number of stored texts.
    k = min(top_n, similarity.shape[1])
    # Plain Python ints are the safest way to index the list of texts.
    top_indices = torch.topk(similarity, k).indices[0].tolist()

    for idx in top_indices:
        print("----")
        print(train_texts[idx])


In [None]:
recommend("Apple releases new iPhone with advanced AI features")


----
Apple Ships Motion Apple has begun shipping Motion, which delivers high-performance, real-time motion graphics design and integration with Final Cut Pro HD and DVD Studio Pro 3, at a breakthrough price of \$299. Aug 10
----
Microsoft wants to improve your image New imaging software is making eyes at those squinty camera-phone pictures.
----
Apple puts edit tools in one basket Bundle of professional video editing applications includes new Motion special-effects software.
----
Microsoft ships updated Works Version 8 of the low-priced productivity package includes a stand-alone dictionary and a PowerPoint add-on.
----
Sprint Puts Streaming Media on Phones PCS Vision Multimedia streams faster video plus audio channels to Samsung phone.


In [1]:
import gradio as gr
import torch

# Assuming get_embeddings, embeddings (from train_texts), and train_texts are defined from previous cells.
# get_embeddings uses the fine-tuned model and tokenizer from previous cells.
# `embeddings` refers to the embeddings of `train_texts` (1000 articles from ag_news).

def recommend_movie(query, top_n=5):
    """Return the stored texts most similar to ``query`` as one string.

    Relies on ``get_embeddings``, ``embeddings`` and ``train_texts`` defined
    in earlier cells.

    Args:
        query: Free-text query typed by the user.
        top_n: Maximum number of texts to return; capped at the number of
            stored texts.

    Returns:
        The matching texts, each preceded by a ``----`` line and joined with
        newlines, or ``"Please enter a text query."`` when ``query`` is not a
        string or contains only whitespace.
    """
    # Reject non-text and blank input rather than embedding an empty string.
    if not isinstance(query, str) or not query.strip():
        return "Please enter a text query."

    # Generate embedding for the input query
    input_embedding = get_embeddings([query])

    # Embeddings are unit-norm, so the dot product is the cosine similarity
    similarity = torch.matmul(input_embedding, embeddings.T)

    # torch.topk raises if k exceeds the number of candidates, so cap it
    k = min(top_n, similarity.shape[1])
    top_indices = torch.topk(similarity, k).indices[0].tolist()

    # Retrieve the recommended texts, most similar first
    return "\n".join(f"----\n{train_texts[idx]}" for idx in top_indices)


# Text-in / text-out Gradio UI around recommend_movie.
interface = gr.Interface(
    fn=recommend_movie,
    # The placeholder used to say "Describe a movie or theme...", which
    # contradicted the news-article wording of the title and description.
    inputs=gr.Textbox(lines=3, placeholder="Describe a news topic or theme..."),
    outputs=gr.Textbox(),
    title="News Article Recommender App",
    description="Enter a description or theme and get top 5 recommended news articles!"
)

# Per the recorded output below, on a hosted notebook launch() automatically
# sets share=True and exposes the app on a public URL.
interface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://aae75b271df57f7767.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


