In [1]:
import os
os.environ['WANDB_MODE'] = 'disabled'
import warnings
warnings.filterwarnings('ignore')

!pip install --upgrade transformers sentence-transformers huggingface-hub
!pip install langchain langchain-huggingface
!pip install chromadb tiktoken
!pip install langchain-community
!pip install datasets

Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting huggingface-hub
  Downloading huggingface_hub-0.26.3-py3-none-any.whl.metadata (13 kB)
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.26.3-py3-none-any.whl (447 kB)
[2K   [90m━━━━━━━━━━━━━━━━━

**Loading Libraries**

In [2]:
import pandas as pd
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import CSVLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate

# Print library versions to confirm
import transformers
print(f"Transformers version: {transformers.__version__}")
import sentence_transformers
print(f"SentenceTransformers version: {sentence_transformers.__version__}")
import huggingface_hub
print(f"HuggingFace Hub version: {huggingface_hub.__version__}")
import langchain
print(f"LangChain version: {langchain.__version__}")

Transformers version: 4.46.3
SentenceTransformers version: 3.3.1
HuggingFace Hub version: 0.26.3
LangChain version: 0.3.9


**Data Preprocessing**

In [3]:
anime = pd. read_csv('anime_with_synopsis.csv')
anime.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [4]:
if 'sypnopsis' in anime.columns:
    anime.rename(columns={'sypnopsis': 'synopsis'}, inplace=True)


anime = anime.dropna(subset=['Name', 'synopsis', 'Genres'])


anime = anime[~anime['synopsis'].str.contains("No synopsis information", na=False)]

In [5]:
# Combine the information
anime['combined_info'] = anime.apply(
    lambda row: f"Title: {row['Name']}. Overview: {row['synopsis']} Genres: {row['Genres']}",
    axis=1
)
anime['combined_info'][0]

'Title: Cowboy Bebop. Overview: In the year 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet Earth behind. The Inter Solar System Police attempts to keep peace in the galaxy, aided in part by outlaw bounty hunters, referred to as "Cowboys." The ragtag team aboard the spaceship Bebop are two such individuals. Mellow and carefree Spike Spiegel is balanced by his boisterous, pragmatic partner Jet Black as the pair makes a living chasing bounties and collecting rewards. Thrown off course by the addition of new members that they meet in their travels—Ein, a genetically engineered, highly intelligent Welsh Corgi; femme fatale Faye Valentine, an enigmatic trickster with memory loss; and the strange computer whiz kid Edward Wong—the crew embarks on thrilling adventures that unravel each member\'s dark and mysterious past little by little. Well-balanced with high density action and light-hearted comedy, Cowboy Bebo

In [6]:
#Save processed dataset - combined_info for Langchain
anime[['combined_info']].to_csv('anime_updated.csv', index=False)

In [7]:
processed_anime = pd.read_csv('anime_updated.csv')
processed_anime.head()

Unnamed: 0,combined_info
0,Title: Cowboy Bebop. Overview: In the year 207...
1,Title: Cowboy Bebop: Tengoku no Tobira. Overvi...
2,Title: Trigun. Overview: Vash the Stampede is ...
3,Title: Witch Hunter Robin. Overview: ches are ...
4,Title: Bouken Ou Beet. Overview: It is the dar...


**Fine-Tune the Embeddings Model**

In [8]:
from sentence_transformers import InputExample

# Load a pre-trained SentenceTransformer model
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(model_name)

# Prepare the data for unsupervised SimCSE fine-tuning
train_sentences = anime['combined_info'].tolist()

# Create InputExample instances with two identical sentences
train_examples = [InputExample(texts=[sent, sent]) for sent in train_sentences]

# Create a DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Use the MultipleNegativesRankingLoss
train_loss = losses.MultipleNegativesRankingLoss(embedding_model)

# Fine-tune the model using SimCSE approach
embedding_model.train()  # Activate training mode to enable dropout
embedding_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    show_progress_bar=True
)

# Save the fine-tuned model to a directory
embedding_model.save('fine_tuned_model')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



Step,Training Loss
500,0.0001
1000,0.0
1500,0.0
2000,0.0
2500,0.0


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

**Create Embeddings and Vector Store**

In [9]:
# Use the fine-tuned model by specifying the path
embeddings = HuggingFaceEmbeddings(model_name='fine_tuned_model')

# Load the data using LangChain's CSVLoader
loader = CSVLoader(file_path="anime_updated.csv")
data = loader.load()

# Split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

# Create a vector store using Chroma
docsearch = Chroma.from_documents(texts, embeddings)


In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.llms import HuggingFacePipeline

# Load the Flan-T5 model and tokenizer
model_name = 'google/flan-t5-large'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Create a pipeline for text generation
hf_pipeline = pipeline(
    'text2text-generation',
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    temperature=0.9,  # Increased temperature for more creativity
    top_p=0.95,       # Increased top_p to allow more diverse tokens
    repetition_penalty=1.1,
    do_sample=True
)

# Wrap the pipeline in a LangChain LLM
local_llm = HuggingFacePipeline(pipeline=hf_pipeline)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
  local_llm = HuggingFacePipeline(pipeline=hf_pipeline)


**Set Up the RetrievalQA Chain**

In [11]:
# Define the prompt template
template = """You are an anime recommender system that helps users find anime that match their preferences.
Use the following context to answer the question at the end.
For each recommendation, suggest three anime films with a short description of the plot and why the user might like them.
If you don't know the answer, say that you don't know; don't try to make up an answer.

{context}

Question: {question}
Your response:"""

PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])

# Define the retriever with increased k
retriever = docsearch.as_retriever(search_kwargs={"k": 10})

# Set up the RetrievalQA chain with the local LLM
qa = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
    verbose=True  # Enable verbose logging
)

In [24]:
query = "I'm looking for a dark fantasy anime where man eating titans are involved . What could you suggest to me?"
result = qa.invoke({"query": query})  # Updated method call
print(result['result'])  # Print the recommendations



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Shingeki no Kyojin


In [25]:
# Print the retrieved documents
print("Retrieved Documents:")
for doc in result['source_documents']:
    print(doc.page_content)
    print("\n---\n")

Retrieved Documents:
combined_info: Title: Ankoku Shindenshou Takegami. Overview: o of diabolical dragons stirs from eons of slumber deep within the earth. Charging to the surface, they launch a bloody rampage against humanity, feasting on the weak and gaining strength from their hapless victims' souls. It will take the power of ancient god Takegami, the Guardian of Darkness, to vanquish the evil serpent forces. But alas, the mortal body Takegami inhabits belongs to a host reluctant to fight. Genres: Action, Mecha, Supernatural, Drama, Sci-Fi

---

combined_info: Title: Mutant Turtles: Choujin Densetsu-hen. Overview: hen the Teenage Mutant Ninja Turtles acquire Mutastones from Crys-Mu, the spirit of light, they acquire the ability to enhance themselves into Super Turtles for a duration of three minutes. Meanwhile, the evil Shredder and his minions Bebop and Rocksteady stumble upon the Dark Mutastone, which transforms them into Devil Shredder, Supermutant Bebop and Supermutant Rockstead

In [26]:
result['source_documents'][0]

Document(metadata={'row': 1108, 'source': 'anime_updated.csv'}, page_content="combined_info: Title: Ankoku Shindenshou Takegami. Overview: o of diabolical dragons stirs from eons of slumber deep within the earth. Charging to the surface, they launch a bloody rampage against humanity, feasting on the weak and gaining strength from their hapless victims' souls. It will take the power of ancient god Takegami, the Guardian of Darkness, to vanquish the evil serpent forces. But alas, the mortal body Takegami inhabits belongs to a host reluctant to fight. Genres: Action, Mecha, Supernatural, Drama, Sci-Fi")

In [27]:
# Manually construct the prompt
sample_context = " ".join([doc.page_content for doc in result['source_documents']])
test_prompt = f"""You are an anime recommender system that helps users find anime that match their preferences.
Use the following context to answer the question at the end.
For each recommendation, suggest three anime films with a short description of the plot and why the user might like them.
If you don't know the answer, say that you don't know; don't try to make up an answer.

{sample_context}

Question: {query}
Your response:"""

# Generate a response using the local LLM
response = local_llm(test_prompt)
print(response)

Shingeki no Kyojin


**Second Template** - Provinding additional user info in the context

In [28]:
# Define user information
age = 23
gender = 'male'

# Update the prompt to include user info
template_prefix = f"""You are an anime recommender system that helps users find anime that match their preferences.
Use the following context and the user's information to answer the question at the end.
If you don't know the answer, say that you don't know; don't try to make up an answer.

User Information:
- Age: {age}
- Gender: {gender}

{{context}}"""

template_suffix = """
Question: {question}
Your response:"""

COMBINED_PROMPT = template_prefix + template_suffix

PROMPT = PromptTemplate(template=COMBINED_PROMPT, input_variables=["context", "question"])

# Define the retriever with increased k
retriever = docsearch.as_retriever(search_kwargs={"k": 10})

# Update the RetrievalQA chain with the local LLM
qa = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
    verbose=True  # Optionally enable verbose logging
)

# Get the personalized recommendations
result = qa.invoke({'query': query})

# Print the recommendations
print(result['result'])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Shingeki no Kyojin


In [29]:
result['source_documents']

[Document(metadata={'row': 1108, 'source': 'anime_updated.csv'}, page_content="combined_info: Title: Ankoku Shindenshou Takegami. Overview: o of diabolical dragons stirs from eons of slumber deep within the earth. Charging to the surface, they launch a bloody rampage against humanity, feasting on the weak and gaining strength from their hapless victims' souls. It will take the power of ancient god Takegami, the Guardian of Darkness, to vanquish the evil serpent forces. But alas, the mortal body Takegami inhabits belongs to a host reluctant to fight. Genres: Action, Mecha, Supernatural, Drama, Sci-Fi"),
 Document(metadata={'row': 938, 'source': 'anime_updated.csv'}, page_content='combined_info: Title: Mutant Turtles: Choujin Densetsu-hen. Overview: hen the Teenage Mutant Ninja Turtles acquire Mutastones from Crys-Mu, the spirit of light, they acquire the ability to enhance themselves into Super Turtles for a duration of three minutes. Meanwhile, the evil Shredder and his minions Bebop a

In [30]:
queries = [
    "I'm looking for a romantic comedy anime. Any suggestions?",
    "Can you recommend an anime with strong female leads?",
    "What are some good sci-fi anime with space battles?",
    "I'm interested in anime that explore psychological themes.",
    "Suggest some anime movies with pirates."
]


In [32]:
for query in queries:
    print(f"Query: {query}")
    result = qa.invoke({"query": query})
    print("Recommendations:")
    print(result['result'])
    print("\n" + "="*50 + "\n")


Query: I'm looking for a romantic comedy anime. Any suggestions?


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Recommendations:
Ai.


Query: Can you recommend an anime with strong female leads?


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Recommendations:
No


Query: What are some good sci-fi anime with space battles?


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Recommendations:
Tetsuwan Atom: Uchuu no Yuusha.


Query: I'm interested in anime that explore psychological themes.


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Recommendations:
I don't know


Query: Suggest some anime movies with pirates.


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Recommendations:
One Piece Movie 14: Stampede




In [7]:
import nbformat

def extract_code_from_ipynb(ipynb_file, output_file):
    with open(ipynb_file, 'r', encoding='utf-8') as file:
        notebook = nbformat.read(file, as_version=4)
        
    code_cells = [cell['source'] for cell in notebook['cells'] if cell['cell_type'] == 'code']
    
    with open(output_file, 'w', encoding='utf-8') as file:
            for i, code in enumerate(code_cells, 1):               
                file.write(f"# Cell {i}\n")
                file.write(code)
                file.write('\n\n')


# Replace 'notebook.ipynb' and 'output.py' with your file names
extract_code_from_ipynb('lc.ipynb', 'lc.py')