In [None]:
#!pip install -q langchain GitPython sentence-transformers chromadb huggingface_hub accelerate bitsandbytes

## Importing Necessary Libraries

In [3]:
from git import Repo
import os
import torch
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from huggingface_hub import notebook_login

In [4]:
# Show the Hugging Face login widget; the Llama-2 checkpoint further down is
# requested with authentication enabled, so a token must be stored first.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Cloning the GitHub Repository

In [5]:
# Derive the local checkout directory from the repository URL and clone the
# repository into the current working directory unless it is already there.
url = 'https://github.com/ShowRounak/YT-Comments-Sentiment-Analysis-Using-BERT.git'
current_path = os.getcwd()
print('current path',current_path)
# Drop a trailing slash first so the last path segment is never empty.
last_name = url.rstrip('/').split('/')[-1]
print('last name',last_name)
# Strip only a trailing ".git". Splitting on the first "." would truncate
# repository names that themselves contain dots (e.g. "my.repo.git" -> "my").
clone_path = last_name[:-len('.git')] if last_name.endswith('.git') else last_name
print('clone path',clone_path)
repo_path = os.path.join(current_path,clone_path)
print('path',repo_path)

# Skip the clone when the target directory already exists so the cell can be
# re-run; note that `repo` is only bound when a clone actually happens.
if not os.path.exists(repo_path):
    repo = Repo.clone_from(url, to_path=repo_path)

current path /content
last name YT-Comments-Sentiment-Analysis-Using-BERT.git
clone path YT-Comments-Sentiment-Analysis-Using-BERT
path /content/YT-Comments-Sentiment-Analysis-Using-BERT


## Extracting all the files

In [6]:
# File extensions that are loaded from the cloned repository by default.
allowed_extensions = ['.py', '.ipynb', '.md']

def extract_all_files(repo_path, extensions=None):
    """Load every matching file below ``repo_path`` with ``TextLoader``.

    Args:
        repo_path: Root directory that is walked recursively.
        extensions: Iterable of file extensions (including the leading dot)
            to load. Defaults to the module-level ``allowed_extensions``.
            Matching is case-insensitive, so ``README.MD`` is picked up too.

    Returns:
        A flat list with the results of ``loader.load_and_split()`` for each
        file that could be read. Files that fail to load are skipped, and a
        warning naming the file and the error is logged instead of the
        failure being swallowed silently.
    """
    import logging

    if extensions is None:
        extensions = allowed_extensions
    wanted = {ext.lower() for ext in extensions}
    log = logging.getLogger(__name__)

    docs = []
    for dirpath, dirnames, filenames in os.walk(repo_path):
        # Prune Git's metadata directory in place so os.walk never descends
        # into it; ref files there can carry arbitrary names such as "docs.md".
        dirnames[:] = [d for d in dirnames if d != '.git']
        for file in filenames:
            if os.path.splitext(file)[1].lower() not in wanted:
                continue
            file_path = os.path.join(dirpath, file)
            try:
                loader = TextLoader(file_path, encoding='utf-8')
                docs.extend(loader.load_and_split())
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole
                # walk, but the reader should still learn that it was skipped.
                log.warning("Skipping %s: %s", file_path, e)
    return docs

# Load every matching file from the cloned repository into `docs`.
docs = extract_all_files(repo_path)

In [7]:
# Number of documents returned by the loader.
len(docs)

4

## Creating Embeddings

In [8]:
# Embedding model name and the keyword arguments (device) that are handed to
# HuggingFaceEmbeddings when the embedding model is created.
model_name = "all-MiniLM-L6-v2"
model_kwargs={'device': 'cpu'}


def chunk_files(docs, chunk_size=1000, chunk_overlap=20):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split, as returned by the loader.
        chunk_size: ``chunk_size`` passed to the splitter (default 1000).
        chunk_overlap: ``chunk_overlap`` passed to the splitter (default 20).

    Returns:
        The result of ``split_documents(docs)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)

def create_embeddings(texts=None):
    """Build the ``HuggingFaceEmbeddings`` object used to embed the chunks.

    Args:
        texts: Unused. Kept (now optional) so existing calls that pass the
            chunk list keep working; nothing is embedded here.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured from the module-level
        ``model_name`` and ``model_kwargs``.
    """
    return HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Split the loaded documents into chunks and create the embedding model that
# is used to index them.
texts = chunk_files(docs)
embeddings = create_embeddings(texts)


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [9]:
# Directory name for the persisted Chroma index, derived from the repository name.
chroma_path = clone_path + '-chroma'
print(chroma_path)

YT-Comments-Sentiment-Analysis-Using-BERT-chroma


In [10]:

def load_db(texts, embeddings, persist_directory=None):
    """Index ``texts`` in a Chroma vector store and persist it.

    Args:
        texts: Document chunks to index.
        embeddings: Embedding object passed to Chroma as ``embedding``.
        persist_directory: Directory for the persisted index. Defaults to the
            module-level ``chroma_path``.

    Returns:
        The Chroma vector store created by ``Chroma.from_documents``.
    """
    if persist_directory is None:
        persist_directory = chroma_path
    # NOTE(review): calling this again with the same directory presumably adds
    # the chunks a second time rather than replacing them -- confirm.
    vectordb = Chroma.from_documents(
        texts,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    vectordb.persist()
    return vectordb

# Build and persist the vector store from the chunks and the embedding model.
vectordb = load_db(texts, embeddings)

## Question-answering using Llama2

In [11]:
# Check whether PyTorch can see a CUDA device before loading the LLM.
torch.cuda.is_available()

True

In [12]:
# Single definition of the checkpoint id so the tokenizer and the model cannot
# drift apart when the model is swapped.
llama_model_id = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(llama_model_id)

# Load the weights quantised to 8 bit and let accelerate place them on the
# available devices. `token=True` reuses the token stored by the Hugging Face
# login; it replaces the deprecated `use_auth_token=True`.
model = AutoModelForCausalLM.from_pretrained(
    llama_model_id,
    device_map='auto',
    torch_dtype=torch.float16,
    token=True,
    load_in_8bit=True,
    # alternative: load_in_4bit=True
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [13]:
# Text-generation pipeline around the already-loaded model.
# - No `torch_dtype` / `device_map` here: `model` is an instantiated object
#   that was loaded in float16 with its own device map, so passing a different
#   dtype (bfloat16) at this point was redundant and misleading.
# - `temperature` is set on the pipeline so that it is forwarded to generation
#   together with the other sampling options.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.1,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

In [14]:
# Wrap the transformers pipeline so that LangChain chains can use it as an LLM.
# NOTE(review): `model_kwargs` may only be honoured when the pipeline is built
# through `HuggingFacePipeline.from_model_id`; confirm that temperature=0.1
# actually reaches generation when a ready-made pipeline is passed in.
llm=HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0.1})

In [15]:
# Retrieval QA chain over the Chroma index; the retrieved source documents are
# returned alongside the answer.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
)

In [16]:
# Ask a question whose answer is stated in the repository's README.
query = "what are the requirements of this repository?"
result = chain(
    {"query": query},
    return_only_outputs=True,
)

In [18]:
# Generated answer.
print(result['result'])


The requirements of this repository are:

* Python 3.x
* Google Developer API Key
* TensorFlow or PyTorch (choose based on your preference)

Note: The BERT model implementation is based on the Hugging Face Transformers library.


In [19]:
# Retrieved chunks the answer was generated from.
print(result['source_documents'])

[Document(page_content='# YouTube Comments Sentiment Analysis Using BERT\n\n## Overview\n\nThis project focuses on sentiment analysis of YouTube comments using BERT (Bidirectional Encoder Representations from Transformers). The goal is to understand the sentiment expressed in user comments on YouTube videos, categorizing them as positive, negative, or neutral.\n\n## Getting Started\n\nFollow these steps to get the project up and running on your local machine.\n\n### Prerequisites\n\n- Python 3.x\n- Google Developer API Key\n- TensorFlow or PyTorch (choose based on your preference)\n\n### Installation\n\n1. Clone the repository:\n\n   ```bash\n   git clone https://github.com/ShowRounak/YT-Comments-Sentiment-Analysis-Using-BERT.git\n   ```\n\n2. Navigate to the project directory:\n\n   ```bash\n   cd YT-Comments-Sentiment-Analysis-Using-BERT\n   ```\n\n3. Install the required packages:\n\n   ```bash\n   pip install -r requirements.txt\n   ```\n\n### Usage\n\n1. Explore the scraper.py fil

In [30]:
# Second question, this time about one of the repository's source files.
unknown_query = 'What is written in the scrapper file?'
result = chain(
    {"query": unknown_query},
    return_only_outputs=True,
)

In [31]:
# Show the full output dict: the answer plus the source documents.
result

{'result': ' The scrapper file contains the code for fetching the YouTube comments using the Google API. Specifically, it defines a function called `main` that takes in the YouTube video ID as an input and retrieves the comments for that video using the `google-api-python-client` library. The comments are then processed using the BERT sentiment analysis model.',
 'source_documents': [Document(page_content='import numpy as np\nimport pandas as pd\nimport streamlit as st\nfrom scraper import main\nfrom app import sentiment_score\nimport re\n\nst.title("Youtube Video Comments Sentiment Analysis")\n\nst.subheader(\'Insert Youtube video URL\')\ninput_url = st.text_area("enter url")\n\ndef extract_video_id(input_url):\n    pattern = r\'(?:v=|\\/)([0-9A-Za-z_-]{11}).*\'\n    match = re.search(pattern, input_url)\n    if match:\n        return match.group(1)\n    return None\n\ninput_id = extract_video_id(input_url)\n\nif st.button(\'Analysis\'):\n    comments = main(input_id)\n    df = pd.Dat

In [29]:
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Prompt for the custom RAG chain: {context} is filled with the retrieved
# chunks and {question} with the user's question.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
rag_prompt_custom = PromptTemplate.from_template(template)

# Retriever over the Chroma vector store.
retriever=vectordb.as_retriever()
def format_docs(docs):
    """Concatenate the ``page_content`` of ``docs``, separated by blank lines."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# LCEL pipeline: the input question is sent to the retriever (whose documents
# are joined by format_docs) to fill "context" and is passed through unchanged
# as "question"; the filled prompt goes to the LLM and the output is parsed to
# a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt_custom
    | llm
    | StrOutputParser()
)

# Same question as in the RetrievalQA example, so the two answers can be compared.
rag_chain.invoke("What is written in the scrapper file?")

' The scrapper file extracts the YouTube video ID from the input URL and uses it to fetch the comments using the Google API. The comments are then processed using BERT to extract the sentiment.'