<a href="https://colab.research.google.com/github/Rewanthnayak/Jupyer_notebooks/blob/main/lang_chain_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain chromadb sentence_transformers pinecone-client -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.1/179.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m283.7/283.7 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import io
import os
import urllib
import urllib.request
import zipfile

import pandas as pd

In [None]:
def _read_corpus_table(zip_file, member, columns, separator=" +++$+++ "):
    """Read one delimited corpus file from the archive into a DataFrame.

    Args:
        zip_file: Open ``zipfile.ZipFile`` that contains the corpus.
        member: Path of the file inside the archive.
        columns: Column names, in the order the fields appear in each row.
        separator: Field delimiter used inside each row.

    Returns:
        A DataFrame of strings with one row per line of ``member`` and one
        column per entry of ``columns``.

    Raises:
        IndexError: If a row has fewer fields than ``columns``.
    """
    with zip_file.open(member) as handle:
        raw_rows = handle.readlines()

    # NOTE(review): "unicode_escape" is kept from the original code; it also
    # interprets backslash sequences in the text - confirm this is intended.
    # Each row is split exactly once, and the line terminator is stripped so
    # the last column does not carry a trailing "\n".
    rows = [
        raw.decode("unicode_escape").rstrip("\r\n").split(separator)
        for raw in raw_rows
    ]

    return pd.DataFrame({
        name: [fields[position] for fields in rows]
        for position, name in enumerate(columns)
    })


def load_cornell_movie_dialogs():
    """Load the Cornell Movie Dialogue Corpus into pandas DataFrames.

    The archive is downloaded to ``cornell_movie_dialogs.zip`` in the current
    working directory unless a file with that name already exists.

    Returns:
        A tuple ``(lines_df, conv_df, characters_df, titles_df)`` of DataFrames
        whose columns are all strings:

        * ``lines_df``: LineID, Character, Movie, Name, Line
        * ``conv_df``: characterID_1, characterID_2, movieID, utterances
        * ``characters_df``: characterID, character_name, movieID,
          movie_title, gender, position
        * ``titles_df``: movieID, movie_title, movie_year, IMDB_rating,
          no_IMDB_votes, genres
    """
    url = "http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
    file_path = "cornell_movie_dialogs.zip"
    corpus_dir = "cornell movie-dialogs corpus/"

    if not os.path.exists(file_path):
        print("Downloading Cornell Movie Dialogue Corpus...")
        # Download to a temporary name first so an interrupted transfer never
        # leaves a truncated archive that would pass the exists() check.
        partial_path = file_path + ".part"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, file_path)

    with zipfile.ZipFile(file_path) as zip_file:
        lines_df = _read_corpus_table(
            zip_file,
            corpus_dir + "movie_lines.txt",
            ["LineID", "Character", "Movie", "Name", "Line"],
        )
        conv_df = _read_corpus_table(
            zip_file,
            corpus_dir + "movie_conversations.txt",
            ["characterID_1", "characterID_2", "movieID", "utterances"],
        )
        characters_df = _read_corpus_table(
            zip_file,
            corpus_dir + "movie_characters_metadata.txt",
            ["characterID", "character_name", "movieID", "movie_title",
             "gender", "position"],
        )
        # Each column takes its own field (0..5); previously every column of
        # this table was read from field 4.
        titles_df = _read_corpus_table(
            zip_file,
            corpus_dir + "movie_titles_metadata.txt",
            ["movieID", "movie_title", "movie_year", "IMDB_rating",
             "no_IMDB_votes", "genres"],
        )

    return lines_df, conv_df, characters_df, titles_df

# Build the four corpus tables (the archive is downloaded first if it is not already on disk).
lines_df, conv_df, characters_df, titles_df = load_cornell_movie_dialogs()


In [None]:
# Sanity check: preview the first rows of every table and count missing values per column.
for df in (lines_df, conv_df, characters_df, titles_df):
    print(df.head(n=5))
    print(df.isna().sum())

  LineID Character Movie     Name            Line
0  L1045        u0    m0   BIANCA  They do not!\n
1  L1044        u2    m0  CAMERON   They do to!\n
2   L985        u0    m0   BIANCA    I hope so.\n
3   L984        u2    m0  CAMERON     She okay?\n
4   L925        u0    m0   BIANCA     Let's go.\n
LineID       0
Character    0
Movie        0
Name         0
Line         0
dtype: int64
  characterID_1 characterID_2 movieID                          utterances
0            u0            u2      m0  ['L194', 'L195', 'L196', 'L197']\n
1            u0            u2      m0                  ['L198', 'L199']\n
2            u0            u2      m0  ['L200', 'L201', 'L202', 'L203']\n
3            u0            u2      m0          ['L204', 'L205', 'L206']\n
4            u0            u2      m0                  ['L207', 'L208']\n
characterID_1    0
characterID_2    0
movieID          0
utterances       0
dtype: int64
  characterID character_name movieID                 movie_title gender  \
0   

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model; no model name is passed, so the library default is used.
embeddings = HuggingFaceEmbeddings()

# Newline-based splitter. Sizes are measured with `len`, i.e. in characters:
# chunks of up to 512 characters with 25 characters shared between neighbours.
text_splitter = CharacterTextSplitter(
    separator="\n",
    length_function=len,
    chunk_size=512,
    chunk_overlap=25,
)

In [None]:
def line_splitter(row):
    """Split one row's ``Line`` text into documents.

    The text is chunked by the module-level ``text_splitter``; the whole row,
    converted to a dict, is attached as the metadata for that text.

    Args:
        row: A row (pandas Series) that has a ``Line`` entry.

    Returns:
        Whatever ``text_splitter.create_documents`` returns for this row.
    """
    return text_splitter.create_documents(
        [row['Line']],
        metadatas=[row.to_dict()],
    )

In [None]:
# Run the splitter over every dialogue line; yields one list of documents per row.
result = lines_df.apply(line_splitter, axis="columns").tolist()

In [None]:
# Flatten the per-row document lists into a single flat list.
docs = [document for row_docs in result for document in row_docs]

In [None]:
# Smoke-test the embedding model on a short string.
embed = embeddings.embed_query('test input')
# Show only the first few components: printing the full vector floods the
# notebook output without adding information.
print(embed[:5])
# The vector length is the dimension a vector index must be created with.
print(len(embed))

[-0.0012785366270691156, -0.057259321212768555, -0.04266733303666115, 0.021370600908994675, -0.00612323684617877, -0.034934476017951965, 0.04569074511528015, 0.0461890771985054, 0.04373163357377052, -0.006611580029129982, 0.06860867142677307, -0.020333420485258102, 0.003211568808183074, 0.07795347273349762, 0.0016347031341865659, -0.0006740169483236969, 0.04426872357726097, -0.004118894226849079, -0.048180729150772095, 0.014016568660736084, 0.0019272296922281384, -0.033957481384277344, -0.02372012287378311, -0.024719208478927612, -0.052053097635507584, 0.023985954001545906, -0.03307997062802315, 0.0037761032581329346, 0.00237098871730268, -0.015099096111953259, -0.03508111834526062, 0.010579562745988369, -0.015198392793536186, 0.0010682055726647377, 1.5197737184280413e-06, -0.034996647387742996, 0.021682746708393097, -0.020703349262475967, -0.05291276425123215, -0.017239369451999664, -0.009424169547855854, 0.08681726455688477, -0.02715783379971981, 0.03159133344888687, 0.00649129785597

In [None]:
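# One-time setup: uncomment and run once to create the Pinecone index used below.
# `pinecone.init(...)` must have been called first, and `dimension` must match the
# embedding length printed by the previous cell.
# pinecone.create_index(name='movie-dialogues', dimension=768,metric='cosine')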


In [None]:
import pinecone

# Initialize Pinecone before anything touches the `pinecone` module.
# Credentials are read from the environment instead of being committed to the
# notebook; set PINECONE_API_KEY (and optionally PINECONE_ENVIRONMENT) first.
# NOTE(review): the key that used to be hardcoded here was published with the
# notebook and should be revoked and rotated.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],   # find at app.pinecone.io
    environment=os.environ.get(
        "PINECONE_ENVIRONMENT", "asia-southeast1-gcp-free"
    ),                                        # next to api key in console
)

index_name = "movie-dialogues"

# Handle to the index; created only after import + init so the cell works on a
# fresh kernel.
index = pinecone.Index(index_name)

# Embed all documents and store them in the Pinecone index.
docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [None]:
# Inspect the class of the objects collected in `docs`.
type(docs[0])

In [None]:
# Build a Chroma vector store from the same documents, using the same embedding model.
db = Chroma.from_documents(docs, embeddings)

In [None]:
# Look up the stored dialogue chunks most similar to a free-text query.
query = "the name is nayak, rewanth nayak"
# NOTE(review): this rebinds `result`, which earlier held the per-row document lists.
result = db.similarity_search(query)