In [None]:
from pathlib import Path

import pandas as pd
import yaml
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

In [None]:
# Read the OpenAI API key from a local YAML credentials file so the secret
# never appears in the notebook itself. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open.
with open('../credentials.yml') as credentials_file:
    OPENAI_API_KEY = yaml.safe_load(credentials_file)['openai']

In [None]:
# Load the YouTube video table from CSV; the `page_content` column is used
# further down as the document text.
youtube_df = pd.read_csv('data/youtube_videos.csv')

# Preview the first rows to check the columns.
youtube_df.head()

Unnamed: 0,source,title,description,view_count,thumbnail_url,publish_date,length,author,video_url,page_content
0,3yiHZWr6Izc,A Complete Marketing Strategy In 3 Minutes,Unknown,166839,https://i.ytimg.com/vi/3yiHZWr6Izc/hq720.jpg,2023-04-22 00:00:00,191,GaryVee,https://www.youtube.com/watch?v=3yiHZWr6Izc,the framework for me doing social brand buildi...
1,91D5hjMEADg,55 Minutes of Social Media Content Strategy fo...,Unknown,777650,https://i.ytimg.com/vi/91D5hjMEADg/hq720.jpg,2022-12-08 00:00:00,3257,Think Media,https://www.youtube.com/watch?v=91D5hjMEADg,there's an unspoken question in every person's...
2,UoJh7R8t5Aw,How To Build A Successful Brand Through Social...,Unknown,441927,https://i.ytimg.com/vi/UoJh7R8t5Aw/hq720.jpg,2022-10-03 00:00:00,1303,GaryVee,https://www.youtube.com/watch?v=UoJh7R8t5Aw,I can't imagine being a common sense human bei...
3,lqmA-LrQzcY,The Ultimate Social Media Marketing Strategy,Unknown,361175,https://i.ytimg.com/vi/lqmA-LrQzcY/hq720.jpg,2023-09-11 00:00:00,2720,GaryVee,https://www.youtube.com/watch?v=lqmA-LrQzcY,apple does not sell Nike does not sell most of...
4,i-eIWetPKjM,How To Grow Your Brand On Social Media In 2024,Unknown,189064,https://i.ytimg.com/vi/i-eIWetPKjM/hq720.jpg,2023-09-25 00:00:00,2091,WRLDINVSN Network,https://www.youtube.com/watch?v=i-eIWetPKjM,you have to understand that no matter what it ...


In [None]:
# Replace each doubled newline in the transcript text with a single newline.
# The match is a literal string (regex=False), not a pattern.
transcripts = youtube_df['page_content']
youtube_df['page_content'] = transcripts.str.replace('\n\n', '\n', regex=False)

In [None]:
# Turn the DataFrame into LangChain documents: `page_content` supplies the
# document text, and the remaining columns end up in each document's metadata.
loader = DataFrameLoader(
    data_frame=youtube_df,
    page_content_column='page_content',
)

documents = loader.load()

In [None]:
# Inspect the metadata attached to the first loaded document.
documents[0].metadata

{'source': '3yiHZWr6Izc',
 'title': 'A Complete Marketing Strategy In 3 Minutes',
 'description': 'Unknown',
 'view_count': 166839,
 'thumbnail_url': 'https://i.ytimg.com/vi/3yiHZWr6Izc/hq720.jpg',
 'publish_date': '2023-04-22 00:00:00',
 'length': 191,
 'author': 'GaryVee',
 'video_url': 'https://www.youtube.com/watch?v=3yiHZWr6Izc'}

In [None]:
# Inspect the text of the first loaded document (taken from `page_content`).
documents[0].page_content

"the framework for me doing social brand building and building businesses is what's the brand stand for what's the business objective the next thing we do is we create 40 to 50 consumer segmentations not three right 40 to 50 consumer segmentations 18 to 22 year old males limbing in Bangladesh that are into Esports 21 to 27 year old females in Tokyo making 200 000 a year 40 to 45 year old moms in Malaysia who have an affinity towards High fashion real specific consumer cohorts with real teeth okay next on that is called pack platforms and culture like do you actually understand what the platforms are doing next which platforms are you creating for you've now gone the whole way down now it's we're going to pick Instagram and Facebook and Twitter and for me every brand that's in this room should be on all of them but this is back to allocation more to their dollars go so they can't but they should but they'd rather spend eight million dollars on a TV spot so you lay out the platforms then

In [None]:
# Count the loaded documents (expected to be one per DataFrame row).
len(documents)

10

In [None]:
# Chunk size shared by both text splitters below.
CHUNK_SIZE = 1_000

In [None]:
# Character splitter: splits on a single explicit separator (here a newline)
# with chunk_size set to CHUNK_SIZE. chunk_overlap is not passed, so the
# library default applies.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=CHUNK_SIZE)

docs = text_splitter.split_documents(documents)

# Number of chunks produced by the simple splitter.
len(docs)

85

In [None]:
# Recursive character splitter: keeps splitting recursively until each piece
# is small enough, with a 100-character overlap between neighbouring chunks.
text_splitter_recursive = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=CHUNK_SIZE,
)

docs_recursive = text_splitter_recursive.split_documents(documents)

# Number of chunks produced by the recursive splitter.
len(docs_recursive)

290

In [None]:
# OpenAI embedding client used to vectorise the chunks; the key comes from
# the credentials file loaded earlier.
embedding_function = OpenAIEmbeddings(
    api_key=OPENAI_API_KEY,
    model='text-embedding-ada-002',
)

In [None]:
# Show the configured embeddings object; the API key is masked in the repr.
embedding_function

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7fdf883dc490>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7fdf79524310>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None)

In [None]:
# Build the Chroma vector store once and reuse it on later runs.
# Calling Chroma.from_documents against an existing persist directory embeds
# and adds every chunk again, so each "Run All" would duplicate entries in the
# persisted collection and repeat the embedding API cost.
# NOTE: delete the directory to force a rebuild after changing the chunking.
CHROMA_DIR = Path("data/chroma.db")

if CHROMA_DIR.is_dir() and any(CHROMA_DIR.iterdir()):
    # A persisted store already exists: reopen it without re-embedding.
    vectorstore = Chroma(
        persist_directory=str(CHROMA_DIR),
        embedding_function=embedding_function,
    )
else:
    # First run: embed the chunks and persist them to disk.
    vectorstore = Chroma.from_documents(
        docs,
        embedding=embedding_function,
        persist_directory=str(CHROMA_DIR),
    )

vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x7fdf795387f0>

In [None]:
# Similarity search against the vector store; `k` is not passed, so the
# library's default number of results is returned.
result = vectorstore.similarity_search("How to create a social media strategy")

In [None]:
# Same search, with the number of returned chunks stated explicitly.
result = vectorstore.similarity_search(
    query="How to create a social media strategy",
    k=4,
)

In [None]:
# Text of the first search result.
result[0].page_content

"these days it is standard practice to use social media marketing if you're running a business pretty much every online business and most offline businesses as well try to use social media to get customers online however the sad truth is that especially most small businesses don't really have any sort of strategy they are just posting content randomly or trying to copy their competitors and the end result is that they don't create any real results meaning it doesn't affect their bottom line positively and that it's a massive waste of time because we all know how time-consuming social media can be my name is gillian perkins i'm an online business strategist and i specialize in digital marketing strategy i'm also the founder of startup society which is an online training program for digital entrepreneurs in this video i'm going to teach you the process that i use to develop our social media strategy every year and this is how i've created the strategy that has grown our followers on soci

In [None]:
from pprint import pprint

In [None]:
# Pretty-print the same chunk so the long string is wrapped across lines.
pprint(result[0].page_content)

("these days it is standard practice to use social media marketing if you're "
 'running a business pretty much every online business and most offline '
 'businesses as well try to use social media to get customers online however '
 "the sad truth is that especially most small businesses don't really have any "
 'sort of strategy they are just posting content randomly or trying to copy '
 "their competitors and the end result is that they don't create any real "
 "results meaning it doesn't affect their bottom line positively and that it's "
 'a massive waste of time because we all know how time-consuming social media '
 "can be my name is gillian perkins i'm an online business strategist and i "
 "specialize in digital marketing strategy i'm also the founder of startup "
 'society which is an online training program for digital entrepreneurs in '
 "this video i'm going to teach you the process that i use to develop our "
 "social media strategy every year and this is how i've created 