# NOTEBOOK MUST BE RUN IN ENV: netflix_proj_env

In [2]:
#Confirm in the correct env
!conda env list

# conda environments:
#
                         C:\Program Files\Orange
base                     C:\ProgramData\Anaconda3
ChatDev_conda_env        C:\Users\PhillipRashaad\.conda\envs\ChatDev_conda_env
PandasProfileEnv         C:\Users\PhillipRashaad\.conda\envs\PandasProfileEnv
SMOP_env                 C:\Users\PhillipRashaad\.conda\envs\SMOP_env
autogen_autobuild_env     C:\Users\PhillipRashaad\.conda\envs\autogen_autobuild_env
autogen_studio_env       C:\Users\PhillipRashaad\.conda\envs\autogen_studio_env
automemgpt_env           C:\Users\PhillipRashaad\.conda\envs\automemgpt_env
classy_env               C:\Users\PhillipRashaad\.conda\envs\classy_env
crewai_env               C:\Users\PhillipRashaad\.conda\envs\crewai_env
crewai_poetry_env        C:\Users\PhillipRashaad\.conda\envs\crewai_poetry_env
cupy_tut                 C:\Users\PhillipRashaad\.conda\envs\cupy_tut
datascienv               C:\Users\PhillipRashaad\.conda\envs\datascienv
flask_env                C:\Users\PhillipRash

# 1. LOAD DATA

In [41]:
#Load preprocessed Netflix engagement data (unique titles); the is_original column is added later in this notebook
import pandas as pd

unique_titles_df = pd.read_csv("04 - PREPROCESSED_DATA - Netflix_Engagement_Data.csv")

print(unique_titles_df.shape)

unique_titles_df.head()

(15595, 3)


Unnamed: 0,Clean_Title,Title_Counts,Clean_Title_Char_Len
0,naruto shippuden,21,16
1,greys anatomy,19,13
2,heartland 2007,16,14
3,gogglebox,16,9
4,ncis,15,4


In [32]:
# #Only including titles that have 10 or more characters or have multiple title counts
# unique_titles_df = unique_titles_df[(unique_titles_df['Clean_Title_Char_Len']>=10) | (unique_titles_df['Title_Counts']>=2)]

# print(unique_titles_df.shape)

# unique_titles_df.head()

(13654, 3)


Unnamed: 0,Clean_Title,Title_Counts,Clean_Title_Char_Len
0,naruto shippuden,21,16
1,greys anatomy,19,13
2,heartland 2007,16,14
3,gogglebox,16,9
4,ncis,15,4


# 2. CUSTOM CLASS: `Netflix_Few_Shot_Class`


This class is designed to facilitate the creation and execution of few-shot learning prompts
using the LangChain library, specifically tailored for determining details about Netflix titles.

It encapsulates the process of defining a prompt template, setting up few-shot examples,
initializing a large language model (LLM), and parsing the output.

Methods:
- create_example_prompt(template, input_variables): Defines the prompt template with placeholders for input variables.
- create_few_shot_examples(examples_list): Sets up few-shot examples based on the provided list.
- load_llm_model(llm_model): Loads the specified LLM model for generating responses.
- load_parser_object(): Initializes an output parser for parsing the response from the LLM.
- chain_components(): Chains the few-shot prompt, LLM model, and output parser together for execution.
- run_chain(input_question): Executes the chained components with the given input question, returning the LLM's response.

Usage:
1. Initialize the class with an API key for OpenAI.
2. Call `create_example_prompt` with a template string and list of input variables.
3. Call `create_few_shot_examples` with a list of examples matching the template structure.
4. Create a `ChatOpenAI` object with your API key and preferred model, then pass it to `load_llm_model`.
5. Call `load_parser_object` to initialize the output parser.
6. Call `chain_components` to prepare the execution pipeline.
7. Use `run_chain` with a specific question to get the processed response from the LLM.


## 2A. Instantiate Netflix class with openai api key

In [7]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [9]:
#Import the custom Netflix class object
from backend.netflix_llm_chain import Netflix_Few_Shot_Class

#Initiate class with openai api key
netflix_class = Netflix_Few_Shot_Class(api_key=os.getenv('OPENAI_API_KEY'))

netflix_class

<backend.netflix_llm_chain.Netflix_Few_Shot_Class at 0x21836ff9990>

## 2B. Few-Shot Prompt Creation - Few-Shot Prompting using LangChain object `PromptTemplate`

In [10]:
#STEP 1: Create prompt template using class method create_example_prompt
netflix_class.create_example_prompt(template="Question: {question}\nAnswer {answer}", input_variables=['question', 'answer'])

netflix_class.netflix_example_prompt

PromptTemplate(input_variables=['answer', 'question'], template='Question: {question}\nAnswer {answer}')

In [11]:
#STEP 2: Create prompt few-shot examples using class method create_few_shot_examples

#2A. Add example dicts to a list object variable
orignal_only_examples = [{"question": "Is title 'stranger things' original Netflix content?", "answer":"Yes"},
                         {"question": "Is title 'trolls' original Netflix content?","answer":"No"},
                         {"question": "Is title 'money shot the pornhub story' original Netflix content?", "answer":"Yes"},
                         {"question": "Is title 'avatar the last airbender' original Netflix content?","answer":"No"}
                         ]

orignal_only_examples

[{'question': "Is title 'stranger things' original Netflix content?",
  'answer': 'Yes'},
 {'question': "Is title 'trolls' original Netflix content?", 'answer': 'No'},
 {'question': "Is title 'money shot the pornhub story' original Netflix content?",
  'answer': 'Yes'},
 {'question': "Is title 'avatar the last airbender' original Netflix content?",
  'answer': 'No'}]

In [12]:
#2B. Load prompt few-shot examples using  create_few_shot_examples
netflix_class.create_few_shot_examples(examples_list=orignal_only_examples)

netflix_class.few_shot_prompt

TOTAL FEW-SHOT EXAMPLES: 4




FewShotPromptTemplate(input_variables=['input'], examples=[{'question': "Is title 'stranger things' original Netflix content?", 'answer': 'Yes'}, {'question': "Is title 'trolls' original Netflix content?", 'answer': 'No'}, {'question': "Is title 'money shot the pornhub story' original Netflix content?", 'answer': 'Yes'}, {'question': "Is title 'avatar the last airbender' original Netflix content?", 'answer': 'No'}], example_prompt=PromptTemplate(input_variables=['answer', 'question'], template='Question: {question}\nAnswer {answer}'), suffix='Question: {input}')

## 2C. LLM Creation - Instantiate LangChain ChatOpenAI Object

- Instantiated a ChatOpenAI object from LangChain to be used as the LLM Model with the api_key and model name.

In [16]:
!pip install langchain-openai

Collecting langchain-openai
  Downloading langchain_openai-0.1.7-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain-core<0.3,>=0.1.46 (from langchain-openai)
  Downloading langchain_core-0.2.0-py3-none-any.whl.metadata (5.9 kB)
Collecting openai<2.0.0,>=1.24.0 (from langchain-openai)
  Downloading openai-1.30.1-py3-none-any.whl.metadata (21 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.7.0-cp310-cp310-win_amd64.whl.metadata (6.8 kB)
Collecting regex>=2022.1.18 (from tiktoken<1,>=0.7->langchain-openai)
  Downloading regex-2024.5.15-cp310-cp310-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     -------------------------------------  41.0/42.0 kB 991.0 kB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 511.8 kB/s eta 0:00:00
Downloading langchain_openai-0.1.7-py3-none-any.whl (34 kB)
Downloading langchain_core-0.2.0-py3-none-any.whl (307 kB)
   --------------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.1.11 requires langchain-core<0.2,>=0.1.29, but you have langchain-core 0.2.0 which is incompatible.
langchain-community 0.0.27 requires langchain-core<0.2.0,>=0.1.30, but you have langchain-core 0.2.0 which is incompatible.
langchain-text-splitters 0.0.1 requires langchain-core<0.2.0,>=0.1.28, but you have langchain-core 0.2.0 which is incompatible.


In [None]:
#STEP 3. Create a `ChatOpenAI` object with your API key and preferred model, then pass it to `load_llm_model`.
##EXAMPLE GPT-3.5

#3A. Create LLM model object
#Load ChatOpenAI object from langchain
from langchain_community.chat_models import ChatOpenAI
#from langchain_openai import ChatOpenAI


#Create object with api_key & model_name as input variables
llm_model_3_5 = ChatOpenAI( api_key=os.getenv('OPENAI_API_KEY'), model_name="gpt-3.5-turbo",   temperature=0)

#llm_model_3_5

In [19]:
%%time

##EXAMPLE GPT-3.5
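### TIME - Wall time: 1.26 s ---NOTE: Runs ~9x faster than the larger model tested (the original note said "than gpt-3.5", which is this model -- TODO confirm the comparison model)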
#Confirm gpt-3.5 model is working
llm_model_3_5.invoke("Is title 'spongebob squarepants' original Netflix content?")

CPU times: total: 703 ms
Wall time: 1.76 s


AIMessage(content='No, "SpongeBob SquarePants" is not original Netflix content. It is a popular animated television series created by Stephen Hillenburg that originally aired on Nickelodeon.')

In [21]:
### THIS IS WHERE I CHOOSE OPENAI MODEL TO USE
#3B. Use load desired model using class method load_llm_model
netflix_class.load_llm_model(llm_model_3_5)

#netflix_class.llm_model

## 2D. Load LangChain Output Parser `StrOutputParser()`
- Loaded a parser object StrOutputParser() from the LangChain library to parse the responses from the OpenAI API.

In [22]:
#Confirm parser is instantiated within Netflix Class
netflix_class.output_parser

StrOutputParser()

## 2E. Create Few-Shot LLMChain
- Created a LangChain few-shot LLMChain using FewShotPromptTemplate, ChatOpenAI, and StrOutputParser() as the components.

In [None]:
#4. Call `chain_components` to prepare the execution pipeline. NOTE: run method without ()

#Use method to create LLMchain within Class object
netflix_class.chain_components()

#Show method output
netflix_class.parser_chain

## 2F. Confirm LLM Chain is working

In [24]:
%%time

#5. Use `run_chain` with a specific question to get the processed response from the LLM.
class_response = netflix_class.run_chain("Is title 'spongebob squarepants' original Netflix content?")

class_response

CPU times: total: 31.2 ms
Wall time: 786 ms


{'input': "Is title 'spongebob squarepants' original Netflix content?",
 'text': 'Answer No'}

# 3. DATA ENHANCEMENT

## 3A. **is_original**: Indicates whether a title is original Netflix content.

In [42]:
#A function that takes the input string parameter "input_title" and returns the string "Is title '{input_title}' original Netflix content?"

def format_title_question_isoriginal(input_title):
  """Build the question asking whether a title is original Netflix content.

  Args:
    input_title: Title text placed between the single quotes of the question.

  Returns:
    The string "Is title '<input_title>' original Netflix content?".
  """
  question = f"Is title '{input_title}' original Netflix content?"
  return question



In [43]:
# Example usage
input_title = "Stranger Things"
formatted_question = format_title_question_isoriginal(input_title)
print(formatted_question)


Is title 'Stranger Things' original Netflix content?


In [44]:
#Use function to create the is_original_Prompt column to be used with the LLM model
unique_titles_df['is_original_Prompt'] = unique_titles_df['Clean_Title'].apply(format_title_question_isoriginal)


print(unique_titles_df.shape)

unique_titles_df.head()

(15595, 4)


Unnamed: 0,Clean_Title,Title_Counts,Clean_Title_Char_Len,is_original_Prompt
0,naruto shippuden,21,16,Is title 'naruto shippuden' original Netflix c...
1,greys anatomy,19,13,Is title 'greys anatomy' original Netflix cont...
2,heartland 2007,16,14,Is title 'heartland 2007' original Netflix con...
3,gogglebox,16,9,Is title 'gogglebox' original Netflix content?
4,ncis,15,4,Is title 'ncis' original Netflix content?


In [45]:
#Use class on single prompt in df
single_response = netflix_class.run_chain(unique_titles_df["is_original_Prompt"].iloc[0])

single_response

{'input': "Is title 'naruto shippuden' original Netflix content?",
 'text': 'Answer No'}

## 3B. **content_type**: Categorizes the title as a "Series", "Movie", or "Documentary".

In [46]:
#FUNCTION 1
def format_title_question_content_type(input_title):
  """Build the question asking which content type a Netflix title is.

  Args:
    input_title: Title text placed between the single quotes of the question.

  Returns:
    The string "Is Netflix title '<input_title>' a Series, Movie, or Documentary?".
  """
  question = f"Is Netflix title '{input_title}' a Series, Movie, or Documentary?"
  return question



In [47]:
# Example usage
input_title = "Stranger Things"
formatted_question = format_title_question_content_type(input_title)
print(formatted_question)


Is Netflix title 'Stranger Things' a Series, Movie, or Documentary?


In [48]:
#STEP 1. CREATE COLUMN WITH QUESTION FORMATTER
unique_titles_df['Content_Type_Prompt'] = unique_titles_df['Clean_Title'].apply(format_title_question_content_type)

print(unique_titles_df.shape)

unique_titles_df.head()

(15595, 5)


Unnamed: 0,Clean_Title,Title_Counts,Clean_Title_Char_Len,is_original_Prompt,Content_Type_Prompt
0,naruto shippuden,21,16,Is title 'naruto shippuden' original Netflix c...,"Is Netflix title 'naruto shippuden' a Series, ..."
1,greys anatomy,19,13,Is title 'greys anatomy' original Netflix cont...,"Is Netflix title 'greys anatomy' a Series, Mov..."
2,heartland 2007,16,14,Is title 'heartland 2007' original Netflix con...,"Is Netflix title 'heartland 2007' a Series, Mo..."
3,gogglebox,16,9,Is title 'gogglebox' original Netflix content?,"Is Netflix title 'gogglebox' a Series, Movie, ..."
4,ncis,15,4,Is title 'ncis' original Netflix content?,"Is Netflix title 'ncis' a Series, Movie, or Do..."


In [49]:
#STEP 2B. Confirm the output for single response is a string
#PHIL #3: Added another method run_chain_text_only to return only text from parser object
single_response_text = netflix_class.run_chain_text_only(unique_titles_df["Content_Type_Prompt"].iloc[0])

single_response_text

'Answer: Series'

In [50]:
#Save to csv
unique_titles_df.to_csv('06 - PROMPT_DATA - Netflix_Engagement_Data.csv', index=False)

## 3C. SAMPLE RUN
- Use the Netflix LLM chain class to enhance data for a sample of 20 rows to verify its functionality.

In [54]:
#STEP 2A. Use a sample to experiment with and confirm the class function works
sample_df = unique_titles_df.head(20).copy()

print(sample_df.shape)

sample_df.head()

(20, 5)


Unnamed: 0,Clean_Title,Title_Counts,Clean_Title_Char_Len,is_original_Prompt,Content_Type_Prompt
0,naruto shippuden,21,16,Is title 'naruto shippuden' original Netflix c...,"Is Netflix title 'naruto shippuden' a Series, ..."
1,greys anatomy,19,13,Is title 'greys anatomy' original Netflix cont...,"Is Netflix title 'greys anatomy' a Series, Mov..."
2,heartland 2007,16,14,Is title 'heartland 2007' original Netflix con...,"Is Netflix title 'heartland 2007' a Series, Mo..."
3,gogglebox,16,9,Is title 'gogglebox' original Netflix content?,"Is Netflix title 'gogglebox' a Series, Movie, ..."
4,ncis,15,4,Is title 'ncis' original Netflix content?,"Is Netflix title 'ncis' a Series, Movie, or Do..."


In [55]:
%%time

#Create Prompt_Response with class method run_chain_text_only
sample_df['is_original'] = sample_df["is_original_Prompt"].apply(netflix_class.run_chain_text_only)

print(sample_df.shape)

sample_df

(20, 6)
CPU times: total: 2.38 s
Wall time: 11.5 s


Unnamed: 0,Clean_Title,Title_Counts,Clean_Title_Char_Len,is_original_Prompt,Content_Type_Prompt,is_original
0,naruto shippuden,21,16,Is title 'naruto shippuden' original Netflix c...,"Is Netflix title 'naruto shippuden' a Series, ...",Answer No
1,greys anatomy,19,13,Is title 'greys anatomy' original Netflix cont...,"Is Netflix title 'greys anatomy' a Series, Mov...",Answer No
2,heartland 2007,16,14,Is title 'heartland 2007' original Netflix con...,"Is Netflix title 'heartland 2007' a Series, Mo...",Answer No
3,gogglebox,16,9,Is title 'gogglebox' original Netflix content?,"Is Netflix title 'gogglebox' a Series, Movie, ...",Answer No
4,ncis,15,4,Is title 'ncis' original Netflix content?,"Is Netflix title 'ncis' a Series, Movie, or Do...",Answer No
5,murdoch mysteries,15,17,Is title 'murdoch mysteries' original Netflix ...,"Is Netflix title 'murdoch mysteries' a Series,...",Answer No
6,supernatural 2005,15,17,Is title 'supernatural 2005' original Netflix ...,"Is Netflix title 'supernatural 2005' a Series,...",Answer No
7,its always sunny in philadelphia,15,32,Is title 'its always sunny in philadelphia' or...,Is Netflix title 'its always sunny in philadel...,Answer No
8,archer 2009,13,11,Is title 'archer 2009' original Netflix content?,"Is Netflix title 'archer 2009' a Series, Movie...",Answer No
9,two and a half men,12,18,Is title 'two and a half men' original Netflix...,Is Netflix title 'two and a half men' a Series...,Answer No


In [56]:
%%time

#STEP 2C. Confirm it works on entire df column
#Create Prompt_Response with class method run_chain_text_only
#NOTE: 20 rows = 11s for model gpt-3.5
sample_df['content_type'] = sample_df["Content_Type_Prompt"].apply(netflix_class.run_chain_text_only)

print(sample_df.shape)

sample_df.head(10)

(20, 7)
CPU times: total: 1.62 s
Wall time: 12.2 s


Unnamed: 0,Clean_Title,Title_Counts,Clean_Title_Char_Len,is_original_Prompt,Content_Type_Prompt,is_original,content_type
0,naruto shippuden,21,16,Is title 'naruto shippuden' original Netflix c...,"Is Netflix title 'naruto shippuden' a Series, ...",Answer No,Answer: Series
1,greys anatomy,19,13,Is title 'greys anatomy' original Netflix cont...,"Is Netflix title 'greys anatomy' a Series, Mov...",Answer No,Answer: Series
2,heartland 2007,16,14,Is title 'heartland 2007' original Netflix con...,"Is Netflix title 'heartland 2007' a Series, Mo...",Answer No,Answer: Series
3,gogglebox,16,9,Is title 'gogglebox' original Netflix content?,"Is Netflix title 'gogglebox' a Series, Movie, ...",Answer No,Answer: Series
4,ncis,15,4,Is title 'ncis' original Netflix content?,"Is Netflix title 'ncis' a Series, Movie, or Do...",Answer No,Answer: Series
5,murdoch mysteries,15,17,Is title 'murdoch mysteries' original Netflix ...,"Is Netflix title 'murdoch mysteries' a Series,...",Answer No,Answer: Series
6,supernatural 2005,15,17,Is title 'supernatural 2005' original Netflix ...,"Is Netflix title 'supernatural 2005' a Series,...",Answer No,Answer: Series
7,its always sunny in philadelphia,15,32,Is title 'its always sunny in philadelphia' or...,Is Netflix title 'its always sunny in philadel...,Answer No,Answer: Series
8,archer 2009,13,11,Is title 'archer 2009' original Netflix content?,"Is Netflix title 'archer 2009' a Series, Movie...",Answer No,Answer: Series
9,two and a half men,12,18,Is title 'two and a half men' original Netflix...,Is Netflix title 'two and a half men' a Series...,Answer No,Answer: Series


## 3D. BATCH PROCESSING
- To avoid timeout errors and API rate limits, data enhancement must be performed through batch calls.

In [51]:
#FUNCTION 2
import pandas as pd

def chunk_dataframe(df, row_chunk=1000):
    """
    Cut a DataFrame into consecutive row blocks of at most `row_chunk` rows.

    Parameters:
    - df: pandas.DataFrame, the frame to split; rows are taken positionally, in order.
    - row_chunk: int, the maximum number of rows per block. The final block
      holds whatever rows remain and may be shorter.

    Returns:
    - List[pd.DataFrame], the blocks in original row order (an empty list
      when `df` has no rows).

    Raises:
    - ValueError: if `row_chunk` is zero or negative.
    """
    if row_chunk <= 0:
        raise ValueError("row_chunk must be a positive integer")

    total_rows = len(df)

    # Walk the starting offsets directly; each slice ends row_chunk rows later
    # (iloc clips the last slice at the end of the frame).
    pieces = []
    for start in range(0, total_rows, row_chunk):
        pieces.append(df.iloc[start : start + row_chunk])

    return pieces



In [52]:
#FUNCTION 3
#ADDED dynamic input parameters for easier use with the LLM few-shot functions

def process_and_save_dataframes(df_chunks, classobjectfunc, files_folder_path, prompt_col='Content_Type_Prompt', output_col='content_type'):
    """Apply a prompt-processing callable to each DataFrame chunk and save each result to CSV.

    For every chunk, a copy is made, ``classobjectfunc`` is applied element-wise
    to the ``prompt_col`` column, the results are stored in ``output_col``, and
    the enriched copy is written to ``files_folder_path`` before the next chunk
    is started (so chunks already written stay on disk if a later chunk fails).
    The chunks passed in are left unmodified.

    Args:
        df_chunks (list): List of DataFrame chunks to process, in order.
        classobjectfunc (callable): Called once per value of ``prompt_col``;
            its return value becomes the matching cell of ``output_col``.
        files_folder_path (str): Folder that receives one CSV per chunk. It is
            created if it does not exist; a trailing path separator is optional.
            An empty string writes to the current working directory.
        prompt_col (str): Name of the column holding the prompts.
        output_col (str): Name of the column that receives the responses.

    Returns:
        list: The processed DataFrame copies, in the same order as ``df_chunks``.
        An empty list is returned when ``df_chunks`` is empty.
    """
    import os  # local import so this cell does not depend on an earlier cell's import

    # Nothing to do (and no first chunk to size the labels from).
    if not df_chunks:
        return []

    # Create the output folder up front so a missing folder is caught before
    # any processing time is spent, not at the first to_csv call.
    if files_folder_path:
        os.makedirs(files_folder_path, exist_ok=True)

    # The first chunk's row count is used as the nominal chunk size in file names.
    chunk_size = df_chunks[0].shape[0]

    # Collects the processed copies returned to the caller.
    output_df_list = []

    for index, df in enumerate(df_chunks):
        # Nominal end-row label for this chunk; for a shorter final chunk this
        # is the rounded-up value, not the exact cumulative row count.
        chunk_counter_name = index * chunk_size + chunk_size

        print(f'IDX:--{index}--{chunk_counter_name}\n\n')

        # Work on a copy: the chunks are typically slices of a larger frame,
        # and assigning into a slice mutates caller data and triggers
        # SettingWithCopyWarning.
        processed = df.copy()
        processed[output_col] = processed[prompt_col].apply(classobjectfunc)

        # Join with os.path.join so the folder works with or without a trailing slash.
        csv_name = f"10 - E - df_chunk_{index}_{chunk_counter_name}.csv"
        processed.to_csv(os.path.join(files_folder_path, csv_name), index=False)

        output_df_list.append(processed)

    return output_df_list



In [53]:
#STEP 3. SPLIT DF INTO CHUNKS
#Use function to chunk unique titles df
chunks_df_list = chunk_dataframe(df=unique_titles_df, row_chunk=1000)

print('TOTAL DF CHUNKS: ',len(chunks_df_list))

#Show df shape
print(chunks_df_list[0].shape)

#Show df head
chunks_df_list[0].head()


TOTAL DF CHUNKS:  16
(1000, 5)


Unnamed: 0,Clean_Title,Title_Counts,Clean_Title_Char_Len,is_original_Prompt,Content_Type_Prompt
0,naruto shippuden,21,16,Is title 'naruto shippuden' original Netflix c...,"Is Netflix title 'naruto shippuden' a Series, ..."
1,greys anatomy,19,13,Is title 'greys anatomy' original Netflix cont...,"Is Netflix title 'greys anatomy' a Series, Mov..."
2,heartland 2007,16,14,Is title 'heartland 2007' original Netflix con...,"Is Netflix title 'heartland 2007' a Series, Mo..."
3,gogglebox,16,9,Is title 'gogglebox' original Netflix content?,"Is Netflix title 'gogglebox' a Series, Movie, ..."
4,ncis,15,4,Is title 'ncis' original Netflix content?,"Is Netflix title 'ncis' a Series, Movie, or Do..."


In [57]:
csv_folder_path = 'csv_chunks/'

csv_folder_path

'csv_chunks/'

In [None]:
%%time
#NOTE: 14.6k rows took 1hr 49mins to run

#STEP 4B. Use function to loop over the df list of chunks
#Make sure to enter ALL input parameters

final_df_list = process_and_save_dataframes(df_chunks=chunks_df_list , classobjectfunc=netflix_class.run_chain_text_only,
                                            files_folder_path=csv_folder_path, prompt_col='Content_Type_Prompt', output_col='content_type')

print(len(final_df_list))

## 3E. CONSOLIDATE BATCH FILES

In [58]:
import os
import pandas as pd

def load_csv_files_to_dataframe(directory):
    """
    Load all CSV files in the specified directory into a single dataframe.

    Files are read in natural (numeric-aware) file-name order, so
    "..._chunk_2_..." comes before "..._chunk_10_...". This keeps the row
    order, and therefore the fresh 0..n-1 index of the result, the same on
    every run instead of depending on the order os.listdir happens to return.

    Parameters:
    directory (str): The path to the directory containing the CSV files.

    Returns:
    pd.DataFrame: A dataframe containing the concatenated data from all CSV
    files, with a new consecutive integer index.

    Raises:
    ValueError: If the directory contains no file whose name ends in '.csv'.
    """
    import re  # local import: only needed for the natural-sort key

    def _natural_key(name):
        # re.split with a capturing group alternates text / digit-run pieces,
        # so the odd positions are always the numeric parts.
        pieces = re.split(r'(\d+)', name)
        natural = [int(piece) if pos % 2 else piece.lower() for pos, piece in enumerate(pieces)]
        # The raw name breaks ties between names that differ only by case or zero padding.
        return (natural, name)

    # List all CSV files in the directory, in a deterministic order
    csv_files = sorted(
        (f for f in os.listdir(directory) if f.endswith('.csv')),
        key=_natural_key,
    )

    # Fail with a clear message instead of pandas' "No objects to concatenate"
    if not csv_files:
        raise ValueError(f"No CSV files found in directory: {directory!r}")

    # Read each CSV file into a dataframe
    dataframes = [pd.read_csv(os.path.join(directory, file)) for file in csv_files]

    # Concatenate all dataframes into one
    combined_df = pd.concat(dataframes, ignore_index=True)

    return combined_df




In [59]:
csv_folder_path

'csv_chunks/'

In [60]:
# Example usage:
# directory = 'csv_chunks/'
combined_df = load_csv_files_to_dataframe(directory=csv_folder_path)

print(combined_df.shape)

combined_df

(14608, 7)


Unnamed: 0,Clean_Title,Title_Counts,Clean_Title_Prompt,is_original,Clean_Title_Char_Len,Content_Type_Prompt,content_type
0,naruto shippuden,21,Is title 'naruto shippuden' original Netflix c...,Answer No,16,"Is Netflix title 'naruto shippuden' a Series, ...",Answer Series
1,greys anatomy,19,Is title 'greys anatomy' original Netflix cont...,Answer No,13,"Is Netflix title 'greys anatomy' a Series, Mov...",Answer Series
2,gogglebox,16,Is title 'gogglebox' original Netflix content?,Answer No,9,"Is Netflix title 'gogglebox' a Series, Movie, ...",Answer Series
3,heartland 2007,16,Is title 'heartland 2007' original Netflix con...,Answer No,14,"Is Netflix title 'heartland 2007' a Series, Mo...",Answer Series
4,its always sunny in philadelphia,15,Is title 'its always sunny in philadelphia' or...,Answer No,32,Is Netflix title 'its always sunny in philadel...,Answer Series
...,...,...,...,...,...,...,...
14603,rage,1,Is title 'rage' original Netflix content?,Answer No,4,"Is Netflix title 'rage' a Series, Movie, or Do...",Answer Movie
14604,jessabelle,1,Is title 'jessabelle' original Netflix content?,Answer No,10,"Is Netflix title 'jessabelle' a Series, Movie,...",Answer Movie
14605,mnnerhort,1,Is title 'mnnerhort' original Netflix content?,Answer No,9,"Is Netflix title 'mnnerhort' a Series, Movie, ...",Answer Movie
14606,vadh,1,Is title 'vadh' original Netflix content?,Answer No,4,"Is Netflix title 'vadh' a Series, Movie, or Do...",Answer Movie


In [61]:
#Save to csv
combined_df.to_csv('07 - ENHANCED_DATA - Netflix_Engagement_Date.csv', index=False)

# 4. CLEAN ENHANCEMENT DATA

## 4A CLEAN COLUMN - is_original

- Should only include "Yes" or "No"

In [64]:
def check_string_yes_no(s):
    """Reduce a free-text answer to "Yes", "No", or "" (undetermined).

    The value is converted with str() and lower-cased, then searched for the
    whole words "yes" or "no". Whole-word matching keeps values such as None
    ("none"), "unknown", or "not clear" from being labelled "No", and "eyes"
    from being labelled "Yes". When both words occur, the one that appears
    first wins, so "No, 'yes man' is not ..." is read as "No".

    Args:
        s: The raw answer; any type is accepted (non-strings are stringified).

    Returns:
        str: "Yes", "No", or "" when neither word is present.
    """
    import re  # local import so this cell is self-contained

    # Make sure it is a string and remove case sensitivity
    text = str(s).lower()

    # \b restricts the match to standalone words; search() returns the earliest one
    verdict = re.search(r"\b(yes|no)\b", text)
    if verdict is None:
        return ""
    return "Yes" if verdict.group(1) == "yes" else "No"


In [65]:
combined_df.head()

Unnamed: 0,Clean_Title,Title_Counts,Clean_Title_Prompt,is_original,Clean_Title_Char_Len,Content_Type_Prompt,content_type
0,naruto shippuden,21,Is title 'naruto shippuden' original Netflix c...,Answer No,16,"Is Netflix title 'naruto shippuden' a Series, ...",Answer Series
1,greys anatomy,19,Is title 'greys anatomy' original Netflix cont...,Answer No,13,"Is Netflix title 'greys anatomy' a Series, Mov...",Answer Series
2,gogglebox,16,Is title 'gogglebox' original Netflix content?,Answer No,9,"Is Netflix title 'gogglebox' a Series, Movie, ...",Answer Series
3,heartland 2007,16,Is title 'heartland 2007' original Netflix con...,Answer No,14,"Is Netflix title 'heartland 2007' a Series, Mo...",Answer Series
4,its always sunny in philadelphia,15,Is title 'its always sunny in philadelphia' or...,Answer No,32,Is Netflix title 'its always sunny in philadel...,Answer Series


In [66]:
combined_df['is_original'].value_counts()

is_original
Answer No                                                                                                                                                           9566
Answer: No                                                                                                                                                          2528
Answer Yes                                                                                                                                                          2058
Answer: Yes                                                                                                                                                          399
Answer: It is unclear without more specific information.                                                                                                               5
Answer: It is unclear as there are multiple titles with similar names.                                                                         

In [67]:
#Use function to clean col
combined_df['is_original'] = combined_df['is_original'].apply(check_string_yes_no)

#View counts
combined_df['is_original'].value_counts()

is_original
No     12111
Yes     2457
          40
Name: count, dtype: int64

In [68]:

#View ratio
combined_df['is_original'].value_counts(normalize=True)

is_original
No     0.829066
Yes    0.168196
       0.002738
Name: proportion, dtype: float64

## 4B. CLEAN - content_type

- Should only include Series, Movie, or Documentary

In [72]:
def check_media_type(s):
    """Reduce a free-text answer to "Series", "Movie", "Documentary", or "".

    The value is converted with str() before matching, so missing values
    (None, or the float NaN that pandas uses for blank CSV cells) return ""
    instead of raising AttributeError. Matching is a case-insensitive
    substring test, checked in the order series -> movie -> documentary, so
    an answer that mentions several types gets the first type in that order.

    Args:
        s: The raw answer; any type is accepted (non-strings are stringified).

    Returns:
        str: "Series", "Movie", "Documentary", or "" when none is mentioned.
    """
    lower_s = str(s).lower()  # stringify first, then lowercase for case-insensitive comparison

    # Ordered (keyword, label) pairs; earlier entries take precedence.
    for keyword, label in (("series", "Series"), ("movie", "Movie"), ("documentary", "Documentary")):
        if keyword in lower_s:
            return label
    return ""


In [73]:
#BEFORE CLEANING
combined_df['content_type'].value_counts()

content_type
Answer Movie                                                                                                                6962
Answer Series                                                                                                               5546
Answer Documentary                                                                                                           674
Answer: Series                                                                                                               249
Answer: Movie                                                                                                                199
                                                                                                                            ... 
Answer: It could be any of the three, as there is no specific title on Netflix called "shanty town."                           1
Answer: It could refer to either a movie or a series, as there are multiple titles w

In [74]:
#AFTER CLEANING

#Use function to clean col
combined_df['content_type'] = combined_df['content_type'].apply(check_media_type)

#View counts
combined_df['content_type'].value_counts()

content_type
Movie          7184
Series         6025
Documentary     708
                691
Name: count, dtype: int64

In [75]:
#View ratio
combined_df['content_type'].value_counts(normalize=True)

content_type
Movie          0.491785
Series         0.412445
Documentary    0.048467
               0.047303
Name: proportion, dtype: float64

## 4C. ADD COLUMN API_DATA_ID

This helps identify which engagement record was merged with which API data record.

In [76]:
def add_api_data_id_column_first(df):
    """
    Adds a new column 'api_data_id' as the first column of the dataframe.

    Each id has the form "api-<index label>-<is_original>-<content_type>",
    built from the row's index label and its values in those two columns.
    The dataframe is modified in place and also returned. If an
    'api_data_id' column already exists (for example when the cell is
    re-run), it is replaced with freshly computed ids rather than raising.

    Args:
    df (pd.DataFrame): The input dataframe with 'is_original' and 'content_type' columns.

    Returns:
    pd.DataFrame: The same dataframe object with the 'api_data_id' column at the first position.

    Raises:
    KeyError: If 'is_original' or 'content_type' is missing from the dataframe.
    """
    missing = [col for col in ('is_original', 'content_type') if col not in df.columns]
    if missing:
        raise KeyError(f"DataFrame is missing required column(s): {missing}")

    # Build the ids in one pass over the three aligned sequences. This avoids
    # a row-wise apply (slow, and it returns a DataFrame instead of a Series
    # when the frame is empty).
    api_data_id = [
        f"api-{row_label}-{is_original}-{content_type}"
        for row_label, is_original, content_type in zip(df.index, df['is_original'], df['content_type'])
    ]

    # Drop a stale id column first so the insert below cannot fail on a re-run.
    if 'api_data_id' in df.columns:
        del df['api_data_id']

    df.insert(0, 'api_data_id', api_data_id)  # Inserting at the first position
    return df



In [77]:
combined_df.head()

Unnamed: 0,Clean_Title,Title_Counts,Clean_Title_Prompt,is_original,Clean_Title_Char_Len,Content_Type_Prompt,content_type
0,naruto shippuden,21,Is title 'naruto shippuden' original Netflix c...,No,16,"Is Netflix title 'naruto shippuden' a Series, ...",Series
1,greys anatomy,19,Is title 'greys anatomy' original Netflix cont...,No,13,"Is Netflix title 'greys anatomy' a Series, Mov...",Series
2,gogglebox,16,Is title 'gogglebox' original Netflix content?,No,9,"Is Netflix title 'gogglebox' a Series, Movie, ...",Series
3,heartland 2007,16,Is title 'heartland 2007' original Netflix con...,No,14,"Is Netflix title 'heartland 2007' a Series, Mo...",Series
4,its always sunny in philadelphia,15,Is title 'its always sunny in philadelphia' or...,No,32,Is Netflix title 'its always sunny in philadel...,Series


In [78]:
#Use function to create column 'api_data_id' 
combined_df = add_api_data_id_column_first(combined_df)

print(combined_df.shape)

combined_df

(14608, 8)


Unnamed: 0,api_data_id,Clean_Title,Title_Counts,Clean_Title_Prompt,is_original,Clean_Title_Char_Len,Content_Type_Prompt,content_type
0,api-0-No-Series,naruto shippuden,21,Is title 'naruto shippuden' original Netflix c...,No,16,"Is Netflix title 'naruto shippuden' a Series, ...",Series
1,api-1-No-Series,greys anatomy,19,Is title 'greys anatomy' original Netflix cont...,No,13,"Is Netflix title 'greys anatomy' a Series, Mov...",Series
2,api-2-No-Series,gogglebox,16,Is title 'gogglebox' original Netflix content?,No,9,"Is Netflix title 'gogglebox' a Series, Movie, ...",Series
3,api-3-No-Series,heartland 2007,16,Is title 'heartland 2007' original Netflix con...,No,14,"Is Netflix title 'heartland 2007' a Series, Mo...",Series
4,api-4-No-Series,its always sunny in philadelphia,15,Is title 'its always sunny in philadelphia' or...,No,32,Is Netflix title 'its always sunny in philadel...,Series
...,...,...,...,...,...,...,...,...
14603,api-14603-No-Movie,rage,1,Is title 'rage' original Netflix content?,No,4,"Is Netflix title 'rage' a Series, Movie, or Do...",Movie
14604,api-14604-No-Movie,jessabelle,1,Is title 'jessabelle' original Netflix content?,No,10,"Is Netflix title 'jessabelle' a Series, Movie,...",Movie
14605,api-14605-No-Movie,mnnerhort,1,Is title 'mnnerhort' original Netflix content?,No,9,"Is Netflix title 'mnnerhort' a Series, Movie, ...",Movie
14606,api-14606-No-Movie,vadh,1,Is title 'vadh' original Netflix content?,No,4,"Is Netflix title 'vadh' a Series, Movie, or Do...",Movie


In [79]:
#Save to csv
combined_df.to_csv('08 - FORMATTED_ENHANCED_DATA - Netflix_Engagement_Date.csv', index=False)

<class 'str'>


'You are a manager agent. You direct the actions of the researcher agent. You are responsible for creating plans and strategies, coordinating activities, and compiling the final response.'