In [1]:
import json
import re

import openai
import pandas as pd

In [2]:
# Load api Key
def get_credentials() -> dict:
    """
    Read the OpenAI credentials stored in the local ``config.json`` file.

    Returns:
    dict: The parsed JSON content of ``config.json`` (looked up relative to
          the current working directory).
    """
    with open('config.json', 'r') as config_file:
        return json.load(config_file)

In [None]:
config = get_credentials()
openai.api_key = config['open_api_key']
# Never echo the key itself: cell outputs are saved inside the notebook file and
# would leak the secret to anyone the notebook is shared with or committed to.
print("OpenAI API key loaded:", bool(openai.api_key))

In [4]:
# Constants
MODEL = "gpt-4"  # or "gpt-3.5-turbo"
MOVIE_DATA = "df_top_20_movies_customers_reviewed_all.csv"
DATASET_PATH = "./data"

# Movies whose customer rating is included in the prompt
TRAINING_MOVIES = [
    'American Beauty',
    'The Wedding Planner',
    'Man on Fire',
    'S.W.A.T.',
    'Pirates of the Caribbean: The Curse of the Black Pearl',
    '50 First Dates',
    'What Women Want',
    'The Bourne Supremacy',
    'Lord of the Rings: The Fellowship of the Ring',
    'Braveheart',
]

# Movies used as the held-out set when looking up the customer's top-rated picks
TEST_MOVIES = [
    'The Silence of the Lambs',
    'Ghost',
    'Men in Black II',
    'The Last Samurai',
    'Bruce Almighty',
    'Shrek 2',
    'Finding Nemo (Widescreen)',
    'Patch Adams',
    'The Sixth Sense',
    'The Italian Job',
]


## Helper Functions

In [None]:
def get_customer_movies(df, customer):
    """
    Select the rows of ``df`` that belong to a single customer.

    Parameters:
    df (pd.DataFrame): Ratings table with a 'Cust_Id' column.
    customer: The customer id to keep.

    Returns:
    pd.DataFrame: The rows of ``df`` whose 'Cust_Id' equals ``customer``.
    """
    belongs_to_customer = df['Cust_Id'] == customer
    return df[belongs_to_customer]

In [None]:
def find_eligible_picks(customer_df, test_movies=None):
    """
    Our Movie ratings aren't very distinct. The customer is rating from 1-5.
    Therefore, we will often have multiple "correct" answers in the test set of 10 movies.

    Let's say the customer has rated 3 movies 5 stars.
    If we recommend any of those 3 as our top recommendation, we will have a correct answer.

    Parameters:
    customer_df (pd.DataFrame): A DataFrame containing customer movie data. The DataFrame should include the following columns:
                                - 'Name': The name of the movie.
                                - 'Rating': The rating given by the customer.
    test_movies (iterable of str, optional): Names of the movies that form the test set.
                                Defaults to the module-level TEST_MOVIES.

    Returns:
    np.ndarray: An array of movie names that have the highest rating in the test set.
                Empty when none of the customer's movies are in the test set.
    """
    if test_movies is None:
        test_movies = TEST_MOVIES

    test_movies_df = customer_df[customer_df['Name'].isin(test_movies)]

    # Nothing to rank: return an empty array instead of failing on a positional lookup
    if test_movies_df.empty:
        return test_movies_df['Name'].values

    # The highest rating is computed directly, so the filtered frame is never
    # sorted in place (which would mutate a slice of the caller's DataFrame
    # and trigger SettingWithCopyWarning).
    top_rating = test_movies_df['Rating'].max()

    # Every movie sharing the top rating is an equally valid "favourite"
    eligible_picks = test_movies_df.loc[test_movies_df['Rating'] == top_rating, 'Name'].values
    return eligible_picks

In [None]:
def compose_movie_prompt_jsons(customer_df: pd.DataFrame, training_movies=None, test_movies=None) -> tuple:
    """
    Processes a DataFrame containing customer movie data, categorizes the movies into training and test sets
    based on the given movie name lists, and converts these sets into JSON formatted strings.

    Training movies carry the customer's own rating and rating date; test movies only carry
    the public metadata, so the customer's rating of a candidate is never put in the test JSON.
    Movies that appear in neither list are left out, which keeps the candidates consistent
    with the set that find_eligible_picks scores against.

    Parameters:
    customer_df (pd.DataFrame): The DataFrame containing customer movie data.
                       The DataFrame should include the following columns:
                       - 'Name': The name of the movie.
                       - 'Movie_Year': The release year of the movie.
                       - 'Average_Rating': The average rating of the movie.
                       - 'Review_Count': The total number of reviews for the movie.
                       - 'Rating': The rating given by the customer.
                       - 'Date': The date the rating was given.
    training_movies (iterable of str, optional): Names of the training movies.
                       Defaults to the module-level TRAINING_MOVIES.
    test_movies (iterable of str, optional): Names of the test (candidate) movies.
                       Defaults to the module-level TEST_MOVIES.

    Returns:
    tuple: A tuple containing:
        - training_json (str): A JSON formatted string containing the training movies data.
        - test_json (str): A JSON formatted string containing the test movies data.
    """
    if training_movies is None:
        training_movies = TRAINING_MOVIES
    if test_movies is None:
        test_movies = TEST_MOVIES

    json_training_list = []
    json_test_list = []

    for _, row in customer_df.iterrows():
        movie_name = row['Name']
        movie_dict = {
            "Movie Name": movie_name,
            "metadata": {
                "Movie Release Date": row['Movie_Year'],
                "Average Movie rating": row['Average_Rating'],
                "Total Movie Reviews": row['Review_Count'],
            }
        }
        if movie_name in training_movies:
            # Only training movies expose the customer's own opinion to the model
            movie_dict['metadata']['Customer Rating'] = row['Rating']
            movie_dict['metadata']['Date Rated'] = row['Date']
            json_training_list.append(movie_dict)
        elif movie_name in test_movies:
            json_test_list.append(movie_dict)

    # Convert the lists of dictionaries to JSON format
    training_json = json.dumps(json_training_list, indent=4)
    test_json = json.dumps(json_test_list, indent=4)

    return training_json, test_json

In [None]:
def create_prompt(training_json, test_json):
    """
    Build the user prompt sent to the model.

    Parameters:
    training_json (str): JSON string describing the movies the customer has already rated.
    test_json (str): JSON string describing the candidate movies to choose from.

    Returns:
    str: The prompt text with both JSON strings embedded.
    """
    # Each entry is one line of the prompt; whitespace is kept exactly as written
    prompt_lines = [
        "Help me pick the best movie to watch. ",
        "    Here are some movies I liked with information about them. ",
        "    Pay attention to my rating of these movies in field, Customer Rating: ",
        f"    {training_json}",
        "",
        "    Based on my ratings, here are some movies I am considering watching next. Help me pick the best one:",
        f"    {test_json}",
        "",
        "    I don't want any commentary. Only output the recommended movie name in json format. 'Movie Name': 'Movie Title'",
        "    ",
        "    ",
    ]
    return "\n".join(prompt_lines)

In [None]:
def get_chatgpt_recommendation(prompt):
    """
    Send the prompt to the chat model named by MODEL and return its reply.

    Parameters:
    prompt (str): The user message to send.

    Returns:
    str: The content of the first returned choice, with surrounding whitespace stripped.
    """
    # NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 openai
    # interface - confirm against the installed package version.
    conversation = [
        {"role": "system", "content": "You are a movie recommendation assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model=MODEL,
        messages=conversation,
        max_tokens=150,
    )
    first_choice = completion.choices[0]
    return first_choice.message['content'].strip()

#### Main flow

In [None]:
def _extract_movie_name(llm_response):
    """Pull the bare movie title out of the model's reply (JSON, 'Movie Name': '...' pair, or plain text)."""
    text = llm_response.strip()

    # Preferred case: the reply is a valid JSON object
    try:
        parsed = json.loads(text)
    except ValueError:
        parsed = None
    if isinstance(parsed, dict) and 'Movie Name' in parsed:
        return str(parsed['Movie Name']).strip()

    # The prompt's example uses single quotes and no braces, which is not valid
    # JSON, so also accept a loosely formatted 'Movie Name': 'Title' pair.
    match = re.search(r"""['"]Movie Name['"]\s*:\s*(['"])(.*)\1""", text, re.DOTALL)
    if match:
        return match.group(2).strip()

    # Fall back to the raw text, minus any surrounding quotes
    return text.strip('\'"')


def evaluate_for_customers(df, customers):
    """
    Ask the model for one recommendation per customer and count how many were correct.

    A recommendation is correct when the recommended title is one of the
    customer's top-rated movies in the test set (see find_eligible_picks).

    Parameters:
    df (pd.DataFrame): Ratings table for all customers.
    customers (iterable): Customer ids to evaluate.

    Returns:
    tuple: (correct, incorrect) counts as integers.
    """
    # Two separate counters; unpacking a single int raises TypeError
    correct, incorrect = 0, 0

    for customer in customers:
        # Find the movies that a single customer has rated
        single_customer_df = get_customer_movies(df, customer)
        # Find the eligible picks for the customer. We will use these to evaluate LLM recommendations as correct
        eligible_picks = find_eligible_picks(single_customer_df)
        # Convert the customer data to JSON format
        training_json, test_json = compose_movie_prompt_jsons(single_customer_df)
        # Create the prompt
        prompt = create_prompt(training_json, test_json)
        print(prompt)

        # The model is asked to answer as 'Movie Name': '<title>', so the title has
        # to be extracted before it can be compared with the bare movie names.
        llm_pick = _extract_movie_name(get_chatgpt_recommendation(prompt))

        if llm_pick in eligible_picks:
            print(f"Correct recommendation: {llm_pick} for customer {customer}")
            correct += 1
        else:
            print(f"incorrect recommendation: {llm_pick} for customer {customer}")
            incorrect += 1
    return correct, incorrect
        

### Setup

In [5]:
# Load the ratings CSV named by MOVIE_DATA from the DATASET_PATH folder
df = pd.read_csv(f"{DATASET_PATH}/{MOVIE_DATA}")

In [31]:
df.head(3)

Unnamed: 0,Movie_Id,Cust_Id,Rating,Date,Movie_Year,Name,Average_Rating,Review_Count
0,571,1844276,5.0,2002-03-05,1999,American Beauty,3.962585,154832
1,571,2422606,1.0,2001-11-20,1999,American Beauty,3.962585,154832
2,571,1515501,3.0,2002-11-25,1999,American Beauty,3.962585,154832


## Manual Proof of Concept before we automate
We need to check API costs and settle the structure of the experiment before scaling up.

In [57]:
# Use the customer on the first row of the table as the proof-of-concept subject
test_customer = df['Cust_Id'].iloc[0]
print(test_customer)
customers = [test_customer]

1844276


In [None]:
# Run the evaluation for the customers chosen for the proof of concept
correct, incorrect = evaluate_for_customers(df, customers)

### Evaluate for all customers
The bugs in the single-customer evaluation need to be fixed before running this for every customer.

In [None]:

# customers = df['Cust_Id'].unique()
# correct, incorrect = evaluate_for_customers(df, customers)