In [131]:
from openai import OpenAI
import pandas as pd
import json
import time

In [132]:
# Load api Key
def get_credentials() -> dict:
    # loads openai credentials from local config.json file
    with open('config.json', 'r') as f:
        config = json.load(f)
    return config

In [133]:
config = get_credentials()

In [134]:

client = OpenAI(
  api_key=config['open_api_key'],  # this is also the default, it can be omitted
)

In [135]:
# Constants
MODEL = "gpt-3.5-turbo"  # or "gpt-3.5-turbo"
# MODEL = "gpt-4-turbo"  # or "gpt-3.5-turbo"
MOVIE_DATA = "df_top_20_movies_customers_reviewed_all.csv"
DATASET_PATH = "./data"

TRAINING_MOVIES = ['American Beauty',
'The Wedding Planner',
'Man on Fire',
'S.W.A.T.',
'Pirates of the Caribbean: The Curse of the Black Pearl',
'50 First Dates',
'What Women Want',
'The Bourne Supremacy',
'Lord of the Rings: The Fellowship of the Ring',
'Braveheart']

TEST_MOVIES = ['The Silence of the Lambs',
'Ghost',
'Men in Black II',
'The Last Samurai',
'Bruce Almighty',
'Shrek 2',
'Finding Nemo (Widescreen)',
'Patch Adams',
'The Sixth Sense',
'The Italian Job']


## Helper Functions

In [136]:
def get_customer_movies(df, customer):
    # Just return the 20 movies that the single customer has reviewed
    return df[df['Cust_Id'] == customer]

In [137]:
def find_eligible_picks(customer_df):
    """
    Our Movie ratings aren't very distinct. The customer is rating from 1-5. 
    Therefore, we will often have multiple "correct" answers in the test set of 10 movies. 

    Lets say the customer has rated 3 movies 5 stars.
    If we recommend any of those 3 as our top recommendation system, we will have a correct answer.
    
    Parameters:
    customer_df (pd.DataFrame): A DataFrame containing customer movie data. The DataFrame should include the following columns:
                                - 'Name': The name of the movie.
                                - 'Rating': The rating given by the customer.

    Returns:
    np.ndarray: An array of movie names that have the highest rating in the test set.
    """
    test_movies_df = customer_df[customer_df['Name'].isin(TEST_MOVIES)]
    test_movies_df.sort_values(by='Rating', ascending=False, inplace=True)

    # First, select the highest movie rating from the test set
    movie_to_recommend = test_movies_df.iloc[0]

    # next, select other correct answers that had the same rating. as there could be multiple "favourites"
    eligible_picks = test_movies_df[test_movies_df['Rating'] == movie_to_recommend['Rating']]['Name'].values
    return eligible_picks

In [150]:
def compose_movie_prompt_jsons(customer_df: str) -> str:
    """
    Processes a DataFrame containing customer movie data, categorizes the movies into training and test sets based on predefined criteria, 
    and converts these sets into JSON formatted strings.

    Parameters:
    customer_df (str): A string representing the DataFrame containing customer movie data. 
                       The DataFrame should include the following columns:
                       - 'Name': The name of the movie.
                       - 'Movie_Year': The release year of the movie.
                       - 'Average_Rating': The average rating of the movie.
                       - 'Review_Count': The total number of reviews for the movie.
                       - 'Rating': The rating given by the customer.
                       - 'Date': The date the rating was given.

    Returns:
    tuple: A tuple containing:
        - training_json (str): A JSON formatted string containing the training movies data.
        - test_json (str): A JSON formatted string containing the test movies data.
    """
    json_training_list = []
    json_test_list = []

    for _, row in customer_df.iterrows():
        movie_dict = {
            "Movie Name": row['Name'],
            "Release Date": row['Movie_Year'],
            "Average rating": row['Average_Rating'],
            "Total Reviews": row['Review_Count'],
        }
        if row['Name'] in TRAINING_MOVIES:
            movie_dict['Customer Rating'] = row['Rating']
            movie_dict['Date Reviewed'] = row['Date']
            json_training_list.append(movie_dict)
        else:
            json_test_list.append(movie_dict)

    # Convert the list of dictionaries to JSON format
    training_json = json.dumps(json_training_list, indent=4)

    # Convert the list of dictionaries to JSON format
    test_json = json.dumps(json_test_list, indent=4)

    return training_json, test_json

In [154]:
def create_prompt(training_json, test_json):
    prompt = f"""Here are some movies I liked with information about them. 
    Pay attention to the date reviewed, as that combined with its rating indicates what I currently like:
    {training_json}

    Based on my ratings, recommend the movie I will like the most to watch next. Only output the recommended movie name in JSON format with key Movie Name. For example 'Movie Name': 'Movie Title'.
    {test_json}
    """
    return prompt 

In [140]:
def get_chatgpt_recommendation(prompt):
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are a movie recommendation assistant."},
            {"role": "user", "content": prompt}
        ],
        response_format= { "type": "json_object" },
        max_tokens=100,
    )

    try:
        response_json = json.loads(completion.model_dump_json(indent=2))
        content_json = json.loads(response_json['choices'][0]['message']['content'])
        print(content_json)
        if 'Movie Name' in content_json:
            return content_json['Movie Name']
        else:
            return None
    except Exception as e:
        return None


#### Main flow

In [148]:
def evaluate_for_customers(df, customers):
    correct, incorrect = 0,0
    failed_customers = []
    for i, customer in enumerate(customers):
        # Find the movies that a single customer has rated
        single_customer_df = get_customer_movies(df, customer)
        # Find the eligible picks for the customer. We will use these to evaluate LLM recommendations as correct
        eligible_picks = find_eligible_picks(single_customer_df)
        # Convert the customer data to JSON format
        training_json, test_json = compose_movie_prompt_jsons(single_customer_df)
        # Create the prompt
        prompt = create_prompt(training_json, test_json)

        llm_pick = get_chatgpt_recommendation(prompt)
        print(f"{i} LLM recommendation: {llm_pick}")
        if llm_pick in eligible_picks:
            print(f"Correct recommendation: {llm_pick} for customer {customer}")
            correct += 1
        else:
            if llm_pick is None:
                print(f"FAILED due to API issues for customer {customer}")
                failed_customers.append(customer)
            else:
                print(f"incorrect recommendation: {llm_pick} for customer {customer}, expected {eligible_picks}")
                incorrect += 1
        # time.sleep(0.1)
    return correct, incorrect, failed_customers
        

### Setup

In [142]:
df = pd.read_csv(f"{DATASET_PATH}/{MOVIE_DATA}")

In [143]:
df.head(3)

Unnamed: 0,Movie_Id,Cust_Id,Rating,Date,Movie_Year,Name,Average_Rating,Review_Count
0,571,1844276,5.0,2002-03-05,1999,American Beauty,3.962585,154832
1,571,2422606,1.0,2001-11-20,1999,American Beauty,3.962585,154832
2,571,1515501,3.0,2002-11-25,1999,American Beauty,3.962585,154832


In [144]:
# cleanup average rating column to 2 decimal places
df['Average_Rating'] = df['Average_Rating'].apply(lambda x: round(x, 2))

# convert rating to int
df['Rating'] = df['Rating'].astype(int)

# Model Evaluation

### Evaluate for 100 customers
#### with model gpt-3.5-turbo

In [162]:
MODEL = "gpt-3.5-turbo"
test_customer = df['Cust_Id'].unique()[0:100].tolist()
print(len(test_customer))
customers = test_customer

100


In [163]:
print(customers)

[330424, 493190, 1341247, 1148217, 1103030, 876510, 524619, 316492, 1968862, 632333, 2355625, 1743756, 36104, 136058, 1001912, 2290578, 327006, 146194, 751829, 1474958, 2453264, 21597, 975592, 1077173, 521562, 2556923, 1821560, 1359953, 1558286, 362788, 2042548, 1792238, 1505732, 471499, 2187083, 110938, 1040475, 2519096, 2061245, 1351064, 1102887, 1098510, 884955, 2015233, 51451, 813213, 2275159, 115937, 2256731, 1569224, 2444094, 320362, 2049525, 840763, 860566, 1633340, 168263, 787331, 220010, 2497800, 2190053, 2301810, 704069, 2055340, 138426, 505171, 519891, 2514777, 796157, 2624159, 1636738, 1303348, 399713, 561364, 988454, 125219, 141603, 965050, 2086206, 1005032, 1257537, 968184, 2050529, 920168, 1825647, 1178846, 1851429, 1941478, 1950890, 2496629, 477659, 279966, 881356, 2158880, 150732, 1565552, 2166500, 1006868, 1908699, 1358164]


In [166]:
correct, incorrect, failed_customers = evaluate_for_customers(df, customers)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_movies_df.sort_values(by='Rating', ascending=False, inplace=True)


{'Movie Name': 'The Silence of the Lambs'}
0 LLM recommendation: The Silence of the Lambs
Correct recommendation: The Silence of the Lambs for customer 330424
{'Movie Name': 'The Silence of the Lambs'}
1 LLM recommendation: The Silence of the Lambs
Correct recommendation: The Silence of the Lambs for customer 493190
{'Movie Name': 'The Sixth Sense'}
2 LLM recommendation: The Sixth Sense
incorrect recommendation: The Sixth Sense for customer 1341247, expected ['The Silence of the Lambs' 'Finding Nemo (Widescreen)' 'The Italian Job']
{'Movie Name': 'The Silence of the Lambs'}
3 LLM recommendation: The Silence of the Lambs
incorrect recommendation: The Silence of the Lambs for customer 1148217, expected ['Ghost' 'The Last Samurai' 'Shrek 2' 'Finding Nemo (Widescreen)'
 'Patch Adams' 'The Sixth Sense']
{'Movie Name': 'The Sixth Sense'}
4 LLM recommendation: The Sixth Sense
incorrect recommendation: The Sixth Sense for customer 1103030, expected ['The Silence of the Lambs' 'Shrek 2' 'Findin

In [167]:
print(f"Correct recommendations: {correct}")
print(f"Incorrect recommendations: {incorrect}")
print(f"Accuracy rate: {correct/(len(customers) - len(failed_customers)):.2f}")
print(f"failed customers due to API issues: {failed_customers}")

Correct recommendations: 68
Incorrect recommendations: 32
Accuracy rate: 0.68
failed customers due to API issues: []


### Evaluate for 100 customers
#### with model gpt-4-turbo

In [None]:
MODEL = "gpt-4-turbo"
test_customer = df['Cust_Id'].unique()[0:100].tolist()
print(len(test_customer))
customers = test_customer

In [None]:
correct, incorrect, failed_customers = evaluate_for_customers(df, customers)

In [92]:
print(f"Correct recommendations: {correct}")
print(f"Incorrect recommendations: {incorrect}")
print(f"Accuracy rate: {correct/(len(customers) - len(failed_customers)):.2f}")
print(f"failed customers due to API issues: {failed_customers}")

Correct recommendations: 56
Incorrect recommendations: 43
 accuracy rate: 0.57
failed customers due to API issues: [845918]


### Evaluate for 1k customers
#### with model gpt-3.5-turbo

In [125]:
test_customer = df['Cust_Id'].unique()[100:1100].tolist()
print(len(test_customer))
customers = test_customer

1000


In [126]:
print(customers)

[2382575, 1164917, 1310250, 425507, 1962061, 2012897, 1118222, 44734, 1265877, 1603214, 769887, 1417435, 845136, 1986819, 273630, 2152097, 1365904, 360830, 164532, 2192892, 279482, 2645579, 2060373, 341961, 431806, 918897, 1820275, 381326, 1989046, 1481142, 2299825, 733453, 1094756, 2285713, 1565948, 2438169, 1910699, 250927, 575786, 2230092, 1998055, 563381, 2144806, 1672251, 1838730, 267534, 754349, 1771516, 2295117, 1094786, 2525544, 1705134, 201303, 1814123, 707630, 2631649, 2123669, 1623526, 2320715, 2025949, 491329, 2577867, 2084043, 2402990, 2125035, 348646, 676278, 2264352, 1245640, 1495252, 1146375, 917952, 2383405, 1771587, 1229269, 1707469, 1732550, 1713873, 275530, 1714541, 1641247, 1362880, 995190, 94912, 2268573, 1978844, 1037313, 800136, 2622904, 756735, 2035994, 2087711, 603908, 326121, 2602480, 1488926, 616126, 2440392, 303948, 409239, 2600558, 2090015, 1978934, 204318, 369451, 743752, 20040, 1286998, 455962, 1492122, 1193801, 1206843, 204011, 2594837, 1963313, 125296,

In [127]:
correct, incorrect, failed_customers = evaluate_for_customers(df, customers)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_movies_df.sort_values(by='Rating', ascending=False, inplace=True)


{'Movie Name': 'The Silence of the Lambs'}
0 LLM recommendation: The Silence of the Lambs
Correct recommendation: The Silence of the Lambs for customer 2382575
{'Movie Name': 'The Silence of the Lambs'}
1 LLM recommendation: The Silence of the Lambs
Correct recommendation: The Silence of the Lambs for customer 1164917
{'Movie Name': 'The Sixth Sense'}
2 LLM recommendation: The Sixth Sense
incorrect recommendation: The Sixth Sense for customer 1310250, expected ['The Last Samurai']
{'Movie Name': 'Men in Black II'}
3 LLM recommendation: Men in Black II
incorrect recommendation: Men in Black II for customer 425507, expected ['The Silence of the Lambs' 'Ghost' 'Patch Adams' 'The Sixth Sense']
{'Movie Name': 'The Sixth Sense'}
4 LLM recommendation: The Sixth Sense
incorrect recommendation: The Sixth Sense for customer 1962061, expected ['The Last Samurai']
{'Movie Name': 'The Silence of the Lambs'}
5 LLM recommendation: The Silence of the Lambs
Correct recommendation: The Silence of the La

In [128]:
print(f"Correct recommendations: {correct}")
print(f"Incorrect recommendations: {incorrect}")
print(f"Accuracy rate: {correct/(len(customers) - len(failed_customers)):.2f}")
print(f"failed customers due to API issues: {failed_customers}")

Correct recommendations: 597
Incorrect recommendations: 403
Accuracy rate: 0.60
failed customers due to API issues: []
