In [3]:
# general
import numpy as np
import pandas as pd
import mysql.connector
import json

# ML
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")

# initalising dataset
import requests as req
import gzip
import concurrent.futures
from webpage.routes.tmdb_calls import doBatch
from multiprocessing import Manager
from io import BytesIO
import mysql.connector
from sqlalchemy import create_engine
from langdetect import detect


# ensure no duplicate cast members
def remove_duplicates(names):
    """Collapse repeated names in a comma-separated string.

    Args:
        names: A comma-separated string of names, or any other value
            (for example NaN for a missing cell).

    Returns:
        The unique, whitespace-stripped names joined with ", " in order of
        first appearance. Non-string input is returned unchanged.
    """
    if not isinstance(names, str):
        return names
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # result is deterministic (a plain set would reorder the names between
    # runs).
    unique_names = dict.fromkeys(name.strip() for name in names.split(','))
    return ', '.join(unique_names)

# langdetec check if film title is in english
def is_english(text):
    """Return True when ``detect`` classifies *text* as English ('en').

    Detection is best-effort: any error raised while detecting is treated as
    "not English" and yields False.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Deliberately broad so that one undetectable title cannot abort a
        # bulk filter, but unlike a bare ``except`` this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return False
    
#export film data to mysql
def save_mySQL(data):
    """Replace the ``all_films`` table in the ``users`` database with *data*.

    Args:
        data: DataFrame whose rows become the new table contents. The
            DataFrame index is not written.
    """
    # Table name in the database
    table_name = "all_films"

    # NOTE(review): the credentials are hard-coded in the connection URL;
    # consider loading them from configuration or the environment.
    engine = create_engine("mysql+mysqlconnector://root:Leicester69lol@localhost/users")
    try:
        # if_exists='replace' drops and recreates the table, so a separate
        # DELETE beforehand is unnecessary (and would fail on the very first
        # run, before the table exists).
        data.to_sql(table_name, con=engine, if_exists='replace', index=False)
    finally:
        # Release the engine's pooled connections even if the export fails.
        engine.dispose()

# download, process and filter IMDB non-commercial dataset, save to mysql db
def INITIALISE_FILM_DATASET():
    """Build the film dataset from the IMDb dumps and export it to MySQL and JSON.

    Steps, in order: download and decompress five IMDb TSV dumps, filter the
    titles, merge crew names and ratings into one table, drop rows with
    missing fields, keep only titles that ``is_english`` accepts, fetch plot
    and poster through ``doBatch``, then write the result with ``save_mySQL``
    and to ``webpage/films.json``.

    NOTE(review): the plot/poster fetch is guarded by
    ``if __name__ == '__main__'``. When that guard is false, ``plot`` and
    ``poster`` stay NaN and the ``dropna`` that follows removes every row.
    """

    print('Downloading tables...')

    # get film datasets ~ 10-30mins

    # set urls
    url_title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz' # film name, year, runtime, genres
    url_crew = 'https://datasets.imdbws.com/title.principals.tsv.gz' # actors, actresses, cinematographers, directors (redundant)
    url_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz' # ratings for films (not all)
    url_names = 'https://datasets.imdbws.com/name.basics.tsv.gz' # link table for names against nconst
    url_langs = 'https://datasets.imdbws.com/title.akas.tsv.gz' # per-title rows with 'titleId', 'region' and 'language' columns

    # download from url (whole response bodies are held in memory)
    res_title_basics = req.get(url_title_basics).content
    res_crew = req.get(url_crew).content
    res_ratings = req.get(url_ratings).content
    res_names = req.get(url_names).content
    res_lang = req.get(url_langs).content

    # decompress
    title_basics_gzip = gzip.decompress(res_title_basics)
    crew_basics_gzip = gzip.decompress(res_crew)
    title_ratings_gzip = gzip.decompress(res_ratings)
    names_gzip = gzip.decompress(res_names)
    title_langs_gzip = gzip.decompress(res_lang)

    # read the tab-separated files into dataframes
    titles = pd.read_csv(BytesIO(title_basics_gzip), delimiter='\t',low_memory=False)
    crew = pd.read_csv(BytesIO(crew_basics_gzip), delimiter='\t',low_memory=False)
    ratings = pd.read_csv(BytesIO(title_ratings_gzip), delimiter='\t',low_memory=False)
    names = pd.read_csv(BytesIO(names_gzip), delimiter='\t',low_memory=False)
    langs = pd.read_csv(BytesIO(title_langs_gzip), delimiter='\t',low_memory=False)


    print('Cleaning data...')

    # first data clean

    # collect the title ids that have an 'en' language row, and the title ids
    # that have a row in one of the listed regions
    desired_langs = ['en']
    filtered_langs = langs[langs['language'].isin(desired_langs)]
    tconsts_filtered_langs = filtered_langs['titleId'].tolist()
    desired_regions = ['CA', 'US', 'GB', 'IE', 'AU', 'NZ']
    filtered_regions = langs[langs['region'].isin(desired_regions)]
    tconsts_filtered_regions = filtered_regions['titleId'].tolist()

    # remove unsuitable titles
    titles = titles[titles['titleType'] == 'movie']
    titles = titles[titles['genres'] != r'\N']
    titles['isAdult'] = pd.to_numeric(titles['isAdult'], errors='coerce')
    titles = titles[titles['isAdult'] == 0 ]
    # startYear is compared as a string here; the '\\N' placeholder is excluded explicitly
    titles = titles[(titles['startYear'] >= '1955') & (titles['startYear'] != '\\N')]
    # a title must appear in BOTH the language list and the region list
    titles = titles[(titles['tconst'].isin(tconsts_filtered_langs) & (titles['tconst'].isin(tconsts_filtered_regions)))]

    # keep only the crew and ratings rows that belong to the remaining films
    film_tconsts = titles['tconst'].tolist()
    crew = crew[crew['tconst'].isin(film_tconsts)]
    ratings = ratings[ratings['tconst'].isin(film_tconsts)]

    # set columns to remove from dataset
    remove_from_titles = ['originalTitle', 'endYear', 'titleType', 'isAdult']
    remove_from_crew = ['ordering','job','characters']
    remove_from_ratings = ['numVotes']
    remove_from_names = ['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles']

    # remove unneeded columns
    titles = titles.drop(columns=remove_from_titles)
    crew = crew.drop(columns=remove_from_crew)
    ratings = ratings.drop(columns=remove_from_ratings)
    names = names.drop(columns=remove_from_names)

    print('Merging tables...')

    # merge relational tables

    crew_data = crew.copy()

    # merge crew data with names table to get respective names rather than nconst
    crew_data['nconst'] = crew_data['nconst'].str.split(', ')
    crew_data = crew_data.explode('nconst')
    crew_data = pd.merge(crew_data, names, on='nconst', how='left')
    # one row per film, one column per crew category, names joined with ', '
    crew_data = crew_data.pivot_table(
        index=['tconst'],
        columns=['category'],
        values=['primaryName'],
        aggfunc=lambda x: ', '.join(str(item) for item in x),
    ).reset_index()


    # flatten the two-level pivot column labels before merging
    crew_data.columns = ['_'.join(col).strip() for col in crew_data.columns.values]
    crew_data.columns = [col.replace('primaryName_', '') for col in crew_data.columns]
    crew_data = crew_data.rename(columns={'tconst_': 'tconst'})
    # NOTE(review): this selection raises KeyError if any listed category is absent from the pivot
    columns_to_keep = ['tconst', 'actor', 'actress', 'cinematographer', 'composer', 'director', 'editor', 'producer', 'writer']
    crew_data = crew_data[columns_to_keep]

    # merge film and cast datasets for one complete table
    film_data = pd.merge(titles, ratings, on='tconst', how='left')
    film_data = pd.merge(film_data, crew_data, on='tconst', how='left')

    print('Further cleaning data...')

    # second data clean, drop data sparse rows

    columns_check = ['director', 'cinematographer', 'editor', 'writer', 'composer', 'producer']
    film_data = film_data[film_data[columns_check].isna().sum(axis=1) == 0] # don't allow films with any missing data
    film_data = film_data.dropna(subset=['actor', 'actress', 'runtimeMinutes', 'averageRating', 'genres'])

    # double-check for '\N' placeholder values
    film_data = film_data[film_data['runtimeMinutes'] != '\\N']
    film_data = film_data[film_data['startYear'] != '\\N']
    film_data = film_data[film_data['averageRating'] != '\\N']

    # combine actor and actress into 1 column ~ 10 cast member (can reduce)
    film_data['cast'] = film_data['actor'] + ', ' + film_data['actress']
    film_data.drop(['actor', 'actress'], axis=1, inplace=True)
    film_data['cast'] = film_data['cast'].apply(remove_duplicates) # double-check for duplicate cast members from merging

    # language check on titles
    print('EDL testing...')

    # check titles are in English (filters out foreign-language films released in the listed regions, or mis-labelled rows)
    english_titles = film_data['primaryTitle'].apply(is_english)
    film_data = film_data[english_titles]

    # add empty columns for plot and poster path
    film_data['plot'] = np.nan
    film_data['poster'] = np.nan


    print('Films: ' + str(len(film_data)))


    print('Fetching plot summaries and posters...')

    # get film plot and poster with tmdb api ~ inconsistent runtime (<2Hrs)

    # call api/details for each film, one batch at a time
    # NOTE(review): this only runs when the module itself is executed as __main__
    if __name__ == '__main__':

        manager = Manager()
        shared_data = manager.Namespace() # allow data to be shared with external function
        agg_list = []

        batch_size = 1000
        sleep_time = 3  # NOTE(review): never used in this function

        # NOTE(review): when len(film_data) is an exact multiple of batch_size the last batch is empty
        num_batches = (len(film_data) // batch_size) + 1

        with concurrent.futures.ProcessPoolExecutor(8) as process_executor:

            for i in range(num_batches):

                start_index = i * batch_size
                end_index = (i + 1) * batch_size

                shared_data.film_data = film_data.iloc[start_index:end_index]

                future = process_executor.submit(doBatch, shared_data)

                # each batch is awaited before the next is submitted, so batches run one after another
                concurrent.futures.wait([future])

                # presumably doBatch writes the filled-in batch back to shared_data.film_data -- TODO confirm
                agg_list.append(shared_data.film_data)

                print(f"Batch {i+1}/{num_batches} completed")

        film_data = pd.concat(agg_list, ignore_index=True)

    # remove films with no plot or no poster
    film_data = film_data.dropna(subset=['plot', 'poster'])

    final_order = ['tconst','primaryTitle', 'plot', 'averageRating', 'genres', 'runtimeMinutes', 'startYear', 'cast', 'director', 'cinematographer', 'writer', 'producer', 'editor', 'composer', 'poster']
    film_data = film_data[final_order]

    print('Exporting to sql...')

    # shuffle row order
    film_data = film_data.sample(frac=1)

    # export film data to sql db
    save_mySQL(film_data)


    print('Exporting to json...')

    film_data.to_json('webpage/films.json' ,orient="records")

    print('Films saved to database!')

# load whole films dataset from db
def loadAllFilms():
    """Load every row of the ``all_films`` table into a DataFrame.

    Returns:
        DataFrame with one row per fetched record, using the column names
        reported by the cursor.
    """
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration or the environment.
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="Leicester69lol",
        database="users"
    )
    try:
        mycursor = mydb.cursor()
        try:
            mycursor.execute("SELECT * FROM all_films")
            columns = [col[0] for col in mycursor.description]
            films = mycursor.fetchall()
        finally:
            mycursor.close()
    finally:
        # Close the connection even if the query fails.
        mydb.close()

    return pd.DataFrame(films, columns=columns)

# join and concatenate inputted columns
def create_soup(x, features):
    """Build a space-separated text "soup" from the chosen fields of *x*.

    Fields whose value is None are skipped; every other value is converted
    with ``str`` before joining.
    """
    pieces = []
    for name in features:
        value = x[name]
        if value is None:
            continue
        pieces.append(str(value))
    return ' '.join(pieces)

# count number of likeable elements for film
def count_likeable(row):
    """Count the elements of a film that a user could like.

    Every listed attribute whose value is not None counts once, and every
    comma-separated entry of ``cast`` counts once.
    """
    features = ['primaryTitle', 'plot', 'averageRating', 'genres', 'runtimeMinutes', 'startYear', 'director', 'cinematographer', 'writer', 'producer', 'editor', 'composer']
    present = [value for value in row[features] if value is not None]
    cast_members = row['cast'].split(',')
    return len(present) + len(cast_members)

# get user loved films from db
def get_loved_films(user_id):
    """Return the rows of ``data`` for the films a user has marked as loved.

    Args:
        user_id: Value bound to the ``user_id`` parameter of the query.

    Returns:
        The subset of the module-level ``data`` frame whose ``tconst``
        appears in ``user_loved_films`` for this user.
    """
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="Leicester69lol",
        database="users"
    )
    try:
        mycursor = mydb.cursor()
        try:
            sql_query = "SELECT tconst FROM user_loved_films WHERE user_id = %s"
            mycursor.execute(sql_query, (user_id,))
            rows = mycursor.fetchall()
        finally:
            mycursor.close()
    finally:
        # Close the connection even if the query fails.
        mydb.close()

    tconst_list = [row[0] for row in rows]
    return data[data['tconst'].isin(tconst_list)]

# get user liked attributes from db
def get_liked_attributes(user_id):
    """Return one row per film holding only the attributes the user liked.

    Each fetched record is read positionally: index 1 is used as the film's
    ``tconst`` and the values from index 2 onwards are treated as flags, in
    the order of ``attributes_template[1:]`` (index 0 is presumably the
    user id -- verify against the table schema). A flag equal to 1 copies
    the film's value for that attribute from the module-level ``data``
    frame; any other flag stores None.

    Args:
        user_id: Value bound to the ``user_id`` parameter of the query.

    Returns:
        DataFrame with the columns of ``attributes_template``.

    Raises:
        IndexError: If a flagged film is not present in ``data``.
    """
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="Leicester69lol",
        database="users"
    )
    try:
        mycursor = mydb.cursor()
        try:
            sql_query = "SELECT * FROM user_liked_attributes WHERE user_id = %s"
            mycursor.execute(sql_query, (user_id,))
            attribute_fetch = mycursor.fetchall()
        finally:
            mycursor.close()
    finally:
        # Close the connection even if the query fails.
        mydb.close()

    attributes_template = ['tconst','primaryTitle', 'plot', 'averageRating', 'genres', 'runtimeMinutes', 'startYear', 'director', 'cinematographer', 'writer', 'producer', 'editor', 'composer']
    liked_attributes_df = pd.DataFrame(columns=attributes_template)

    for position, record in enumerate(attribute_fetch):
        tconst = record[1]
        liked_flags = record[2:]
        # Look the film up once per record rather than once per attribute.
        film_att = data[data['tconst'] == tconst]

        row_values = [tconst]
        for liked, attribute in zip(liked_flags, attributes_template[1:]):
            if liked == 1:
                row_values.append(film_att[attribute].values[0])
            else:
                row_values.append(None)
        liked_attributes_df.loc[position] = row_values

    return liked_attributes_df

# get user liked cast from db
def get_liked_cast(user_id):
    """Return the cast members a user has liked, one row per (film, name).

    Args:
        user_id: Value bound to the ``user_id`` parameter of the query.

    Returns:
        DataFrame with columns ``tconst`` and ``name``, built from every
        fetched record with its first field dropped.
    """
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="Leicester69lol",
        database="users"
    )
    try:
        mycursor = mydb.cursor()
        try:
            sql_query = "SELECT * FROM user_liked_cast WHERE user_id = %s"
            mycursor.execute(sql_query, (user_id,))
            cast_fetch = mycursor.fetchall()
        finally:
            mycursor.close()
    finally:
        # Close the connection even if the query fails.
        mydb.close()

    cast = [row[1:] for row in cast_fetch]
    return pd.DataFrame(cast, columns=['tconst', 'name'])

# get user watchlist from db
def get_watchlist(user_id):
    """Return the full film rows for every film on a user's watchlist.

    Index 1 of each fetched record is used as the film's ``tconst``; the
    matching row is copied from the module-level ``data`` frame.

    Args:
        user_id: Value bound to the ``user_id`` parameter of the query.

    Returns:
        DataFrame with the columns of ``attributes_template``, one row per
        watchlist entry in fetch order.

    Raises:
        IndexError: If a watchlist film is not present in ``data``.
    """
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="Leicester69lol",
        database="users"
    )
    try:
        mycursor = mydb.cursor()
        try:
            sql_query = "SELECT * FROM user_watchlist WHERE user_id = %s"
            mycursor.execute(sql_query, (user_id,))
            watchlist_fetch = mycursor.fetchall()
        finally:
            mycursor.close()
    finally:
        # Close the connection even if the query fails.
        mydb.close()

    tconst_list = [row[1] for row in watchlist_fetch]
    attributes_template = ['tconst','primaryTitle', 'plot', 'averageRating', 'genres', 'runtimeMinutes', 'startYear', 'cast', 'director', 'cinematographer', 'writer', 'producer', 'editor', 'composer', 'poster']
    watchlist_df = pd.DataFrame(columns=attributes_template)

    for position, tconst in enumerate(tconst_list):
        watchlist_df.loc[position] = data[data['tconst'] == tconst][attributes_template].values[0]

    return watchlist_df

# calculate % of liked attributes of film in profile
def calculate_percentage(row):
    """Return the fraction of a film's likeable elements that were liked.

    Reads ``total_likeable`` and ``num_liked_atts`` from *row* and divides
    the liked count by the total.
    """
    total, liked = row['total_likeable'], row['num_liked_atts']
    return liked / total

# count liked attributes for film
def count_liked(row):
    """Count the elements of a profile row that the user liked.

    Every listed attribute with a non-missing value counts once; a
    non-missing ``cast`` adds one per comma-separated entry.
    """
    features = ['primaryTitle', 'plot', 'averageRating', 'genres', 'runtimeMinutes', 'startYear', 'director', 'cinematographer', 'writer', 'producer', 'editor', 'composer']
    liked_attributes = len([value for value in row[features] if pd.notna(value)])

    cast_value = row['cast']
    liked_cast = 0 if pd.isna(cast_value) else len(cast_value.split(','))

    return liked_attributes + liked_cast

# CREATE USER PROFILE
def get_user_profile(user_id):
    """Build a user's taste profile from loved films and liked attributes.

    Loved films (full rows of ``data``) are stacked with the per-film liked
    attributes and liked cast. Each resulting row receives a ``likeage``
    value: the number of liked elements on that row divided by the film's
    ``total_likeable``.

    Args:
        user_id: Identifier passed through to the database lookups.

    Returns:
        Tuple ``(user_profile, lovedFilms)``.
    """
    # get user film preferences from db
    lovedFilms = get_loved_films(user_id)
    likedAtt = get_liked_attributes(user_id)
    likedCast = get_liked_cast(user_id)

    # merge liked cast (one comma-joined string per film) with liked attributes
    likedCast_grouped = likedCast.groupby('tconst')['name'].apply(lambda x: ', '.join(x.dropna())).reset_index()
    likedAttributes = pd.merge(likedAtt, likedCast_grouped, on='tconst', how='outer')
    likedAttributes.rename(columns={'name': 'cast'}, inplace=True)

    # format user_profile dataframe; total_likeable is re-read from data so
    # that liked-attribute rows get it too
    merged_love_liked = pd.concat([lovedFilms, likedAttributes], ignore_index=True)
    user_profile = pd.merge(merged_love_liked, data[['tconst', 'total_likeable']], on='tconst', how='left')
    user_profile = user_profile.drop(columns=['poster', 'total_likeable_x'])
    user_profile.rename(columns={'total_likeable_y': 'total_likeable'}, inplace=True)

    # Count liked attributes row by row. Assigning through a tconst mask
    # would give every row sharing a tconst (e.g. a film that is both loved
    # and has liked attributes) the count of the last such row.
    user_profile['num_liked_atts'] = [count_liked(film) for _, film in user_profile.iterrows()]

    # calculate likeage for each film in profile
    user_profile['likeage'] = [calculate_percentage(film) for _, film in user_profile.iterrows()]
    user_profile.drop(columns=['num_liked_atts', 'total_likeable'], inplace=True)

    return (user_profile, lovedFilms)

# get array of names from inputted dataframe
def extract_names(data):
    """Collect the distinct names found in every column of *data*.

    Only non-empty string cells are considered; cells spelling 'none',
    'null' or 'nan' (case-insensitive) are ignored. Each remaining cell is
    split on ", ". The returned list has no defined order.
    """
    placeholders = ['none', 'null', 'nan']
    found = set()
    for column in data.columns:
        for value in data[column]:
            if not value or not isinstance(value, str):
                continue
            if value.lower() in placeholders:
                continue
            found.update(value.split(", "))
    return list(found)

# split user profile into likes by group
def collate_liked_groups(user_profile):
    """Split a user profile into five per-group frames.

    Returns:
        List of independent copies in the order title/plot, cast, crew,
        genre, meta. Each copy also carries the ``likeage`` column.
    """
    # Group name -> profile columns belonging to that group.
    group_columns = {
        'liked_title_plot': ['primaryTitle', 'plot', 'likeage'],
        'liked_cast': ['cast', 'likeage'],
        'liked_crew': ['director', 'cinematographer', 'writer', 'producer', 'editor', 'composer', 'likeage'],
        'liked_genre': ['genres', 'likeage'],
        'liked_meta': ['averageRating', 'startYear', 'runtimeMinutes', 'likeage']
    }

    return [user_profile[columns].copy() for columns in group_columns.values()]

# cosine similarity vector with tf-idf between films in row and column
def create_similarity_vector(row, column):
    """Return the TF-IDF linear-kernel similarity between two sets of films.

    All columns of each frame are joined into one text per film with
    ``create_soup``. The vectorizer is fitted on *row* and then applied to
    *column*, so only vocabulary seen in *row* contributes.

    Returns:
        The ``linear_kernel`` of the *row* matrix against the *column*
        matrix.
    """
    def _soups(frame):
        # One text per record, built from every column of the frame.
        return frame.apply(lambda x: create_soup(x, frame.columns), axis=1).fillna('')

    tfidf = TfidfVectorizer(stop_words='english')
    row_matrix = tfidf.fit_transform(_soups(row))
    column_matrix = tfidf.transform(_soups(column))

    return linear_kernel(row_matrix, column_matrix)

# euclidean distance vector for numerical attributes of film
def create_euclidean_vector(row, column):
    """Return a distance-based similarity between numeric film attributes.

    Both frames are min-max scaled with a scaler fitted on *row*, and the
    Euclidean distance ``d`` between every pair is mapped to
    ``1 / (1 + d)``, so identical records score 1.

    Args:
        row: Numeric attributes of the candidate films.
        column: Numeric attributes of the profile films, same columns.

    Returns:
        The similarity matrix produced from
        ``euclidean_distances(row, column)``.
    """
    scaler = MinMaxScaler()
    # Fill gaps with 0 on both sides; filling with '' would put strings into
    # numeric columns and make the scaler fail.
    row = row.fillna(0)
    column = column.fillna(0)

    row_normalized = scaler.fit_transform(row)
    column_normalized = scaler.transform(column)
    distances = euclidean_distances(row_normalized, column_normalized)
    return 1 / (1 + distances)

# get top N films similar to user profile based on input vector (NOT CURRENTLY IN USE)
def get_similar_films(vector, exclude):
    """Rank the films in ``data`` by mean similarity (NOT CURRENTLY IN USE).

    Args:
        vector: Similarity matrix with one row per film of ``data``.
        exclude: Frame whose ``tconst`` values are removed from the result.

    Returns:
        Rows of ``data`` ordered from most to least similar, without the
        excluded films.
    """
    mean_similarity = np.mean(vector, axis=1)
    ranking = np.argsort(mean_similarity)[::-1]
    ranked_films = data.iloc[ranking]
    already_known = ranked_films['tconst'].isin(exclude['tconst'])
    return ranked_films[~already_known]

# function to calculate combined recommendations
def get_combined_recommendations(user_profile_groups, similarity_vectors, exclude_films):
    """Rank the films in ``data`` by likeage-weighted similarity to a profile.

    Args:
        user_profile_groups: Mapping of group name to a frame with a
            ``likeage`` column (one row per profile entry).
        similarity_vectors: Mapping of the same group names to a similarity
            matrix with one row per film of ``data`` and one column per
            profile entry.
        exclude_films: Frame whose ``tconst`` values are removed from the
            result.

    Returns:
        Copy of ``data`` ordered from most to least similar, with a
        ``similarity`` column holding the mean weighted score averaged over
        the groups, without the excluded films.
    """
    weighted_scores = {}

    # scale each group's similarity columns by the likeage of the matching profile entry
    for group, attributes in user_profile_groups.items():
        likeage_array = np.asarray(attributes['likeage'], dtype=float)
        weighted_scores[group] = similarity_vectors[group] * likeage_array

    # combine weighted similarity scores across all groups
    combined_scores = np.sum(list(weighted_scores.values()), axis=0)

    # one score per film: mean over the profile entries
    mean_similarity = np.mean(combined_scores, axis=1)

    # best match first
    sorted_indices = np.argsort(mean_similarity)[::-1]

    # explicit copy so that adding a column never touches (or warns about) ``data``
    sorted_films = data.iloc[sorted_indices].copy()

    # average over however many groups were supplied instead of assuming five
    sorted_films['similarity'] = mean_similarity[sorted_indices] / len(weighted_scores)

    return sorted_films[~sorted_films['tconst'].isin(exclude_films['tconst'])]

# return most recurring names in input data
def most_common_names(df, top_n=10):
    """Return the *top_n* most frequent names across every cell of *df*.

    Cells are split on commas and each name is stripped of surrounding
    whitespace, so "A, B" and "B, A" count the same two names. Empty
    entries and missing values are ignored.

    Args:
        df: Frame whose cells hold comma-separated names.
        top_n: Maximum number of names to return.

    Returns:
        List of names, most frequent first.
    """
    all_names = pd.Series(df.values.flatten())
    split_names = all_names.str.split(',').explode().str.strip()
    split_names = split_names[split_names != '']
    return split_names.value_counts().head(top_n).index.tolist()

# Load the film table once and derive the per-film helper columns.
data = loadAllFilms()
attributes = ['primaryTitle', 'plot', 'averageRating', 'genres', 'runtimeMinutes','cast' ,'startYear', 'director', 'cinematographer', 'writer', 'producer', 'editor', 'composer']
data['total_likeable'] = data.apply(count_likeable, axis=1)
data['soup'] = data.apply(create_soup, axis=1, args=(attributes,))



In [21]:
# build the profile for user '56'; the call returns (profile, loved films)
get_profile = get_user_profile('56')
user_profile, lovedFilms = get_profile

# split the profile into its five attribute groups
grouped_likes = collate_liked_groups(user_profile)
liked_plot, liked_cast, liked_crew, liked_genre, liked_meta = grouped_likes

# feature columns of each group, i.e. everything except 'likeage'
plot_features = [name for name in liked_plot.columns if name != 'likeage']
crew_features = [name for name in liked_crew.columns if name != 'likeage']
cast_features = [name for name in liked_cast.columns if name != 'likeage']
genre_features = [name for name in liked_genre.columns if name != 'likeage']
meta_features = [name for name in liked_meta.columns if name != 'likeage']

# similarity of every film in data to every profile entry, per group
plot_matrix = create_similarity_vector(data[plot_features], liked_plot[plot_features])
crew_matrix = create_similarity_vector(data[crew_features], liked_crew[crew_features])
cast_matrix = create_similarity_vector(data[cast_features], liked_cast[cast_features])
genre_matrix = create_similarity_vector(data[genre_features], liked_genre[genre_features])
meta_matrix = create_euclidean_vector(data[meta_features], liked_meta[meta_features])


# similarity matrices keyed by group
similarity_vectors = dict(
    plot=plot_matrix,
    cast=cast_matrix,
    crew=crew_matrix,
    genre=genre_matrix,
    meta=meta_matrix,
)

# profile frames keyed by the same group names
user_profile_groups = dict(
    plot=liked_plot,
    cast=liked_cast,
    crew=liked_crew,
    genre=liked_genre,
    meta=liked_meta,
)


get_combined_recommendations(user_profile_groups, similarity_vectors, user_profile)

Unnamed: 0,tconst,primaryTitle,plot,averageRating,genres,runtimeMinutes,startYear,cast,director,cinematographer,writer,producer,editor,composer,poster,total_likeable,soup,similarity
10874,tt1528100,Exodus: Gods and Kings,The defiant leader Moses rises up against the ...,6.0,"Action,Adventure,Drama",150,2014,"Ben Mendelsohn, Sigourney Weaver, Isaac Andrew...",Ridley Scott,Dariusz Wolski,"Adam Cooper, Bill Collage, Jeffrey Caine, Stev...","Peter Chernin, Mark Huffam, Michael Schaefer, ...",Billy Rich,Alberto Iglesias,/uaDj37JtvLan9tihxZ18e6qL33b.jpg,22,Exodus: Gods and Kings The defiant leader Mose...,0.259233
3311,tt1894476,How I Live Now,"An American girl, sent to the English countrys...",6.4,"Action,Adventure,Drama",101,2013,"Tom Holland, Darren Morfitt, Corey Johnson, An...",Kevin Macdonald,Franz Lustig,"Meg Rosoff, Jeremy Brock, Penelope Skinner, To...","John Battsek, Alasdair Flind, Andrew Ruhemann,...",Jinx Godfrey,Jon Hopkins,/4FyDHVuuiy6XObYLyXxFUb4oX8J.jpg,22,"How I Live Now An American girl, sent to the E...",0.257303
7951,tt1852040,Myn Bala: Warriors of the Steppe,A universal story about the freedom of the hum...,6.5,"Action,Adventure,Drama",132,2012,"Ayan Utepbergen, Aidos Akmyzayev, Asylkhan Tol...",Akan Satayev,Khasan Kydyraliyev,"Muhammed Mamyrbekov, Jayik Sizdikov, Timur Zha...","Anna Kachko, Eskendir Nurbergen, Aliya Uvalzha...","Christopher Robin Bell, Nicolas Trembasiewicz",Renat Gaisin,/pSANogAuboSAxmcB0kEqfrl9FcD.jpg,22,Myn Bala: Warriors of the Steppe A universal s...,0.255218
11742,tt1972591,King Arthur: Legend of the Sword,"When the child Arthur’s father is murdered, Vo...",6.7,"Action,Adventure,Drama",126,2017,"Craig McGinlay, Tom Wu, Freddie Fox, Charlie H...",Guy Ritchie,John Mathieson,"Joby Harold, Guy Ritchie, Lionel Wigram, David...","Steve Clark-Hall, Akiva Goldsman, Joby Harold,...",James Herbert,Daniel Pemberton,/9kKXH6eJpzoFGhCbTN3FVwSQK3n.jpg,22,King Arthur: Legend of the Sword When the chil...,0.255191
3665,tt5317914,The Icebreaker,The story is based on the real events of 1985....,6.1,"Action,Adventure,Drama",124,2016,"Pyotr Fyodorov, Aleksandr Yatsenko, Sergey Pus...",Nikolay Khomeriki,Fedor Lyass,Aleksey Onishchenko,"Sergey Kozlov, Vasiliy Solovev",Ivan Lebedev,Tuomas Kantelinen,/zvVjhgkuGXivxdyYK8nTAfpROLF.jpg,22,The Icebreaker The story is based on the real ...,0.254847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21178,tt0048535,Rage at Dawn,"In this film's version of the story, four of t...",5.9,Western,87,1955,"Denver Pyle, Forrest Tucker, William Forrest, ...",Tim Whelan,Ray Rennahan,"Horace McCoy, Frank Gruber",Nat Holt,Harry Marker,Paul Sawtell,/oL49nHaB2SnhCDUHLcqqSBzcoXi.jpg,22,Rage at Dawn In this film's version of the sto...,0.090170
27949,tt0048288,Last of the Desperados,"After killing Billy the Kid, Sheriff Pat Garre...",6.4,Western,72,1955,"Donna Martell, Barton MacLane, Myrna Dell, Bob...",Sam Newfield,Edward Linden,Orville H. Hampton,Sigmund Neufeld,Holbrook N. Todd,Paul Dunlap,/wIS5w9Irkw6VvIpuzuTcH6wj03M.jpg,22,Last of the Desperados After killing Billy the...,0.090011
5676,tt0048789,The Violent Men,A former Union Army officer plans to sell out ...,6.9,Western,96,1955,"Lita Milan, May Wynn, Richard Jaeckel, Brian K...",Rudolph Maté,"W. Howard Greene, Burnett Guffey","Harry Kleiner, Donald Hamilton",Lewis J. Rachmil,Jerome Thoms,Max Steiner,/v56QMbVu29FCPffQGkkw31Gxuzj.jpg,22,The Violent Men A former Union Army officer pl...,0.089849
28917,tt0048665,Strange Lady in Town,"Julia Garth, a female doctor, plans to introdu...",6.1,Western,112,1955,"Cameron Mitchell, Joan Camden, Walter Hampden,...",Mervyn LeRoy,Harold Rosson,Frank Butler,Mervyn LeRoy,Folmar Blangsted,Dimitri Tiomkin,/2RcBhfPRgFgzNic4UkE71lE8EGA.jpg,22,"Strange Lady in Town Julia Garth, a female doc...",0.089837


In [76]:
#create user feedback

def get_interaction_data():
    """Load every row of ``user_recommended_interaction`` for all users.

    Returns:
        DataFrame with columns ``user_id``, ``tconst``, ``position`` and
        ``similarity``.
    """
    # MySQL connection configuration
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="Leicester69lol",
        database="users"
    )
    try:
        mycursor = mydb.cursor()
        try:
            # No user filter: interactions of every user are fetched.
            mycursor.execute("SELECT * FROM user_recommended_interaction")
            interaction_data = mycursor.fetchall()
        finally:
            mycursor.close()
    finally:
        # Close the connection even if the query fails.
        mydb.close()

    return pd.DataFrame(interaction_data, columns=["user_id", "tconst", "position", "similarity"])

def create_user_feedback():
    """Attach a ``likeage`` value to every recorded recommendation interaction.

    For each user the interactions are left-joined with that user's profile
    on ``tconst``. Films on the user's watchlist then get a likeage of 0.5,
    and any value still missing is filled with 0.

    Returns:
        DataFrame with the interaction columns plus ``likeage``. It is empty
        when no interactions are recorded.
    """
    interacted_films = get_interaction_data()
    all_user_feedback = []

    for user_id in interacted_films['user_id'].unique():
        user_id = int(user_id)

        watchlist = get_watchlist(user_id)
        user_profile = get_user_profile(user_id)[0]

        user_feedback_temp = pd.merge(
            interacted_films[interacted_films['user_id'] == user_id],
            user_profile[['tconst', 'likeage']],
            on='tconst',
            how='left',
        )

        # Set likeage to 0.5 for films in the watchlist
        on_watchlist = user_feedback_temp['tconst'].isin(watchlist['tconst'])
        user_feedback_temp.loc[on_watchlist, 'likeage'] = 0.5

        all_user_feedback.append(user_feedback_temp)

    if not all_user_feedback:
        # pd.concat raises on an empty list; return an empty frame with the
        # expected columns instead.
        return interacted_films.assign(likeage=pd.Series(dtype=float))

    user_feedback = pd.concat(all_user_feedback, ignore_index=True)
    user_feedback.fillna(0, inplace=True)

    return user_feedback

def sigmoid(x, scale=1, offset=0):
    """Logistic curve with an upper bound of 10, centred on *offset*.

    *scale* controls the steepness; ``sigmoid(offset)`` is 5.
    """
    exponent = -scale * (x - offset)
    return 10 / (1 + np.exp(exponent))

def calculate_interaction_score(position, similarity, likeage, similarity_weight=0.3, likeage_weight=0.4, position_weight=0.3):
    """Combine position, similarity and likeage into one interaction score.

    The score is a weighted sum of three terms: ``similarity / 100``,
    ``likeage * 10`` and ``sigmoid(position)``.
    """
    weighted_terms = (
        similarity_weight * (similarity / 100.0),
        likeage_weight * (likeage * 10),
        position_weight * sigmoid(position),
    )
    return sum(weighted_terms)



# score every recorded interaction, then keep only user 57 for inspection
user_feedback = create_user_feedback()
user_feedback['score'] = user_feedback.apply(
    lambda feedback_row: calculate_interaction_score(
        feedback_row['position'],
        feedback_row['similarity'],
        feedback_row['likeage'],
    ),
    axis=1,
)
user_feedback = user_feedback.loc[user_feedback['user_id'] == 57]

user_feedback

Unnamed: 0,user_id,tconst,position,similarity,likeage,score
8,57,tt0081528,2,0,0.0,2.642391
9,57,tt0371746,0,20,1.0,5.56
10,57,tt0948470,0,20,1.0,5.56
11,57,tt1408101,0,19,0.590909,3.920636
12,57,tt2488496,8,18,1.0,7.052994
13,57,tt3748528,4,19,1.0,7.003041
14,57,tt3778644,4,20,1.0,7.006041
15,57,tt4154664,1,20,1.0,6.253176


In [46]:
import gym
from gym import spaces

# split the profile into its five attribute groups
grouped_likes = collate_liked_groups(user_profile)
liked_plot, liked_cast, liked_crew, liked_genre, liked_meta = grouped_likes

# profile frames keyed by group
user_profile_groups = dict(
    plot=liked_plot,
    cast=liked_cast,
    crew=liked_crew,
    genre=liked_genre,
    meta=liked_meta,
)

# feature columns of each group, i.e. everything except 'likeage'
plot_features = [name for name in liked_plot.columns if name != 'likeage']
crew_features = [name for name in liked_crew.columns if name != 'likeage']
cast_features = [name for name in liked_cast.columns if name != 'likeage']
genre_features = [name for name in liked_genre.columns if name != 'likeage']
meta_features = [name for name in liked_meta.columns if name != 'likeage']

# similarity of every film in data to every profile entry, per group
plot_matrix = create_similarity_vector(data[plot_features], liked_plot[plot_features])
crew_matrix = create_similarity_vector(data[crew_features], liked_crew[crew_features])
cast_matrix = create_similarity_vector(data[cast_features], liked_cast[cast_features])
genre_matrix = create_similarity_vector(data[genre_features], liked_genre[genre_features])
meta_matrix = create_euclidean_vector(data[meta_features], liked_meta[meta_features])

# similarity matrices keyed by the same group names
similarity_vectors = dict(
    plot=plot_matrix,
    cast=cast_matrix,
    crew=crew_matrix,
    genre=genre_matrix,
    meta=meta_matrix,
)

# size of the action space: films in data minus the rows of the profile
n_films = len(data) - len(user_profile)

def calculate_reward(action, state):
    """Return the feedback score of the film selected by *action*.

    Args:
        action: Integer position of the chosen film within *state*.
        state: Either a recommendations DataFrame with a ``tconst`` column
            (as produced by ``RecommendationEnv.reset``) or any other
            container in which ``state[action]`` is the film id.

    Returns:
        The mean ``score`` recorded for that film in the module-level
        ``user_feedback`` frame as a float, or 0.0 when the film has no
        feedback.
    """
    if isinstance(state, pd.DataFrame):
        # Plain ``state[action]`` on a DataFrame is a column lookup, so pick
        # the row by position instead.
        film_id = state.iloc[action]['tconst']
    else:
        film_id = state[action]

    film_scores = user_feedback.loc[user_feedback['tconst'] == film_id, 'score']
    if film_scores.empty:
        return 0.0
    # A reward must be a scalar, not a Series.
    return float(film_scores.mean())

class RecommendationEnv(gym.Env):
    """Gym environment in which each action picks one film to recommend.

    The action space has ``n_films`` discrete actions and the state is the
    recommendations frame returned by ``get_combined_recommendations``.

    NOTE(review): ``step`` calls ``update_user_state`` and
    ``check_terminal_condition``, which are not defined in the visible
    source, so ``step`` raises NameError unless they exist elsewhere.
    NOTE(review): ``observation_space`` is a Box sized by the number of
    ``user_feedback`` columns, while ``reset`` returns a DataFrame --
    confirm the intended observation format.
    """

    def __init__(self):
        super().__init__()
        self.action_space = spaces.Discrete(n_films)
        self.observation_space = spaces.Box(low=0, high=1, shape=(user_feedback.shape[1],), dtype=np.float32)
        self.state = None

    def reset(self):
        """Recompute the recommendations and return them as the initial state."""
        self.state = get_combined_recommendations(user_profile_groups, similarity_vectors, user_profile)
        return self.state

    def step(self, action):
        """Apply *action* and return ``(state, reward, done, info)``."""
        reward = calculate_reward(action, self.state)
        self.state = update_user_state(action, self.state)
        done = check_terminal_condition()
        return self.state, reward, done, {}
    
  

TypeError: 'ellipsis' object cannot be interpreted as an integer

In [45]:
# Calculate Hit Rate at top N recommendations
def calculate_hit_rate(df, n):
    """Return the fraction of rows whose ``position`` is at most *n*.

    Args:
        df: Frame with a ``position`` column.
        n: Inclusive cut-off position.

    Returns:
        A float between 0 and 1; 0.0 when *df* has no rows.
    """
    total = len(df)
    if total == 0:
        # Nothing was interacted with, so there are no hits to report.
        return 0.0
    return len(df[df['position'] <= n]) / total

# Report the hit rate for the top 10, 20, ..., 100 recommendations
for n in range(10, 101, 10):
    # Calculate Hit Rate at top N recommendations
    hit_rate = calculate_hit_rate(user_feedback, n)
    print("Hit Rate at Top", n , "Recommendations:", hit_rate*100, "%")



Hit Rate at Top 10 Recommendations: 72.72727272727273 %
Hit Rate at Top 20 Recommendations: 90.9090909090909 %
Hit Rate at Top 30 Recommendations: 100.0 %
Hit Rate at Top 40 Recommendations: 100.0 %
Hit Rate at Top 50 Recommendations: 100.0 %
Hit Rate at Top 60 Recommendations: 100.0 %
Hit Rate at Top 70 Recommendations: 100.0 %
Hit Rate at Top 80 Recommendations: 100.0 %
Hit Rate at Top 90 Recommendations: 100.0 %
Hit Rate at Top 100 Recommendations: 100.0 %
