In [1]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
import matplotlib.pyplot as plt
import seaborn as sns ; sns.set()


In [2]:
load_dotenv('BDD_URL.env')
BDD_URL = os.environ['BDD_URL']
engine = create_engine(BDD_URL)

SQL_director_crewnames = """
SET search_path to principal;
SELECT "primaryTitle", "averageRating", "titleType", "startYear", "runtimeMinutes", "genres", "isAdult", "directors", "writers" , array_agg(name_basics."primaryName") AS director_names
        
        from title_basics 
        
        join title_ratings on title_basics."tconst" = title_ratings."tconst"
        
        join title_crew on title_basics."tconst" = title_crew."tconst"
        
        join name_basics on name_basics.nconst = ANY(string_to_array(title_crew."directors", ','))
        
        GROUP BY "primaryTitle", "averageRating", "titleType", "startYear", "runtimeMinutes", "genres", "isAdult", "directors", "writers"
        
        limit 2;
"""

SQL_SIMPLE = """
SET search_path to principal;
SELECT "primaryTitle", 
        "averageRating", 
        "titleType", 
        "startYear", 
        "runtimeMinutes", 
        "genres", 
        "isAdult", 
        "directors", 
        "writers"
        
        from title_basics 
        
        join title_ratings on title_basics."tconst" = title_ratings."tconst"
        
        join title_crew on title_basics."tconst" = title_crew."tconst"
        
        limit 10;
"""

SQL_filmview= """
SELECT tb.tconst,
    tb."primaryTitle",
    tb."titleType",
    tb."isAdult",
    tb."startYear",
    tb."endYear",
    tb."runtimeMinutes",
    tb.genres,
    rt."averageRating",
    rt."numVotes",
    array_agg((tp.category || '_'::text) || replace(nb."primaryName", ' '::text, '_'::text)) AS "Cate&names"
   FROM principal.title_basics tb
     JOIN principal.title_ratings rt ON tb.tconst::text = rt.tconst::text
     JOIN principal.title_principals tp ON tb.tconst::text = tp.tconst::text
     JOIN principal.name_basics nb ON tp.nconst::text = nb.nconst::text
  GROUP BY tb.tconst, rt."averageRating", rt."numVotes";

"""

SQL= """
SET search_path to principal;
SELECT *
from "filmview"
limit 10000;
"""
df = pd.read_sql(SQL, engine)
engine.dispose()
df

Unnamed: 0,tconst,primaryTitle,titleType,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,Cate&names
0,tt1462767,Terminated,short,0,2011.0,,15.0,"Drama,Mystery,Short",6.1,21,"[actress_Britt_Loren, actor_Darrel_Cherney, ac..."
1,tt1462768,The Drifter,movie,0,2009.0,,60.0,"Drama,Sport",7.1,318,"[director_Taylor_Steele, producer_Justine_Chia..."
2,tt1462769,The Odd Life of Timothy Green,movie,0,2012.0,,105.0,"Comedy,Drama,Family",6.6,48365,"[composer_Geoff_Zanelli, writer_Ahmet_Zappa, a..."
3,tt1462772,When Christ Was Alive,video,0,2009.0,,,Documentary,1.4,7,"[self_Greg_Robbins, actress_Katrina_Miller, pr..."
4,tt14627778,Love is Just a Dream,tvEpisode,0,2021.0,,,"Drama,Mystery",9.0,30,"[actress_Ok_Ja-yeon, writer_Baek_Mi-kyeong, ac..."
...,...,...,...,...,...,...,...,...,...,...,...
9995,tt14851694,The Grotto,movie,0,2022.0,,96.0,"Comedy,Drama",7.6,16,"[actor_Larry_Sullivan, actress_Susan_Sullivan,..."
9996,tt14851894,Khochai,tvMiniSerie,0,2021.0,2021.0,,Drama,7.6,10,"[actor_Anisur_Rahman_Milon, actor_Pran_Roy, ac..."
9997,tt14851900,Being Black in Porn,movie,0,2021.0,,97.0,Documentary,9.5,9,"[actor_August_Alexander, editor_Sean_Hopley, a..."
9998,tt14851954,Album Mode,tvEpisode,0,2022.0,,29.0,"Comedy,Music",7.6,131,"[actress_Busy_Philipps, writer_Meredith_Scardi..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          10000 non-null  object 
 1   primaryTitle    10000 non-null  object 
 2   titleType       10000 non-null  object 
 3   isAdult         10000 non-null  int64  
 4   startYear       9998 non-null   float64
 5   endYear         347 non-null    float64
 6   runtimeMinutes  6216 non-null   float64
 7   genres          9906 non-null   object 
 8   averageRating   10000 non-null  float64
 9   numVotes        10000 non-null  int64  
 10  Cate&names      10000 non-null  object 
dtypes: float64(4), int64(2), object(5)
memory usage: 859.5+ KB


In [4]:
# df["averageRating"].fillna(df["averageRating"].mean(), inplace=True)
# df["startYear"].fillna(df["startYear"].mean(), inplace=True)
# df["runtimeMinutes"].fillna(df["runtimeMinutes"].mean(), inplace=True)

# df['isAdult'] = df['isAdult'].astype(str)
# df["isAdult"] = df["isAdult"].apply(lambda x: 'Adult' if x == "True" else 'Notadult')

In [5]:
def BooleanToText (df):
    return df.apply(lambda x: 'True' if x == 1 else 'False')

In [6]:
def DateToCategory (df):
    
    df.fillna(df.mean(), inplace=True) # a valider
    
    bins = list(range(1800, 2056, 5))  # Intervalles de 5
    labels = [f"between{start}and{start+4}" for start in range(1800, 2051, 5)]

    return pd.cut(df, bins=bins, labels=labels, right=False)


In [7]:
def RuntimeToCategory (df):
    
    df.fillna(df.mean(), inplace=True) # a valider
    
    bins = list(range(0, 615, 15))  # Intervalles de 10h
    labels = [f"runtime_Between{start}and{start+4}" for start in range(0, 600, 15)]

    return pd.cut(df, bins=bins, labels=labels, right=False)

In [8]:
def RatingToCategory (df):
    
    df.fillna(df.mean(), inplace=True) # a valider
    
    bins = list(range(0, 12, 2))  
    labels = ['*','**','***','****','*****']

    return pd.cut(df, bins=bins, labels=labels, right=False)

In [9]:
def listTostr (df):
    return df.apply(lambda x: ' '.join(map(str, x)))

In [10]:
df['feature'] = df['primaryTitle'] + ' '

df['feature'] += 'titleType_'+df['titleType'] + ' '

df['feature'] += 'Rating_'+RatingToCategory(df['averageRating']).astype(str) + ' '

df['feature'] += 'startYear_'+DateToCategory(df['startYear']).astype(str) + ' '

df['feature'] += RuntimeToCategory (df['runtimeMinutes']).astype(str)+ ' '

df['feature'] += 'genre_'+df['genres'].astype(str)+' '

df['feature'] += 'ADULT_'+BooleanToText (df['isAdult']).astype(str)+' '

df['feature'] += listTostr (df['Cate&names']).astype(str)+' '

df['feature'][0]

'Terminated titleType_short Rating_**** startYear_between2010and2014 runtime_Between15and19 genre_Drama,Mystery,Short ADULT_False actress_Britt_Loren actor_Darrel_Cherney actress_Heidi_Herrmann director_Deon_Taylor producer_Yonatan_K._Mallinger writer_Diana_Erwin director_Tobias_Deml actress_Elizabeth_Lauren writer_Thomas_Fenton producer_Melissa_Weing '

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

cv = CountVectorizer()

count_matrix = cv.fit_transform(df['feature'])

cosine_sim = cosine_similarity(count_matrix)

cosine_sim

array([[1.        , 0.15      , 0.13693064, ..., 0.09759001, 0.15389675,
        0.11952286],
       [0.15      , 1.        , 0.18257419, ..., 0.14638501, 0.10259784,
        0.11952286],
       [0.13693064, 0.18257419, 1.        , ..., 0.13363062, 0.14048787,
        0.16366342],
       ...,
       [0.09759001, 0.14638501, 0.13363062, ..., 1.        , 0.15018785,
        0.17496355],
       [0.15389675, 0.10259784, 0.14048787, ..., 0.15018785, 1.        ,
        0.30656967],
       [0.11952286, 0.11952286, 0.16366342, ..., 0.17496355, 0.30656967,
        1.        ]])

In [12]:
def findfilm(index):
    return df.iloc[index][['tconst', 'primaryTitle']].tolist()

def getindex(filmm):
    return df[df['primaryTitle'] == filmm].index[0]

def Indexliste(array,listlent):
    R = list(enumerate(array, 0))
    sort_R=sorted(R, key=lambda x: x[1], reverse=True)
    sort_R=sort_R[1:listlent+1] # les 1 pour supprimer le film lui meme de la list des recommendations
    return sort_R


In [13]:


def recommend(matrice=cosine_sim, film='' , Nbfilm=5):
    indexfilm = getindex(film)
    vecteursimilarite = matrice[indexfilm]
    liste = Indexliste(vecteursimilarite, Nbfilm)
    # rekomand = [[movie[0],findfilm(movie[0]),round(movie[1],2)] for movie in liste]
    rekomand = [findfilm(movie[0]) for movie in liste]
    return rekomand




In [15]:
recommend(film='Terminated', Nbfilm=10)

[['tt1469855', 'Alone'],
 ['tt1474808', 'Number 9'],
 ['tt14689810', 'A Cohabitation'],
 ['tt1482259', 'Varjot'],
 ['tt1482445', 'Empirical'],
 ['tt1482446', 'Encrucijada'],
 ['tt14760780', 'Zarnitsa'],
 ['tt1470707', 'The White Snake'],
 ['tt1471157', 'Dalgalar'],
 ['tt14714340', 'Forest King']]

In [None]:
# def recommend(filmmm):
#     # Trouve l'indice du film dans le DataFrame basics_df où le titre correspond à filmmm
#     indexfilm = basics_df[basics_df['primaryTitle'] == filmmm].index[0]
#     print(indexfilm)
#     # Enumère les scores de similarité pour le film spécifié dans la matrice cosine_sim
#     scoresimilarity = list(enumerate(cosine_sim[indexfilm]))
#     print(type(scoresimilarity))
#     # Trie les scores de similarité dans l'ordre décroissant
#     scoresimilarity = sorted(scoresimilarity, key=lambda x: x[1], reverse=True)
#     print(scoresimilarity)
#     # Supprime le premier élément, car il s'agit du film lui-même (score de similarité maximal avec lui-même)
#     scoresimilarity = scoresimilarity[1:]
#     print(scoresimilarity)
#     # Sélectionne les cinq premiers films avec les scores de similarité les plus élevés
#     topfilms = scoresimilarity[:5]
#     print(scoresimilarity)
#     # Pour chaque film sélectionné, trouve son titre et ajoute-le à la liste de recommandations
#     rekomand = [findfilm(movie[0]) for movie in topfilms]
#     print(rekomand)
#     # Retourne la liste des recommandations
#     return rekomand

# recommendations = recommend("Secret Lives")
# print(recommendations)

