# <div style='color:white;background: #005792;text-align: center;padding: 15px 0'>Recommandations - Préparation des données Title basics</div>

## Participants
* Samantha
* Rachelle
* Andrew

## <div style='background: #005792;text-align: center;padding: 15px 0'> <a style= 'color:white;' >Configuration des variables globales</a></div>

### Installation des librairies

In [1]:
# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install seaborn
# !pip install plotly-express
# !pip install plotly

### Importation des librairies

In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import re

### Chargement des fichiers

In [2]:
source_dir= '/home/dstrec/dstrec/010_data/000_source/imdb_datasets'
title_basics = 'title.basics.tsv'
title_ratings = 'title.ratings.tsv'

file_title_basics = f"{source_dir}/{title_basics}"
file_title_ratings = f"{source_dir}/{title_ratings}"

### Configuration des fonctions

In [4]:
def add_adult_genre(row):
    genres = str(row['genres']).split(',')
    if row['isAdult']:
        genres.append('forAdult')
    return ','.join(genres)


def clean_and_filter_data(df, ratings_df):
    df = df[['tconst', 'titleType', 'primaryTitle', 'isAdult', 'genres']]
    df.replace('\\N', None, inplace=True)
    df['isAdult'] = df['isAdult'].astype(bool)
    df['genres'] = df.apply(add_adult_genre, axis=1)
    df.drop(columns=['isAdult'], inplace=True)
    df = df[df['titleType'] == 'movie']
    df = df.merge(ratings_df, on='tconst', how='left')
    return df 

## <div style='background: #005792;text-align: center;padding: 15px 0'> <a style= 'color:white;' >Préparation des données</a></div>

### Chargement des jeux de données

In [None]:
ratings_df = pd.read_csv(file_title_ratings, sep='\t', na_values='\\N', low_memory=False)

In [5]:
chunk_size = 300000
basics_chunk = [] 
for chunk in pd.read_csv(file_title_basics, sep='\t', na_values='\\N', chunksize=chunk_size):
    
    chunk = clean_and_filter_data(chunk, ratings_df)
    basics_chunk.append(chunk)
    
df_basics = pd.concat(basics_chunk)
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,genres,averageRating,numVotes
0,tt0000009,movie,Miss Jerry,Romance,5.4,212.0
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,"Documentary,News,Sport",5.2,515.0
2,tt0000502,movie,Bohemios,,4.4,17.0
3,tt0000574,movie,The Story of the Kelly Gang,"Action,Adventure,Biography",6.0,900.0
4,tt0000591,movie,The Prodigal Son,Drama,5.4,24.0


### EDA

In [6]:
df_basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 681499 entries, 0 to 10552
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         681499 non-null  object 
 1   titleType      681499 non-null  object 
 2   primaryTitle   681497 non-null  object 
 3   genres         681499 non-null  object 
 4   averageRating  310399 non-null  float64
 5   numVotes       310399 non-null  float64
dtypes: float64(2), object(4)
memory usage: 36.4+ MB


### Valeurs manquantes

In [7]:
df_basics.isna().sum()

tconst                0
titleType             0
primaryTitle          2
genres                0
averageRating    371100
numVotes         371100
dtype: int64

### Suppression de la colonne `primaryTitle`

In [8]:
df_basics = df_basics.dropna(subset='primaryTitle')

### Supression des valeurs dupliquées

In [9]:
duplicate_titles = df_basics[df_basics.duplicated('primaryTitle', keep=False)]
duplicate_counts = duplicate_titles['titleType'].value_counts()
duplicate_counts

titleType
movie    136991
Name: count, dtype: int64

In [10]:
df_basics_sorted = df_basics.sort_values(by='numVotes', ascending=False)
df_basics_no_duplicates = df_basics_sorted.drop_duplicates(subset='primaryTitle', keep='first')
df_basics_no_duplicates.reset_index(drop=True, inplace=True)

### EDA

In [11]:
df_basics_no_duplicates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587087 entries, 0 to 587086
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         587087 non-null  object 
 1   titleType      587087 non-null  object 
 2   primaryTitle   587087 non-null  object 
 3   genres         587087 non-null  object 
 4   averageRating  275270 non-null  float64
 5   numVotes       275270 non-null  float64
dtypes: float64(2), object(4)
memory usage: 26.9+ MB


### Vérification des données

In [12]:
duplicate_titles_no_duplicates = df_basics_no_duplicates[df_basics_no_duplicates.duplicated('primaryTitle', keep=False)]
duplicate_counts_no_duplicates = duplicate_titles_no_duplicates['titleType'].value_counts()
duplicate_counts_no_duplicates

Series([], Name: count, dtype: int64)

In [13]:
df_basics_no_duplicates.isna().sum()

tconst                0
titleType             0
primaryTitle          0
genres                0
averageRating    311817
numVotes         311817
dtype: int64

### Affichage du jeu de données

In [14]:
df_basics_tconst = df_basics_no_duplicates['tconst']
df_basics_tconst.head()

0    tt0111161
1    tt0468569
2    tt1375666
3    tt0137523
4    tt0109830
Name: tconst, dtype: object

### Insertion dans un fichier CSV

In [15]:
dest_dir ='/home/dstrec/dstrec/010_data/001_transformed'
title_basics_csv = 'title_basics.csv'
title_basics_tconst_csv = 'title_basics_tconst.csv'

output_file_title_basics = f"{dest_dir}/{title_basics_csv}"
output_file_title_basics_tconst = f"{dest_dir}/{title_basics_tconst_csv}"

df_basics_no_duplicates.to_csv(output_file_title_basics, index=False)
df_basics_tconst.to_csv(output_file_title_basics_tconst, index=False)