# **Pré-traitement sur DF_title_basics_clean**

# Objectif
Nettoyer et préparer les données du dataset DF_title_basics pour qu'elles soient prêtes à l'analyse.

# Récapitulatif des traitements réalisés sur ce DF :
- on ne garde que les titres de type "movie"
- on ne garde que les titres dont "isAdult" = 0
- on ne garde que les titres dont la durée "runtimeMinutes" > 40
- on remplace les valeurs \N par NaN
- on supprime les films pour adultes (conséquence du filtre "isAdult" = 0)
- on supprime les colonnes endYear et isAdult
- pour les films où le genre n'est pas renseigné : indiquer le genre 'Divers'
- on supprime les lignes restantes contenant des valeurs manquantes

## 1️⃣ Chargement des bibliothèques et du DataFrame

In [None]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns


In [None]:
# Download the IMDb "title.basics" dump and parse it as a tab-separated file.
# - quoting=csv.QUOTE_NONE: the file is plain TSV, not CSV-quoted, so a double
#   quote inside a title is literal text. With pandas' default quote handling an
#   unbalanced quote can make the parser merge several rows into one field.
# - dtype=str: keep every column as text so that the string-based steps of this
#   notebook (the '\N' replacement, isAdult == '0') behave the same as before.
df_title_basics = pd.read_csv(
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    sep='\t',
    quoting=csv.QUOTE_NONE,
    dtype=str,
    low_memory=False,
)
df_title_basics.shape

(10790736, 9)

## 2️⃣ Exploration initiale du DataFrame
- Aperçu des colonnes (`df.info()`, `df.head()`)
- Description rapide : dimensions, types, aperçu des valeurs

In [None]:
# Print the column names, dtypes and memory footprint of the freshly loaded frame.
df_title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10790736 entries, 0 to 10790735
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 740.9+ MB


In [None]:
# Count how many rows there are for each value of titleType.
df_title_basics['titleType'].value_counts()

titleType
tvEpisode       8260949
short            995864
movie            681499
video            292469
tvSeries         263691
tvMovie          145677
tvMiniSeries      54319
tvSpecial         47633
videoGame         38319
tvShort           10315
tvPilot               1
Name: count, dtype: int64

## 3️⃣ Nettoyage des valeurs manquantes et incohérences

In [None]:
# Turn the literal '\N' placeholder into a real missing value (NaN)
# so that pandas' isna()/dropna()/fillna() can see it.
df_title_basics = df_title_basics.replace(to_replace='\\N', value=np.nan)

In [None]:
# Inspect the rows that contain at least one missing value.
# This is a look-only step: the selection is displayed, NOT assigned back to
# df_title_basics. Assigning it back would silently discard every row that has
# no missing value at all, which is not a cleaning rule of this notebook.
rows_with_missing = df_title_basics.isna().any(axis=1)
df_title_basics[rows_with_missing].head()

In [None]:
# First cleaning pass: drop everything we do not need, to shrink the frame.
# - keep only titleType == "movie"
# - keep only isAdult == '0'
# - keep only runtimeMinutes > 40

# runtimeMinutes is stored as text. Comparing text with '>' is lexicographic
# ('100' > '40' is False, '5' > '40' is True), so the runtime is converted to a
# number for the comparison. Values that are not numeric become NaN and are
# rejected by the '> 40' test. The column itself is left unchanged.
runtime_minutes = pd.to_numeric(df_title_basics['runtimeMinutes'], errors='coerce')

is_movie = df_title_basics['titleType'] == "movie"
is_not_adult = df_title_basics['isAdult'] == '0'
is_long_enough = runtime_minutes > 40

df_title_basics = df_title_basics[is_movie & is_not_adult & is_long_enough]
df_title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
930,tt0000941,movie,Locura de amor,Locura de amor,0,1909,,45,Drama
1172,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,0,1910,,58,"Adventure,Drama"
...,...,...,...,...,...,...,...,...,...
10790418,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
10790457,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
10790502,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
10790627,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary


In [None]:
# Info on the DataFrame after the filtering step:
df_title_basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 307793 entries, 8 to 10790686
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          307793 non-null  object
 1   titleType       307793 non-null  object
 2   primaryTitle    307792 non-null  object
 3   originalTitle   307792 non-null  object
 4   isAdult         307793 non-null  object
 5   startYear       302919 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  307793 non-null  object
 8   genres          287772 non-null  object
dtypes: object(9)
memory usage: 23.5+ MB


In [None]:
# Dimensions (rows, columns) of the filtered frame.
df_title_basics.shape

(307793, 9)

In [None]:
# Count the missing values in each column.
df_title_basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           1
originalTitle          1
isAdult                0
startYear           4874
endYear           307793
runtimeMinutes         0
genres             20021
dtype: int64

In [None]:
# Look for fully duplicated rows (every column identical to an earlier row)
# and display them; an empty frame means there are none.
is_duplicate = df_title_basics.duplicated()
df_title_basics[is_duplicate]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


In [None]:
# Remove the endYear column.
df_title_basics = df_title_basics.drop(columns=['endYear'])

In [None]:
# Remove the isAdult column.
df_title_basics = df_title_basics.drop(columns=['isAdult'])

In [None]:
# "genres" column: label titles whose genre is missing as 'Divers'.
genres_filled = df_title_basics['genres'].fillna('Divers')
df_title_basics = df_title_basics.assign(genres=genres_filled)

In [None]:
# Re-count the missing values per column after filling "genres".
df_title_basics.isna().sum()

tconst               0
titleType            0
primaryTitle         1
originalTitle        1
startYear         4874
runtimeMinutes       0
genres               0
dtype: int64

In [None]:
# Drop the remaining rows that still contain a missing value (mostly startYear).
# dropna() returns a new frame that is bound back to the name, rather than
# mutating the existing object with inplace=True; this matches how the other
# cleaning cells of the notebook rebind df_title_basics.
df_title_basics = df_title_basics.dropna()

In [None]:
# Final check: count the missing values left in each column.
df_title_basics.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
startYear         0
runtimeMinutes    0
genres            0
dtype: int64

## Résumé final

In [None]:
# Publish the cleaned frame under its final name.
# .copy() makes df_title_basics_clean an independent object: a plain assignment
# would only create a second name for the same frame, so any later in-place
# change to df_title_basics would also alter the "clean" result.
df_title_basics_clean = df_title_basics.copy()
df_title_basics_clean

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,1894,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,1907,90,Drama
930,tt0000941,movie,Locura de amor,Locura de amor,1909,45,Drama
1172,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,1910,58,"Adventure,Drama"
...,...,...,...,...,...,...,...
10790418,tt9916190,movie,Safeguard,Safeguard,2020,95,"Action,Adventure,Thriller"
10790457,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,2020,84,Thriller
10790502,tt9916362,movie,Coven,Akelarre,2020,92,"Drama,History"
10790627,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,57,Documentary
