In [95]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt


ModuleNotFoundError: No module named 'matplotlib'


The dataset we will use is the Netflix prize dataset, which contains more than a million ratings for different movies over the years. 


The traning set of the netflix database consists of 1) 4 txt files with the following structure: a row with the film id, and under it a series of rows containing the customer id, film rating and date; 2) a csv containing the movie id, year of release and title 

For this project, we will build a subset of this database (500.000 lines) and add the information for film title and genre.

Step 1: 
Load data from the txt files, restructure data to create the following columns: film id, customer id, rating, date; and create radom sample of 500.000 lines 


In [56]:
# define directory 

data_dir = './Netflix_data/' 

# Our data is divided in 4 files. we will take a proportional sample from each file. Here we define the number of rows to randomly select from each file
PROPORTIONAL_TARGETS = {
    'combined_data_1.txt': 119695,
    'combined_data_2.txt': 134242,
    'combined_data_3.txt': 112469,
    'combined_data_4.txt': 133594
}

# get file names - they have the same structure 
file_names = glob.glob(os.path.join(data_dir, 'combined_data_*.txt'))

#define size of our sample 
TOTAL_TARGET_LINES = 500000

#initialize sampled dfs and film id 
all_sampled_dfs = [] 
current_film_id = None

#start restructuring and sampling 
print(f"Starting Proportional Restructuring and Sampling (Total Target: {TOTAL_TARGET_LINES:,})...")
print("---")

# Here we do a loop where for each file, we set the sample size and we restructure the data.

for file_path in file_names:
    # Use os.path.basename to get the key for the PROPORTIONAL_TARGETS dictionary
    file_base_name = os.path.basename(file_path)
    
    # Get the required sample size for this specific file
    target_sample_size = PROPORTIONAL_TARGETS.get(file_base_name, 0)
    
    if target_sample_size == 0:
        print(f"Warning: Sample size not defined for {file_base_name}. Skipping.")
        continue
        
    print(f"Processing and sampling {file_base_name} (Target: {target_sample_size:,} lines)...")
    
    data_rows = []
    
    # 1. Restructure: Read the data, estract film id if the row contains the film id, and separate customer id, rating and date if the row contains these data ù
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.endswith(':'):
                current_film_id = int(line[:-1])
            elif line:
                try:
                    customer_id, rating, date = line.split(',')
                    # Store the complete, flattened row
                    data_rows.append([current_film_id, int(customer_id), int(rating), date])
                except ValueError:
                    # Skip malformed rating lines
                    continue
    
    # 2. Convert to DataFrame (only the current file's data)
    current_file_df = pd.DataFrame(
        data_rows, 
        columns=['movie_id', 'customer_id', 'rating', 'date'])
        
# 3. Sample IMMEDIATELY
    if len(current_file_df) < target_sample_size:
        print(f"Warning: File {file_base_name} had fewer lines than the target. Taking all {len(current_file_df):,} lines.")
        sampled_df = current_file_df.copy()
    else:
        # random_state=42 ensures this sample is reproducible
        sampled_df = current_file_df.sample(n=target_sample_size, random_state=42)
    
    # 4. Store the small sample and release the large intermediate DataFrame
    all_sampled_dfs.append(sampled_df)
    del current_file_df # Explicitly free up memory
    
    print(f"Finished sampling from {file_base_name}. Sample size collected: {len(sampled_df):,}")

# 5. Combine the small samples into the final DataFrame
print("---")
print("Combining all small samples...")
final_sampled_data = pd.concat(all_sampled_dfs, ignore_index=True)

# 6. Final Output
output_file = 'netflix_sampled_500k_proportional.csv'
final_sampled_data.to_csv(output_file, index=False)

print(f"Process complete! The final proportional dataset has **{len(final_sampled_data):,}** lines.")
print(f"Saved to **{output_file}**.")


Starting Proportional Restructuring and Sampling (Total Target: 500,000)...
---
Processing and sampling combined_data_4.txt (Target: 133,594 lines)...
Finished sampling from combined_data_4.txt. Sample size collected: 133,594
Processing and sampling combined_data_3.txt (Target: 112,469 lines)...
Finished sampling from combined_data_3.txt. Sample size collected: 112,469
Processing and sampling combined_data_2.txt (Target: 134,242 lines)...
Finished sampling from combined_data_2.txt. Sample size collected: 134,242
Processing and sampling combined_data_1.txt (Target: 119,695 lines)...
Finished sampling from combined_data_1.txt. Sample size collected: 119,695
---
Combining all small samples...
Process complete! The final proportional dataset has **500,000** lines.
Saved to **netflix_sampled_500k_proportional.csv**.


In [57]:
df_ratings = pd.read_csv("netflix_sampled_500k_proportional.csv")


df_ratings

Unnamed: 0,movie_id,customer_id,rating,date
0,14086,2310762,5,2005-07-07
1,16390,1882111,4,2004-03-04
2,16565,998169,2,2005-10-21
3,15107,2427506,4,2003-07-05
4,15755,618829,4,2005-06-14
...,...,...,...,...
499995,2128,462,5,2005-05-09
499996,1974,90928,5,2005-11-01
499997,478,1197493,1,2004-07-09
499998,2375,2328701,5,2004-06-03


Step 2. Load and join the movie titles file


In [58]:
#Read and clean the movie titles file 
titles_rows = []
    
with open('movie_titles.csv', 'r', encoding = 'latin-1') as f: 
    for line in f: 
      line = line.strip()
      parts = line.split(',', 2)

      if len(parts) == 3:
            film_id, year, title = parts
            titles_rows.append([int(film_id), year, title])

df_titles = pd.DataFrame(titles_rows, columns=['movie_id', 'year', 'title'])


In [59]:
df_titles



Unnamed: 0,movie_id,year,title
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004,Fidel Castro: American Experience
17767,17768,2000,Epoch
17768,17769,2003,The Company


In [60]:
#merge the movie titles file with the rating file

df_ratings_titles   = pd.merge(df_ratings, df_titles, how = 'left', on = 'movie_id')



df_ratings_titles

Unnamed: 0,movie_id,customer_id,rating,date,year,title
0,14086,2310762,5,2005-07-07,1999,Dawson's Creek: Season 3
1,16390,1882111,4,2004-03-04,2002,Panic Room
2,16565,998169,2,2005-10-21,2001,K-Pax
3,15107,2427506,4,2003-07-05,2001,Ocean's Eleven
4,15755,618829,4,2005-06-14,1988,Big
...,...,...,...,...,...,...
499995,2128,462,5,2005-05-09,1993,Rudy
499996,1974,90928,5,2005-11-01,1995,Il Postino
499997,478,1197493,1,2004-07-09,1962,The Beverly Hillbillies
499998,2375,2328701,5,2004-06-03,1989,Fletch Lives


Step 3. Load and join the genres file

In [61]:
#read the genre file
df_genres = pd.read_csv('netflix_genres.csv')

df_genres

Unnamed: 0,movieId,genres
0,1,Documentary|Animation|Family
1,3,Crime|Drama|Mystery
2,4,Family
3,5,Documentary|Sport
4,6,Documentary
...,...,...
12274,17764,Comedy|Drama|History|Romance
12275,17765,Action|Adventure|Family|Sci-Fi
12276,17768,Action|Drama|Fantasy
12277,17769,Drama|Music|Romance


In [62]:
#merge the df with the genre file 

df = pd.merge(df_ratings_titles, df_genres, how = 'left', left_on = 'movie_id', right_on = 'movieId').drop('movieId', axis = 1)

df

Unnamed: 0,movie_id,customer_id,rating,date,year,title,genres
0,14086,2310762,5,2005-07-07,1999,Dawson's Creek: Season 3,
1,16390,1882111,4,2004-03-04,2002,Panic Room,Crime|Drama|Thriller
2,16565,998169,2,2005-10-21,2001,K-Pax,Drama|Sci-Fi
3,15107,2427506,4,2003-07-05,2001,Ocean's Eleven,Crime|Thriller
4,15755,618829,4,2005-06-14,1988,Big,Comedy|Drama|Fantasy|Romance
...,...,...,...,...,...,...,...
499995,2128,462,5,2005-05-09,1993,Rudy,Biography|Drama|Sport
499996,1974,90928,5,2005-11-01,1995,Il Postino,Talk-Show
499997,478,1197493,1,2004-07-09,1962,The Beverly Hillbillies,Comedy|Family
499998,2375,2328701,5,2004-06-03,1989,Fletch Lives,Comedy|Crime|Mystery


Step 4. Data exploration, missing data treatment and variable creation 


In [71]:
### Première choses dans l'analyse exploratoire: Connaître le Dataframe

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   movie_id     500000 non-null  int64         
 1   customer_id  500000 non-null  int64         
 2   rating       500000 non-null  int64         
 3   date         500000 non-null  datetime64[ns]
 4   year         500000 non-null  object        
 5   title        500000 non-null  object        
 6   genres       449270 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 26.7+ MB


In [None]:
### change type for the year and date variables which are strings 

df["date"] = pd.to_datetime(df["date"])
df['year'] = pd.to_numeric(df['year'], errors='coerce')
df.info() ### On voit bien que maintenant tout est dans le bon format

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   movie_id     500000 non-null  int64         
 1   customer_id  500000 non-null  int64         
 2   rating       500000 non-null  int64         
 3   date         500000 non-null  datetime64[ns]
 4   year         499997 non-null  float64       
 5   title        500000 non-null  object        
 6   genres       449270 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 26.7+ MB


In [None]:
#check for missing data 
df.isnull().sum()

movie_id           0
customer_id        0
rating             0
date               0
year               3
title              0
genres         50730
dtype: int64

In [None]:
#we have missing data for the year so let's see what they are 
df.loc[(df['year'].isna())]



Unnamed: 0,movie_id,customer_id,rating,date,year,title,genres
105506,15918,2620163,3,2002-11-07,,Hote Hote Pyaar Ho Gaya,
122086,17667,855878,5,2004-04-19,,Eros Dance Dhamaka,
359870,7241,2248421,3,2005-08-24,,Ancient Civilizations: Athens and Greece,


In [88]:
#let's drop them for now (@Paulo being only three titles we could fill them with the real values, opinions? )
df = df.dropna(subset={"year"})


In [None]:
#check the na again 

df.isnull().sum()

#year is good now 

movie_id           0
customer_id        0
rating             0
date               0
year               0
title              0
genres         50727
dtype: int64

In [85]:
#we have missing data in the genre column so let's visualize them 
df.loc[(df['genres'].isna())]




Unnamed: 0,movie_id,customer_id,rating,date,year,title,genres
0,14086,2310762,5,2005-07-07,1999.0,Dawson's Creek: Season 3,
8,17633,416556,4,2004-06-21,1998.0,Joseph and the Amazing Technicolor Dreamcoat,
9,15306,1792741,3,2005-07-19,2003.0,Sealab 2021: Season 3,
34,14621,882359,4,2003-10-11,2001.0,Shrek (Full-screen),
53,14302,1360238,5,2004-10-19,2000.0,The Sopranos: Season 2,
...,...,...,...,...,...,...,...
499954,4243,200362,3,2005-07-10,1995.0,The Desperate Trail,
499966,111,911778,2,2004-06-14,2003.0,Duplex (Widescreen),
499979,1476,429060,3,2005-11-07,2004.0,Six Feet Under: Season 4,
499991,3253,269152,3,2004-09-15,1998.0,The Opposite of Sex,


In [None]:
# fill the NaN with 'Empty' (@Paulo: for now, tell me if you have other ideas for missing values)
df['genres'].fillna('Empty')

0                                Empty
1                 Crime|Drama|Thriller
2                         Drama|Sci-Fi
3                       Crime|Thriller
4         Comedy|Drama|Fantasy|Romance
                      ...             
499995           Biography|Drama|Sport
499996                       Talk-Show
499997                   Comedy|Family
499998            Comedy|Crime|Mystery
499999                           Drama
Name: genres, Length: 500000, dtype: object

In [68]:
# i want to see how many of these are tv series and not movies, documentaries etc
 

#filter the titles that contain 'season '. This is the most common structure of the title when it is a series but some others could be left out, let's just have a look
series = df_final['Title'].str.contains('Season ')



print(series.sum())


df_final[series]




15075


Unnamed: 0,FilmID,CustomerID,Rating,Date,Year,Title,genres
0,14086,2310762,5,2005-07-07,1999,Dawson's Creek: Season 3,
9,15306,1792741,3,2005-07-19,2003,Sealab 2021: Season 3,
53,14302,1360238,5,2004-10-19,2000,The Sopranos: Season 2,
66,13663,1948046,5,2005-11-16,2000,South Park: Season 4,
69,16436,863285,2,2005-09-13,1966,The Monkees: Season 1,
...,...,...,...,...,...,...,...
499885,4407,1197493,3,2004-10-04,1993,Boy Meets World: Season 1,
499886,223,1070779,5,2005-08-03,2003,Chappelle's Show: Season 1,
499906,2172,60269,4,2003-10-01,1991,The Simpsons: Season 3,
499979,1476,429060,3,2005-11-07,2004,Six Feet Under: Season 4,


In [73]:
#let's start with the new variables 

#the first column we add the decade
# it is a conditional column: if 1980<=year=>1989 -> 1980s 


#let's check the min and max 
df.describe()

Unnamed: 0,movie_id,customer_id,rating,date
count,500000.0,500000.0,500000.0,500000
mean,9073.441748,1321780.0,3.606346,2004-10-09 07:01:26.688000256
min,1.0,6.0,1.0,1999-12-11 00:00:00
25%,4683.0,659282.5,3.0,2004-04-30 00:00:00
50%,9051.0,1318721.0,4.0,2005-01-23 00:00:00
75%,13630.75,1984480.0,4.0,2005-07-09 00:00:00
max,17770.0,2649429.0,5.0,2005-12-31 00:00:00
std,5131.325278,765776.7,1.084793,


In [91]:
###on voit que year min = 1896 et max= 2005 donc
conditions = [
    (df["year"] >= 1890) & (df["year"] <= 1899),
    (df["year"] >= 1900) & (df["year"] <= 1909),
    (df["year"] >= 1910) & (df["year"] <= 1919),
    (df["year"] >= 1920) & (df["year"] <= 1929),
    (df["year"] >= 1930) & (df["year"] <= 1939),
    (df["year"] >= 1940) & (df["year"] <= 1949),
    (df["year"] >= 1950) & (df["year"] <= 1959),
    (df["year"] >= 1960) & (df["year"] <= 1969),
    (df["year"] >= 1970) & (df["year"] <= 1979),
    (df["year"] >= 1980) & (df["year"] <= 1989),
    (df["year"] >= 1990) & (df["year"] <= 1999),
    (df["year"] >= 2000) & (df["year"] <= 2005)
]

values = [
    "1890s",
    "1900s",
    "1910s",
    "1920s",
    "1930s",
    "1940s",
    "1950s",
    "1960s",
    "1970s",
    "1980s",
    "1990s",
    "2000s"
]

df["decade"] = np.select(conditions, values, default="Out of Range")

In [93]:
### Maintenant on groupe les decades et le ratings

df_view_decades = (
    df[["decade", "rating"]]
    .groupby("decade", as_index=False)
    .mean()
)


FIRST VISUALIZATIONS

In [None]:
def plot_bar(df, col_x, col_y, title=None):
    plt.figure()
    plt.bar(df[col_x], df[col_y])
    
    plt.xlabel(col_x)
    plt.ylabel(col_y)
    
    if title:
        plt.title(title)
    
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
### Comme prèmiere view on peut faire:
    
plot_bar(df_view_decades, col_x="decade", col_y="rating", title="Rating per decade (movie release)")

NameError: name 'plot_bar' is not defined