In [None]:

# Project overview and goals: 
# In this project we explore the training set of the Netflix Prize dataset. The Netflix Prize was an open competition for the best collaborative filtering algorithm to predict user ratings for films, based on previous ratings (Wikipedia). The dataset contains more than one million ratings of Netflix titles collected over the years. We compute descriptive statistics and create several data visualisations. It is an exploratory study. Our goal is to answer simple questions about the data, namely:
# - How are ratings distributed overall, and how are they distributed over the years?
# - Which are the most liked movies in the database?
# - How do ratings relate to movie genre?
# - Can we get an idea of the customers' rating habits depending on their activity level?
# 
# Materials: 
# The Netflix database was divided into 4 .txt files, each containing a first row with the movie id and, under it, rows with customer id, rating, and date. Because of the size of the database, we created a random sample of 500,000 lines in the script sampling.py, also restructuring the data to have the following columns: movie id, customer id, rating, date. We also added a file containing the movie titles and year of production, and a file containing movie genres. This database was chosen for the richness of its data and for being well known and widely used.
# 
# Methods: 
# - In the DATA MANAGEMENT part of the notebook, we 1) load the data and merge the dataframes to create the final dataframe; 2) explore the dataset and deal with missing data; 3) create the new variables
# - In the DATA EXPLORATION part, we perform descriptive statistics
# - In the DATA VISUALIZATION part, we create the different plots
# 
# 

In [None]:


import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


# DATA MANAGEMENT PART 
# 

# Step 1: load ratings data
# 

In [None]:


# Load the sampled ratings file. Per the project overview it is expected to
# hold one rating per row (movie id, customer id, rating, date).
df_ratings = pd.read_csv("netflix_sampled_500k_proportional.csv")


# Display the loaded ratings table as the cell output.
df_ratings


# Step 2. Load and join the movie titles file
# 

In [None]:


# Read and clean the movie titles file.
# The file is parsed by hand: every line is split on the first two commas only
# (maxsplit=2), so any further commas stay inside the title field.
titles_rows = []

with open('movie_titles.csv', 'r', encoding='latin-1') as titles_file:
    for raw_line in titles_file:
        fields = raw_line.strip().split(',', 2)

        # Lines that do not provide all three fields (e.g. blank lines) are skipped.
        if len(fields) != 3:
            continue

        # The movie id becomes an int; year and title are kept as text here.
        titles_rows.append([int(fields[0]), fields[1], fields[2]])

df_titles = pd.DataFrame(titles_rows, columns=['movie_id', 'year', 'title'])

In [None]:


# Display the parsed titles table as the cell output.
df_titles



In [None]:


# Attach title and year to every rating. The left join keeps all rating rows,
# including those whose movie_id has no match in df_titles.
df_ratings_titles = df_ratings.merge(df_titles, how='left', on='movie_id')


# Display the merged table as the cell output.
df_ratings_titles


# Step 3. Load and join the genres file

In [None]:


# Read the genre file.
df_genres = pd.read_csv('netflix_genres.csv')

# Display the genres table as the cell output.
df_genres

In [None]:


# Add the genres with a left join on the movie id. The key is named 'movieId'
# in df_genres, so that redundant key column is dropped after the join.
df = (
    df_ratings_titles
    .merge(df_genres, how='left', left_on='movie_id', right_on='movieId')
    .drop(columns='movieId')
)

df


# Step 4. Data exploration, missing data treatment and variable creation 
# 

In [None]:


### First step of the exploratory analysis: get to know the DataFrame

# Print the column names, dtypes and non-null counts.
df.info()

In [None]:


### Change the type of the year and date variables, which are strings.
# Years that cannot be parsed as numbers become NaN (errors="coerce").
df = df.assign(
    date=pd.to_datetime(df["date"]),
    year=pd.to_numeric(df["year"], errors="coerce"),
)

### Check that both columns now have the expected dtype.
df.info()

In [None]:


# Check for missing data: number of null values per column.
df.isnull().sum()

In [None]:


# The year column has missing values: list the affected rows.
df[df['year'].isna()]



In [None]:


# Drop the rows with a missing year for now: only three titles are affected in
# a dataset of about 500,000 rows. (Open option: fill in the real release
# years by hand instead of dropping.)
# `subset` is passed as a list: dropna documents a column label or a sequence
# of labels, and a set is not a sequence.
df = df.dropna(subset=["year"])

In [None]:


# Check the null counts again after dropping the rows without a year.

df.isnull().sum()

# 'year' has no missing values left.

In [None]:


# The genres column has missing values too: list the affected rows.
df[df['genres'].isna()]




In [None]:


# Drop the ratings whose movie has no genre information and report how much
# data this costs.

length_before = len(df)

# `subset` as a list works across pandas versions (a bare string is only
# accepted by recent releases).
df = df.dropna(subset=['genres'])

length_after = len(df)

# Share of rows removed, as a positive percentage. The guard avoids a division
# by zero when df is already empty.
data_loss = (
    round((length_before - length_after) / length_before * 100, 2)
    if length_before
    else 0.0
)

print(f'Data Loss (%) = {data_loss}')

# Paulo 12/12: we lose only about 10% of the data for a cleaner dataset, which is a reasonable trade-off.



In [None]:


# Display the DataFrame after removing the rows with a missing year or genre.
df

In [None]:


# Check how many ratings belong to TV series rather than movies, documentaries, etc.

# Heuristic: flag the titles that contain 'Season ' (case-sensitive, with the
# trailing space). This is the most common title pattern for a series; other
# patterns may be missed, this is only a quick look.
# regex=False matches the text literally; na=False treats a missing title as
# "not a series", so the mask is purely boolean and can safely index df.
series = df['title'].str.contains('Season ', regex=False, na=False)

# Number of ratings that belong to a flagged title.
print(series.sum())

# Display the flagged rows.
df[series]




# Variable creation

In [None]:


# Start creating the new variables.

# The first column we add is the decade.
# It is a conditional column, e.g. 1980 <= year <= 1989 -> "1980s".


# Check the min and max of the numeric columns (year in particular) first.
df.describe()

In [None]:


### Label every rating with the decade of the movie's year, e.g.
### 1896 -> "1890s" and 2005 -> "2000s" (df.describe() shows min = 1896, max = 2005).
# The decade is derived arithmetically instead of from a hand-written list of
# year ranges. For the years currently in the data the labels are the same,
# but years outside 1890-2005 are now labelled correctly too: the previous
# list ended at 2005, so 2006-2009 would have been marked "Out of Range".
decade_start = df["year"] // 10 * 10

# A missing year cannot be assigned a decade; such rows keep the
# "Out of Range" marker (na_action="ignore" leaves them as NaN for fillna).
df["decade"] = (
    decade_start
    .map(lambda start: f"{int(start)}s", na_action="ignore")
    .fillna("Out of Range")
)

In [None]:


# Display the DataFrame with the new 'decade' column.
df

In [None]:


# Add another variable: the rating category.
# Each rating falls into one of three categories:
#   rating <= 2 -> 'Low', rating == 3 -> 'Neutral', rating >= 4 -> 'High'.
# A rating matching none of the rules is tagged 'Unknown'.
rating_values = df['rating']
category_rules = {
    'Low': rating_values <= 2,
    'Neutral': rating_values == 3,
    'High': rating_values >= 4,
}

# np.select takes the first matching rule for every row.
df['rating_category'] = np.select(
    list(category_rules.values()),
    list(category_rules.keys()),
    default='Unknown',
)



In [None]:


# Add another variable: divide the customers by activity level, based on the
# number of ratings each customer gave.

# Remove a previous 'activity_level' column so that re-running this cell does
# not produce 'activity_level_x' / 'activity_level_y' in the merge below.
df = df.drop(columns='activity_level', errors='ignore')

# One row per customer: number of ratings (rating volume) and mean rating
# (rating tendency).
customer_stats = df.groupby('customer_id').agg(
    num_ratings=('rating', 'count'),
    avg_rating=('rating', 'mean')
).reset_index()

# Step 1: ask for quintiles and check how many bins are really created.
# Repeated quantile edges are merged by duplicates='drop', so fewer than
# 5 bins may come out. labels=False returns integer bin codes, which works
# for any number of bins.
cut_results, bins = pd.qcut(
    customer_stats['num_ratings'],
    q=5,
    labels=False,
    duplicates='drop',
    retbins=True
)

num_bins_created = len(bins) - 1
print(f"You have {num_bins_created} unique bins.")

# Step 2: choose the labels from the number of bins actually created. qcut
# needs exactly one label per bin, so a fixed label list fails as soon as the
# data yields a different bin count.
activity_labels_by_bin_count = {
    1: ['Low'],
    2: ['Low', 'High'],
    3: ['Low', 'Medium', 'High'],
    4: ['Low', 'Medium', 'High', 'Very High'],
    5: ['Very Low', 'Low', 'Medium', 'High', 'Very High'],
}
activity_labels = activity_labels_by_bin_count.get(num_bins_created)
if activity_labels is None:
    raise ValueError(
        f"Cannot label activity levels: qcut produced {num_bins_created} bins."
    )

customer_stats['activity_level'] = pd.qcut(
    customer_stats['num_ratings'],
    q=5,
    labels=activity_labels,
    duplicates='drop'
)


print(customer_stats['activity_level'].value_counts())


# Merge the new 'activity_level' column from customer_stats into the main df.
# The left join keeps every row of df.
df = pd.merge(
    df,
    customer_stats[['customer_id', 'activity_level']],
    on='customer_id',
    how='left'
)

In [None]:


# Display the DataFrame with the new 'rating_category' and 'activity_level' columns.
df


# Subsetting and groupings 

In [None]:


### Mean rating per decade: one row per decade, with columns 'decade' and 'rating'.
df_view_decades = df.groupby("decade", as_index=False)["rating"].mean()

In [None]:


# Rank the movies with a weighted rating (WR), so that the number of ratings a
# movie received is taken into account next to its mean rating:
#     WR = v / (v + m) * R  +  m / (v + m) * C
# v = number of ratings of the movie, R = mean rating of the movie,
# m = minimum-votes threshold, C = mean rating over the whole dataset.

# Per-movie rating count (v) and mean rating (R).
movie_stats = (
    df.groupby(['movie_id', 'title'])['rating']
    .agg(rating_count='count', avg_rating='mean')
    .reset_index()
)

# C: overall mean rating.
C = df['rating'].mean()

# m: minimum-votes threshold, here the 90th percentile of the per-movie rating counts.
m = movie_stats['rating_count'].quantile(0.90)

# Short aliases matching the symbols of the formula.
v = movie_stats['rating_count']
R = movie_stats['avg_rating']

vote_total = v + m
movie_stats['weighted_rating'] = (v / vote_total) * R + (m / vote_total) * C

# Only the movies with at least m ratings are eligible for the ranking.
eligible_movies = movie_stats.loc[movie_stats['rating_count'] >= m].copy()

# Sort by weighted rating and keep the first 10 movies.
top_movies = eligible_movies.sort_values(by='weighted_rating', ascending=False).iloc[:10]

print(top_movies)

In [None]:


# Save all the data used by app.py.

output_dir = 'data'
# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, without a separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)


# index=False: the row index is not written to the CSV files.
df.to_csv(os.path.join(output_dir, 'main_df.csv'), index=False)
movie_stats.to_csv(os.path.join(output_dir, 'movies_by_rating.csv'), index=False)