# Practice GroupBy operations with Netflix data

In [None]:
### Importing the libraries

import ast

import pandas as pd

## Reading the dataset as a dataframe

# Load the titles dataset from the CSV file next to the notebook
titles_df = pd.read_csv(filepath_or_buffer="titles.csv")

titles_df

## Discovering the dataset 

# Show the number of movies and shows.
# len() counts rows; DataFrame.size would report rows * columns instead.
print("Number of movies and shows: ", len(titles_df))

# Show the first year Netflix produced a movie or a show.
# The earliest year is the minimum of the column; position 0 is only whichever
# record happens to be stored first.
print("The first time Netflix  produced a movie or a show was in: ", titles_df['release_year'].min())

# Column dtypes, to see which columns are numeric and which are stored as objects
titles_df.dtypes

# Number of missing values per column
titles_df.isnull().sum()

##### 1. Drop records where the imdb_score column has missing values (NaN)

# Keep only the records that have an IMDb score; rows with NaN there are removed
titles_df = titles_df.dropna(axis=0, how="any", subset=["imdb_score"])

##### 2. For each TV show or movie which has a NaN value in the age certification column, replace it with `No certification`

# Missing age certifications become the explicit label "No certification".
# One pass is enough: after it there are no NaN values left in the column to fill.
titles_df['age_certification'] = titles_df['age_certification'].fillna("No certification")


##### 3. For each TV show or movie which has a NaN value in the seasons column, replace it with the most frequently occurring value in the seasons column

# mode() returns the most frequent value(s) in sorted order; take the first one
most_common_season = titles_df["seasons"].mode().iloc[0]

# Fill the gaps in `seasons` with that most frequent value
titles_df["seasons"] = titles_df["seasons"].fillna(value=most_common_season)

##### 4. Count the number of movies or TV shows for each age certification.

# One row per age certification, with the number of titles that carry it
certification_counts = (
    titles_df
    .groupby("age_certification")
    .size()
    .reset_index(name="count")
)
certification_counts

##### 5. Count the number of movies and TV shows (separately) produced in each release year.

# One row per (release_year, type) pair, with the number of titles in that pair
count_by_release_year = (
    titles_df
    .groupby(["release_year", "type"])
    .size()
    .reset_index(name="count")
)

##### 6. Calculate the average runtime and imdb score of movies and TV shows for each release year. 

# Mean runtime and mean IMDb score per release year (one row per year).
# Computed once; selecting the two columns and calling mean() gives the same
# table as an agg() dict mapping both columns to "mean".
average_duration_imdb_score = (
    titles_df
    .groupby("release_year")[["runtime", "imdb_score"]]
    .mean()
    .reset_index()
)
average_duration_imdb_score


##### 7. Count the number of movies and TV shows for each genre.

# Work on a copy so that parsing the genres column leaves titles_df untouched.
# The `genres` values still need to be parsed and exploded before counting:
# grouping on the column as stored would count whole genre combinations rather
# than individual genres.
new_titles_df = titles_df.copy()

Explode the genres column. 
The explode() function in pandas is used to transform a column with iterable elements 
(like lists, tuples, or sets) into multiple rows, with each row corresponding to one element from the iterable.

# Parse each stored genres value into a Python object so explode() can split it.
# `ast` is provided by the import cell at the top of the notebook.
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)


# One row per (title, genre) pair
exploded_df = new_titles_df.explode('genres')

# Number of titles per individual genre
genre_counts = exploded_df.groupby('genres').size().reset_index(name='count')


##### 8. Calculate the standard deviation of movies and TV shows imdb ratings for each release year. 

# Standard deviation of IMDb scores within each release year
imdb_score_std = (
    titles_df
    .groupby("release_year")["imdb_score"]
    .std()
    .reset_index()
)

imdb_score_std

##### 9. Calculate the maximum TMDB popularity and minimum IMDb score for each production country

# Display the column labels of titles_df
titles_df.columns

# Copy first so the parsed column does not leak back into titles_df
new_titles_df = titles_df.copy()

# Parse the stored production_countries values so explode() can split them.
# `ast` is provided by the import cell at the top of the notebook.
new_titles_df['production_countries'] = new_titles_df['production_countries'].apply(ast.literal_eval)

# One row per (title, production country) pair
exploded_df = new_titles_df.explode('production_countries')


# Per country: highest TMDB popularity and lowest IMDb score
TMDB_popularity = exploded_df.groupby("production_countries").agg({"tmdb_popularity": "max", "imdb_score": "min"}).reset_index()


##### 10. Calculate the sum of IMDb votes for each genre and find the average TMDB score

# Display the column labels of titles_df
titles_df.columns

# Fresh copy with the genres column parsed; `ast` is provided by the import
# cell at the top of the notebook.
new_titles_df = titles_df.copy()
new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)

# One row per (title, genre) pair
exploded_df = new_titles_df.explode('genres')

genres_votes_scores = exploded_df.groupby('genres').agg({
    'imdb_votes': 'sum',    # Calculate the sum of IMDb votes for each genre
    'tmdb_score': 'mean'    # Calculate the average TMDB score for each genre
}).reset_index()


##### 11. Calculate the average rating deviation from the mean for each genre (use custom defined function)



def rating_deviation(row):
    """Return each IMDb score's signed distance from the mean of the given scores.

    Parameters
    ----------
    row : object supporting ``row['imdb_score']``
        The ``imdb_score`` entry must offer ``.mean()`` and subtraction, e.g. a
        DataFrame (or one group of a groupby) with an ``imdb_score`` column.

    Returns
    -------
    The ``imdb_score`` values minus their own mean, element by element.
    """
    scores = row['imdb_score']
    return scores - scores.mean()


# Fresh copy with the genres column parsed; `ast` is provided by the import
# cell at the top of the notebook.
new_titles_df = titles_df.copy()

new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)

# Explode here instead of relying on an `exploded_df` left over from an earlier
# cell, so this step depends only on titles_df.
exploded_df = new_titles_df.explode('genres')

# Only the score column is handed to apply(), so the function never operates on
# the grouping column itself.
# NOTE(review): signed deviations from a group's own mean average out to ~0 by
# construction -- confirm whether an absolute deviation was intended.
genre_avg_deviation = (
    exploded_df
    .groupby('genres')[['imdb_score']]
    .apply(lambda x: rating_deviation(x).mean())
    .reset_index(name="average_deviation")
)


In [None]:
##### 12. Calculate the standardized score for TMDB popularity for each movie or TV show within its respective genre (use custom defined function)

Calculating the standardized score:

def standardize_score(x):
    """Standardize values: subtract their mean, then divide by their standard deviation.

    Parameters
    ----------
    x : object offering ``.mean()``, ``.std()`` and arithmetic, e.g. a pandas Series.

    Returns
    -------
    ``(x - x.mean()) / x.std()`` evaluated element by element.
    """
    centered = x - x.mean()
    spread = x.std()
    return centered / spread

# Group on the `genres` column exactly as stored in titles_df (not exploded) and
# standardize each title's TMDB popularity against the other titles in its group.
titles_df['standardized_tmdb_popularity'] = (
    titles_df
    .groupby('genres')['tmdb_popularity']
    .transform(standardize_score)
)

##### 13. Find the minimum and maximum release year for each type (movie or TV show):

# Earliest and latest release year for each value of `type`
min_max_year = (
    titles_df
    .groupby("type")["release_year"]
    .agg(["min", "max"])
)

min_max_year

##### 14. Calculate the average IMDb score and the max TMDB score for each genre and release year combination

# Fresh copy with the genres column parsed; `ast` is provided by the import
# cell at the top of the notebook.
new_titles_df = titles_df.copy()

new_titles_df['genres'] = new_titles_df['genres'].apply(ast.literal_eval)


# One row per (title, genre) pair
exploded_df = new_titles_df.explode('genres')

# Per (genre, release year): mean IMDb score and highest TMDB score.
# The result keeps (genres, release_year) as its index.
genre_year_scores = exploded_df.groupby(['genres', 'release_year']).agg({'imdb_score': 'mean', 'tmdb_score': 'max'})


In [None]:
##### 15. Calculate the average length of titles (number of characters) for each genre (use custom defined function)

# Display the column labels of titles_df
titles_df.columns

def calculate_average_length(titles):
    """Return the mean character count of the given titles.

    Parameters
    ----------
    titles : object offering ``.apply``, e.g. a pandas Series of titles.

    Returns
    -------
    The mean of the per-entry lengths. Entries that are not ``str`` instances
    contribute a length of 0 to the mean.
    """
    def _char_count(value):
        # Non-string entries (for example missing values) count as length 0
        if isinstance(value, str):
            return len(value)
        return 0

    return titles.apply(_char_count).mean()

# Fresh copy with the genres column parsed; `ast` is provided by the import
# cell at the top of the notebook.
new_titles_df = titles_df.copy()


new_titles_df["genres"] = new_titles_df["genres"].apply(ast.literal_eval)

# One row per (title, genre) pair
exploded_df = new_titles_df.explode("genres")

# Average title length per genre; after reset_index() the averages sit in the
# column named `title`.
genre_average_length = exploded_df.groupby('genres')['title'].apply(calculate_average_length).reset_index()


In [None]:
##### 16. Find the count and average IMDb score for each age certification category (use custom defined function)

# Count of scored titles and mean IMDb score for each age certification
certification_stats = (
    titles_df
    .groupby('age_certification')['imdb_score']
    .agg(['count', 'mean'])
)

# Give the two aggregate columns descriptive names
certification_stats.columns = ['count', 'average_imdb_score']

### The End!