# Data Science Learning Journey  
*Curiosity to Capability — One Notebook at a Time*

---
Compiled and authored by **Partho Sarothi Das**   
	Dhaka, Bangladesh  
	Bachelor's & Master's in Statistics  
	Investment Banking Professional → Aspiring Data Scientist 
    
---

In [2]:
import pandas as pd

# Read the IMDb movie table from disk; `df` is the frame the grouping
# exercises in this notebook operate on.
df = pd.read_csv(filepath_or_buffer='data/imdb-top-1000.csv')
# Preview the first four rows.
df.head(n=4)

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,The Shawshank Redemption,1994,142,Drama,9.3,Frank Darabont,Tim Robbins,2343110,28341469.0,80.0
1,The Godfather,1972,175,Crime,9.2,Francis Ford Coppola,Marlon Brando,1620367,134966411.0,100.0
2,The Dark Knight,2008,152,Action,9.0,Christopher Nolan,Christian Bale,2303232,534858444.0,84.0
3,The Godfather: Part II,1974,202,Crime,9.0,Francis Ford Coppola,Al Pacino,1129952,57300000.0,90.0


### Basic Grouping

In [4]:
# 1. Average IMDb rating per genre.

# Keep the grouped object around under the name `genre`, then reduce it.
genre = df.groupby('Genre')
mean_rating_by_genre = genre['IMDB_Rating'].mean()
# reset_index() turns the Genre index into an ordinary column for display.
mean_rating_by_genre.reset_index()

Unnamed: 0,Genre,IMDB_Rating
0,Action,7.949419
1,Adventure,7.9375
2,Animation,7.930488
3,Biography,7.938636
4,Comedy,7.90129
5,Crime,8.016822
6,Drama,7.957439
7,Family,7.8
8,Fantasy,8.0
9,Film-Noir,7.966667


In [5]:
# 2. Group by Director and count how many movies each director has in the dataset.
#
# Only the last expression of a cell is displayed, so a single expression is
# kept here. `size()` counts the rows in each group. An equivalent spelling for
# this question is `df.groupby('Director')['Director'].count()`.
df.groupby('Director').size()

Director
Aamir Khan             1
Aaron Sorkin           1
Abdellatif Kechiche    1
Abhishek Chaubey       1
Abhishek Kapoor        1
                      ..
Zack Snyder            2
Zaza Urushadze         1
Zoya Akhtar            2
Çagan Irmak            1
Ömer Faruk Sorak       1
Length: 548, dtype: int64

In [6]:
# 3. Number of movies released in each year.
# Released_Year is grouped as loaded from the CSV; the result below includes
# a non-year label ('PG'), so the column is not purely numeric at this point.
by_release_year = df.groupby('Released_Year')
by_release_year['Released_Year'].count()

Released_Year
1920     1
1921     1
1922     1
1924     1
1925     2
        ..
2017    22
2018    19
2019    23
2020     6
PG       1
Name: Released_Year, Length: 100, dtype: int64

In [7]:
# Same per-year tally without groupby: value_counts() counts each distinct
# value and lists the most frequent first.
release_years = df['Released_Year']
release_years.value_counts()

Released_Year
2014    32
2004    31
2009    29
2013    28
2016    28
        ..
1926     1
1936     1
1924     1
1921     1
PG       1
Name: count, Length: 100, dtype: int64

### Aggregation and Statistics

In [9]:
# 4. Longest and shortest Runtime within each genre.

runtime_by_genre = df.groupby('Genre')['Runtime']
# Passing a list of reducers yields one output column per reducer.
runtime_by_genre.agg(['max', 'min'])

Unnamed: 0_level_0,max,min
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Action,321,45
Adventure,228,88
Animation,137,71
Biography,209,93
Comedy,188,68
Crime,229,80
Drama,242,64
Family,115,100
Fantasy,94,76
Film-Noir,108,100


In [10]:
# 5. Average Gross revenue per director.

gross_by_director = df.groupby('Director')['Gross']
# A one-element list keeps the result as a DataFrame with a 'mean' column.
gross_by_director.agg(['mean'])

Unnamed: 0_level_0,mean
Director,Unnamed: 1_level_1
Aamir Khan,1223869.0
Aaron Sorkin,853090410.0
Abdellatif Kechiche,2199675.0
Abhishek Chaubey,218428303.0
Abhishek Kapoor,1122527.0
...,...
Zack Snyder,159062369.0
Zaza Urushadze,144501.0
Zoya Akhtar,4337509.5
Çagan Irmak,461855363.0


In [11]:
# 6. The three genres with the highest average IMDB_Rating.

genre_mean_rating = df.groupby('Genre')['IMDB_Rating'].mean()
# Sort from best to worst, then keep the first three entries.
ranked_genres = genre_mean_rating.sort_values(ascending=False)
ranked_genres.head(3)

Genre
Western    8.350000
Crime      8.016822
Fantasy    8.000000
Name: IMDB_Rating, dtype: float64

### Filtering and Conditions

In [13]:
# 7. Lead actors (the Star1 column) who appear in more than 5 movies.

movies_per_lead = df.groupby('Star1').size()
# Name the count column explicitly so it can be filtered on.
df1 = movies_per_lead.reset_index(name='movie_count')
df1.loc[df1['movie_count'].gt(5)]

Unnamed: 0,Star1,movie_count
0,Aamir Khan,7
9,Al Pacino,10
104,Cary Grant,6
111,Charles Chaplin,6
128,Christian Bale,8
135,Clint Eastwood,10
148,Daniel Radcliffe,6
165,Denzel Washington,7
200,Ethan Coen,6
262,Humphrey Bogart,9


In [14]:
# 8. Directors whose movies average an IMDb rating strictly above 8.0.

mean_rating_by_director = df.groupby('Director')['IMDB_Rating'].mean()
df2 = mean_rating_by_director.reset_index()
df2.loc[df2['IMDB_Rating'].gt(8)]

Unnamed: 0,Director,IMDB_Rating
0,Aamir Khan,8.40
5,Adam Elliot,8.10
7,Aditya Chopra,8.10
8,Aditya Dhar,8.20
9,Akira Kurosawa,8.22
...,...,...
537,Yavuz Turgul,8.20
542,Yôjirô Takita,8.10
544,Zaza Urushadze,8.20
545,Zoya Akhtar,8.05


In [15]:
# 9. Release years whose average Gross exceeds $100 million.

mean_gross_by_year = df.groupby('Released_Year')['Gross'].mean()
df3 = mean_gross_by_year.reset_index()
# 100_000_000 is the $100 million threshold from the question.
df3.loc[df3['Gross'].gt(100_000_000)]

Unnamed: 0,Released_Year,Gross
0,1920,337574700.0
2,1922,445152000.0
7,1928,129861800.0
10,1932,219283000.0
11,1933,219662900.0
13,1935,103228400.0
16,1938,210141000.0
17,1939,225637400.0
20,1942,157930800.0
21,1943,123353300.0


### Advanced Grouping

In [17]:
# 10. Number of movies for every (Genre, Released_Year) combination.
# Code 1: count the rows in each group with size().
genre_year_groups = df.groupby(['Genre', 'Released_Year'])
genre_year_groups.size().reset_index(name='movie_count')

Unnamed: 0,Genre,Released_Year,movie_count
0,Action,1924,1
1,Action,1926,1
2,Action,1932,1
3,Action,1938,1
4,Action,1948,2
...,...,...,...
429,Thriller,1967,1
430,Western,1965,1
431,Western,1966,1
432,Western,1968,1


In [18]:
# Code 2: count the Series_Title values in each (Genre, Released_Year) group.
# count() ignores missing values, so this matches size() only when no title
# is missing.

titles_by_genre_year = df.groupby(['Genre', 'Released_Year'])['Series_Title']
titles_by_genre_year.count().reset_index(name='movie_count')

Unnamed: 0,Genre,Released_Year,movie_count
0,Action,1924,1
1,Action,1926,1
2,Action,1932,1
3,Action,1938,1
4,Action,1948,2
...,...,...,...
429,Thriller,1967,1
430,Western,1965,1
431,Western,1966,1
432,Western,1968,1


In [19]:
# 11. Percentage of movies in each genre compared to the total dataset.

total_movies = df.shape[0]
# value_counts() already tallies each genre, so grouping the column by itself
# first is unnecessary. sort_index() lists the genres alphabetically.
genre_counts = df['Genre'].value_counts().sort_index()
genre_counts * 100 / total_movies

Genre
Action       17.2
Adventure     7.2
Animation     8.2
Biography     8.8
Comedy       15.5
Crime        10.7
Drama        28.9
Family        0.2
Fantasy       0.2
Film-Noir     0.3
Horror        1.1
Mystery       1.2
Thriller      0.1
Western       0.4
Name: count, dtype: float64

In [20]:
# 12. Director with the highest total gross revenue.

total_gross_by_director = df.groupby('Director')['Gross'].sum()
# idxmax() returns the index label (the director) of the largest total
# directly, without sorting every director first.
total_gross_by_director.idxmax()

'Akira Kurosawa'

### Time Series Grouping (If Release Year is a Date Field)

In [22]:
# 13. Group movies by decade and find the average IMDb rating for each decade.

# Released_Year contains at least one non-numeric entry (the per-year counts
# include a 'PG' label); errors='coerce' turns such entries into NaN.
df['Released_Year'] = pd.to_numeric(df['Released_Year'], errors='coerce')
# Floor each year to the start of its decade. The nullable Int64 dtype keeps
# the labels integral (1990 rather than 1990.0) while still allowing a
# missing year.
df['Decade'] = ((df['Released_Year'] // 10) * 10).astype('Int64')
# groupby leaves out rows whose key is missing, so no dropna is needed. A bare
# `df['Released_Year'].dropna()` would only build a new Series and discard it.
df.groupby('Decade')['IMDB_Rating'].mean()

Decade
1920.0    8.127273
1930.0    7.966667
1940.0    8.025714
1950.0    8.058929
1960.0    7.973973
1970.0    7.969737
1980.0    7.953933
1990.0    7.963333
2000.0    7.899578
2010.0    7.924380
2020.0    8.133333
Name: IMDB_Rating, dtype: float64

In [23]:
# 14. Group by Year and find the trend of average movie duration over time.

# Parse the release year as a number. Entries that are not valid years become
# NaN, which is why the parsed values are floats rather than integers.
release_year = pd.to_numeric(df['Released_Year'], errors='coerce')

# Build the result from a derived frame instead of rebinding `df`, so
# re-running this cell never changes the data other cells see.
average_duration_by_year = (
    df.assign(Year=release_year)
      .dropna(subset=['Year'])      # discard rows without a usable year
      .groupby('Year')['Runtime']
      .mean()
)

average_duration_by_year

Year
1920.0     76.000000
1921.0     68.000000
1922.0     94.000000
1924.0     45.000000
1925.0     85.000000
             ...    
2016.0    123.642857
2017.0    121.590909
2018.0    128.105263
2019.0    132.130435
2020.0    126.666667
Name: Runtime, Length: 99, dtype: float64

In [24]:
# Read the deliveries table from disk (judging by the columns shown below,
# one row per ball bowled) and preview the first three rows.
delivery = pd.read_csv(filepath_or_buffer='data/deliveries.csv')
delivery.head(n=3)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,4,0,4,,,


In [25]:
# Batsman with the most sixes: keep the batsman name for every delivery that
# scored 6, then take the most frequent name (value_counts sorts descending).
six_hitters = delivery.loc[delivery['batsman_runs'] == 6, 'batsman']
six_hitters.value_counts().index[0]

'CH Gayle'

In [26]:
# Batsman with the most 4s and 6s in the last 5 overs.
## Process 1: build both conditions as boolean masks and apply them together.
after_over_15 = delivery['over'] > 15
is_boundary = delivery['batsman_runs'].isin([4, 6])
df = delivery[after_over_15 & is_boundary]
# The most frequent batsman among the remaining deliveries.
df['batsman'].value_counts().index[0]

'MS Dhoni'

In [27]:
# Process 2: same question, filtering in two steps.
# Start from the full `delivery` table rather than whatever `df` currently
# holds, so this cell is a genuinely independent alternative and does not
# rely on state left behind by another cell.
df = delivery[delivery['over'] > 15]
df[(df['batsman_runs'] == 4) | (df['batsman_runs'] == 6)]['batsman'].value_counts().index[0]

'MS Dhoni'

In [28]:
# V Kohli's record against all teams: total batsman_runs per bowling team.
is_kohli = delivery['batsman'] == 'V Kohli'
df = delivery[is_kohli]
runs_by_opponent = df.groupby('bowling_team')['batsman_runs'].sum()
# reset_index() turns bowling_team into an ordinary column for display.
runs_by_opponent.reset_index()

Unnamed: 0,bowling_team,batsman_runs
0,Chennai Super Kings,706
1,Deccan Chargers,306
2,Delhi Daredevils,661
3,Gujarat Lions,283
4,Kings XI Punjab,483
5,Kochi Tuskers Kerala,50
6,Kolkata Knight Riders,391
7,Mumbai Indians,447
8,Pune Warriors,128
9,Rajasthan Royals,258


In [29]:
# Highest single-match score, worked out here for one fixed batsman
# ('DA Warner') as the basis for a function that handles any batsman.
df = delivery[delivery['batsman'] == 'DA Warner']
runs_per_match = df.groupby('match_id')['batsman_runs'].sum()
# Sort the per-match totals from highest to lowest and take the first one.
runs_per_match.sort_values(ascending=False).iloc[0]

126

In [30]:
def highscore(batsman, deliveries=None):
    """Return the highest single-match run total scored by ``batsman``.

    Parameters
    ----------
    batsman : str
        Name compared for exact equality with the ``batsman`` column.
    deliveries : pandas.DataFrame, optional
        Table with ``match_id``, ``batsman`` and ``batsman_runs`` columns.
        When omitted, the notebook-level ``delivery`` frame is used.

    Returns
    -------
    The largest per-match sum of ``batsman_runs`` for the given batsman.

    Raises
    ------
    IndexError
        If no row matches ``batsman``. The exception type matches what an
        empty lookup raised before this check existed, so existing handlers
        keep working; the message now names the batsman.
    """
    if deliveries is None:
        deliveries = delivery
    batsman_rows = deliveries[deliveries['batsman'] == batsman]
    if batsman_rows.empty:
        raise IndexError(f"no deliveries found for batsman {batsman!r}")
    # Sum the runs within each match, then take the maximum directly rather
    # than sorting every match total to read off the first one.
    return batsman_rows.groupby('match_id')['batsman_runs'].sum().max()

# Example lookup for a single batsman.
highscore(batsman='NL McCullum')

15