# Introduction
Update: In the analysis are now the data from the oscars 2024 included. Biggest change is now the inclusion of 'Oppenheimer' in a few - who has the most nominations and wins list.

# Load Dataset
Load the dataset using pandas and display the first few rows.

In [1]:
# Load Dataset
# %pip install pandas
# %pip install --upgrade pip

import pandas as pd

# Load the dataset
oscars = pd.read_csv('the_oscar_award.csv')

# Display the first few rows
oscars.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False


# Explore Dataset
Use pandas profiling or similar tools to explore the dataset and understand its structure.

In [8]:
# Explore Dataset
# %pip install dtale

import dtale

# Generate a profile report
d = dtale.show(oscars)
d  # This will display the dtale interface in your notebook



# Top Movies by Nominations
Analyze and display the top 20 movies with the most Oscar nominations.

In [28]:
# Top Movies by Nominations
top_movies_nominations = oscars.groupby(['film', 'year_ceremony']).agg(
    nominations=('film', 'count'),
    oscars_won=('winner', lambda x: x.sum() if x.dtype == 'bool' else 0)
).reset_index()

top_movies_nominations['percentage_won'] = round(
    (top_movies_nominations['oscars_won'] / top_movies_nominations['nominations']) * 100, 1
)

top_movies_nominations = top_movies_nominations.sort_values(
    by=['nominations', 'oscars_won'], ascending=[False, False]
).head(20)

top_movies_nominations

Unnamed: 0,film,year_ceremony,nominations,oscars_won,percentage_won
4734,Titanic,1998,14,11,78.6
254,All about Eve,1951,14,6,42.9
2035,La La Land,2017,14,6,42.9
1388,From Here to Eternity,1954,13,8,61.5
1486,Gone with the Wind,1940,13,8,61.5
2676,Oppenheimer,2024,13,7,53.8
3122,Shakespeare in Love,1999,13,7,53.8
817,Chicago,2003,13,6,46.2
1353,Forrest Gump,1995,13,6,46.2
2295,Mary Poppins,1965,13,5,38.5


# Top Movies by Wins
Analyze and display the top 20 movies with the most Oscar wins.

In [27]:
# Top Movies by Nominations
top_movies_nominations = oscars.groupby(['film', 'year_ceremony']).agg(
    nominations=('film', 'count'),
    oscars_won=('winner', lambda x: x.sum() if x.dtype == 'bool' else 0)
).reset_index()

top_movies_nominations['percentage_won'] = round(
    (top_movies_nominations['oscars_won'] / top_movies_nominations['nominations']) * 100, 1
)

top_movies_nominations = top_movies_nominations.sort_values(
    by=['nominations', 'oscars_won'], ascending=[False, False]
).head(20)

top_movies_nominations

top_movies_percentage = top_movies_nominations.sort_values(
    by=['percentage_won', 'nominations'], ascending=[False, False]
).head(10)

top_movies_percentage

Unnamed: 0,film,year_ceremony,nominations,oscars_won,percentage_won
507,Ben-Hur,1960,12,11,91.7
4734,Titanic,1998,14,11,78.6
3806,The English Patient,1997,12,9,75.0
2480,My Fair Lady,1965,12,8,66.7
2634,On the Waterfront,1955,12,8,66.7
1388,From Here to Eternity,1954,13,8,61.5
1486,Gone with the Wind,1940,13,8,61.5
980,Dances With Wolves,1991,12,7,58.3
3064,Schindler's List,1994,12,7,58.3
2676,Oppenheimer,2024,13,7,53.8


# Top Movies by Win Percentage
Analyze and display the top 20 movies with the highest win percentage.

In [26]:
# Top Movies by Win Percentage
top_movies_win_percentage = oscars.groupby(['film', 'year_ceremony']).agg(
    nominations=('film', 'count'),
    oscars_won=('winner', lambda x: x.sum() if x.dtype == 'bool' else 0)
).reset_index()

top_movies_win_percentage['percentage_won'] = round(
    (top_movies_win_percentage['oscars_won'] / top_movies_win_percentage['nominations']) * 100, 1
)

top_movies_win_percentage = top_movies_win_percentage.sort_values(
    by=['percentage_won', 'nominations'], ascending=[False, False]
).head(20)

top_movies_win_percentage

Unnamed: 0,film,year_ceremony,nominations,oscars_won,percentage_won
4120,The Lord of the Rings: The Return of the King,2004,11,11,100.0
1443,Gigi,1959,9,9,100.0
4062,The Last Emperor,1988,9,9,100.0
1851,It Happened One Night,1935,5,5,100.0
4176,The Matrix,2000,4,4,100.0
721,CODA,2022,3,3,100.0
1509,Grand Prix,1967,3,3,100.0
1944,Jurassic Park,1994,3,3,100.0
3612,The Bourne Ultimatum,2008,3,3,100.0
307,An Inconvenient Truth,2007,2,2,100.0


# Most Nominations Per Year
Analyze and display the movies with the most nominations per year.

In [21]:
# Most Nominations Per Year
most_nominations_per_year = oscars.groupby(['year_ceremony', 'film']).size().reset_index(name='nominations')

most_nominations_per_year = most_nominations_per_year.loc[
    most_nominations_per_year.groupby('year_ceremony')['nominations'].idxmax()
]

most_nominations_per_year

Unnamed: 0,year_ceremony,film,nominations
0,1928,7th Heaven,5
39,1929,The Patriot,5
62,1930,The Love Parade,5
69,1931,Cimarron,7
90,1932,Arrowsmith,4
...,...,...,...
4957,2021,Mank,10
5033,2022,The Power of the Dog,12
5061,2023,Everything Everywhere All at Once,11
5126,2024,Oppenheimer,13


# Most Wins Per Year
Analyze and display the movies with the most wins per year.

In [22]:
# Most Wins Per Year
most_wins_per_year = oscars[oscars['winner'] == True].groupby(['year_ceremony', 'film']).size().reset_index(name='wins')

most_wins_per_year = most_wins_per_year.loc[
    most_wins_per_year.groupby('year_ceremony')['wins'].idxmax()
]

most_wins_per_year

Unnamed: 0,year_ceremony,film,wins
0,1928,7th Heaven,3
7,1929,Coquette,1
14,1930,All Quiet on the Western Front,2
21,1931,Cimarron,3
26,1932,Bad Girl,2
...,...,...,...
1294,2021,Nomadland,3
1305,2022,Dune,6
1320,2023,Everything Everywhere All at Once,7
1335,2024,Oppenheimer,7


# Movies with the Least Wins
Analyze and display the movies with the least wins despite high nominations.

In [24]:
# Movies with the Least Wins
least_wins = oscars.groupby(['film', 'year_ceremony']).agg(
    nominations=('film', 'count'),
    oscars_won=('winner', lambda x: x.sum() if x.dtype == 'bool' else 0)
).reset_index()

least_wins['percentage_won'] = round(
    (least_wins['oscars_won'] / least_wins['nominations']) * 100, 1
)

least_wins = least_wins.sort_values(
    by=['percentage_won', 'nominations'], ascending=[True, False]
).head(20)

least_wins

Unnamed: 0,film,year_ceremony,nominations,oscars_won,percentage_won
3684,The Color Purple,1986,11,0,0.0
4569,The Turning Point,1978,11,0,0.0
286,American Hustle,2014,10,0,0.0
1410,Gangs of New York,2003,10,0,0.0
1969,Killers of the Flower Moon,2024,10,0,0.0
4018,The Irishman,2020,10,0,0.0
4825,True Grit,2011,10,0,0.0
2763,Peyton Place,1958,9,0,0.0
3548,The Banshees of Inisherin,2023,9,0,0.0
4097,The Little Foxes,1942,9,0,0.0


# Oscar Categories Analysis
Analyze the number of categories over the years and their evolution.

In [30]:
# Oscar Categories Analysis
categories_over_time = oscars.groupby('category').size().reset_index(name='count')

categories_over_time = categories_over_time.sort_values(by='count', ascending=False)

categories_over_time

Unnamed: 0,category,count
33,DIRECTING,469
42,FILM EDITING,450
2,ACTOR IN A SUPPORTING ROLE,440
5,ACTRESS IN A SUPPORTING ROLE,440
38,DOCUMENTARY (Short Subject),378
...,...,...
98,SPECIAL ACHIEVEMENT AWARD (Sound Effects),1
96,SPECIAL ACHIEVEMENT AWARD (Sound Editing),1
44,GORDON E. SAWYER AWARD,1
20,Best Documentary Short Film,1


# Actor/Actress Nominations
Analyze and display the actors/actresses with the most nominations.

In [31]:
# Actor/Actress Nominations
actor_nominations = oscars[oscars['category'].str.contains('ACTOR|ACTRESS', case=False)].groupby('name').agg(
    nominations=('name', 'count'),
    oscars_won=('winner', lambda x: x.sum() if x.dtype == 'bool' else 0)
).reset_index()

actor_nominations = actor_nominations.sort_values(
    by=['nominations', 'oscars_won'], ascending=[False, False]
).head(20)

actor_nominations

Unnamed: 0,name,nominations,oscars_won
681,Meryl Streep,21,3
531,Katharine Hepburn,12,4
397,Jack Nicholson,12,3
93,Bette Davis,11,2
218,Denzel Washington,9,2
892,Spencer Tracy,9,2
8,Al Pacino,9,1
568,Laurence Olivier,9,1
752,Paul Newman,9,1
141,Cate Blanchett,8,2


# Actor/Actress Wins
Analyze and display the actors/actresses with the most wins.

In [32]:
# Actor/Actress Wins
actor_wins = oscars[oscars['category'].str.contains('ACTOR|ACTRESS', case=False)].groupby('name').agg(
    nominations=('name', 'count'),
    oscars_won=('winner', lambda x: x.sum() if x.dtype == 'bool' else 0)
).reset_index()

actor_wins = actor_wins.sort_values(
    by=['oscars_won', 'nominations'], ascending=[False, False]
).head(20)

actor_wins

Unnamed: 0,name,nominations,oscars_won
531,Katharine Hepburn,12,4
681,Meryl Streep,21,3
397,Jack Nicholson,12,3
386,Ingrid Bergman,7,3
294,Frances McDormand,6,3
969,Walter Brennan,4,3
93,Bette Davis,11,2
218,Denzel Washington,9,2
892,Spencer Tracy,9,2
141,Cate Blanchett,8,2


# Directors Nominations and wins
Analyze and display the directors with the most nominations and wins.

In [35]:
# Director Nominations
director_nominations = oscars[oscars['category'].str.contains('DIRECTING (Comedy Picture)|DIRECTING (Dramatic Picture)|DIRECTING|Best Director', case=False)].groupby('name').agg(
    nominations=('name', 'count'),
    oscars_won=('winner', lambda x: x.sum() if x.dtype == 'bool' else 0)
).reset_index()

director_nominations = director_nominations.sort_values(
    by=['nominations', 'oscars_won'], ascending=[False, False]
).head(20)

director_nominations


This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.



Unnamed: 0,name,nominations,oscars_won
253,William Wyler,12,3
152,Martin Scorsese,10,1
227,Steven Spielberg,9,2
24,Billy Wilder,8,2
47,David Lean,7,2
68,Fred Zinnemann,7,2
255,Woody Allen,7,1
64,Frank Capra,6,3
113,John Ford,5,4
54,Elia Kazan,5,2


In [36]:
# directors Wins
director_wins = oscars[oscars['category'].str.contains('DIRECTING (Comedy Picture)|DIRECTING (Dramatic Picture)|DIRECTING|Best Director', case=False)].groupby('name').agg(
    nominations=('name', 'count'),
    oscars_won=('winner', lambda x: x.sum() if x.dtype == 'bool' else 0)
).reset_index()

director_wins = director_wins.sort_values(
    by=['oscars_won', 'nominations'], ascending=[False, False]
).head(20)

director_wins


This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.



Unnamed: 0,name,nominations,oscars_won
113,John Ford,5,4
253,William Wyler,12,3
64,Frank Capra,6,3
227,Steven Spielberg,9,2
24,Billy Wilder,8,2
47,David Lean,7,2
68,Fred Zinnemann,7,2
54,Elia Kazan,5,2
75,George Stevens,5,2
38,Clint Eastwood,4,2
