#   Oscar Awards Analysis Project (working with my group)

# Part 1: Oscar Awards dataset ETL Procedures & Analysis

In [1]:
import pandas as pd
import numpy as np

# read in the the Oscar awards .csv indexed by year_film.
df = pd.read_csv('the_oscar_award.csv', index_col='year_film')   
                
df.head(5)

Unnamed: 0_level_0,year_ceremony,ceremony,category,name,film,winner
year_film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False


In [2]:
# display the last 5 records
df.tail(5)

Unnamed: 0_level_0,year_ceremony,ceremony,category,name,film,winner
year_film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022,2023,95,Writing (Original Screenplay),Written by Martin McDonagh,The Banshees of Inisherin,
2022,2023,95,Writing (Original Screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,
2022,2023,95,Writing (Original Screenplay),Written by Steven Spielberg & Tony Kushner,The Fabelmans,
2022,2023,95,Writing (Original Screenplay),Written by Todd Field,Tár,
2022,2023,95,Writing (Original Screenplay),Written by Ruben Östlund,Triangle of Sadness,


In [3]:
# describe the data, it contains 95 years of Oscar Awards.
df.describe()

Unnamed: 0,year_ceremony,ceremony
count,10759.0,10759.0
mean,1977.04861,49.073334
std,27.041631,26.999754
min,1928.0,1.0
25%,1953.0,25.0
50%,1976.0,48.0
75%,2001.0,73.0
max,2023.0,95.0


In [4]:
# Get the info of the complete Oscar dataset records. 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10759 entries, 1927 to 2022
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year_ceremony  10759 non-null  int64 
 1   ceremony       10759 non-null  int64 
 2   category       10759 non-null  object
 3   name           10756 non-null  object
 4   film           10449 non-null  object
 5   winner         10639 non-null  object
dtypes: int64(2), object(4)
memory usage: 588.4+ KB


In [5]:
# Locate the 430 missing values and see where they are coming from. 
# For the 310 film=NaN, these include categories such as Humanitarian, Writing and Engineering awards, 
# where no film was involved.
# for the 120 winner=NaN, the results from 2022 were not included. These rows can be dropped.

df.isnull().sum()

year_ceremony      0
ceremony           0
category           0
name               3
film             310
winner           120
dtype: int64

In [6]:
# display the winners where there is no film
df[df.isna().any(axis=1)]

Unnamed: 0_level_0,year_ceremony,ceremony,category,name,film,winner
year_film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1927,1928,1,ENGINEERING EFFECTS,Ralph Hammeras,,False
1927,1928,1,ENGINEERING EFFECTS,Nugent Slaughter,,False
1927,1928,1,WRITING (Title Writing),Joseph Farnham,,True
1927,1928,1,WRITING (Title Writing),"George Marion, Jr.",,False
1927,1928,1,SPECIAL AWARD,Warner Bros.,,True
...,...,...,...,...,...,...
2022,2023,95,Writing (Original Screenplay),Written by Martin McDonagh,The Banshees of Inisherin,
2022,2023,95,Writing (Original Screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,
2022,2023,95,Writing (Original Screenplay),Written by Steven Spielberg & Tony Kushner,The Fabelmans,
2022,2023,95,Writing (Original Screenplay),Written by Todd Field,Tár,


In [7]:
# Display the info before the columns are dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10759 entries, 1927 to 2022
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year_ceremony  10759 non-null  int64 
 1   ceremony       10759 non-null  int64 
 2   category       10759 non-null  object
 3   name           10756 non-null  object
 4   film           10449 non-null  object
 5   winner         10639 non-null  object
dtypes: int64(2), object(4)
memory usage: 588.4+ KB


In [8]:
# since there is no film related to certain awards, and 2022 columns had not been updated yet, these 430 rows will be dropped.
# file now conatins 10,329 records
df=df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10329 entries, 1927 to 2021
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year_ceremony  10329 non-null  int64 
 1   ceremony       10329 non-null  int64 
 2   category       10329 non-null  object
 3   name           10329 non-null  object
 4   film           10329 non-null  object
 5   winner         10329 non-null  object
dtypes: int64(2), object(4)
memory usage: 564.9+ KB


In [9]:
# Create a new attribute: film_and_year
df['filmy'] = df["film"] + " (" + df["year_ceremony"].astype(str) + ")"
df.head()

Unnamed: 0_level_0,year_ceremony,ceremony,category,name,film,winner,filmy
year_film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False,The Noose (1928)
1927,1928,1,ACTOR,Emil Jannings,The Last Command,True,The Last Command (1928)
1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False,A Ship Comes In (1928)
1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True,7th Heaven (1928)
1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False,Sadie Thompson (1928)


In [10]:
# First we need to use the Groupby to split the file in two based on "winner", winner = T or F....1 or 0.
df_outcome = df.groupby('winner')

In [11]:
# Create the new vaiable "dfw" of those where winner=1....the winners
dfw=df_outcome.get_group(1)
dfw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2125 entries, 1927 to 2021
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year_ceremony  2125 non-null   int64 
 1   ceremony       2125 non-null   int64 
 2   category       2125 non-null   object
 3   name           2125 non-null   object
 4   film           2125 non-null   object
 5   winner         2125 non-null   object
 6   filmy          2125 non-null   object
dtypes: int64(2), object(5)
memory usage: 132.8+ KB


# 1. Do the Academy Awards reflect the diversity of foreign films or are the #OscarsSoWhite? 

In [12]:
# there were 63 out of 95 years with Foreign Language Film (FLM). 66% of the years had a FLM.
dfw_flm=dfw[(dfw['category']=='FOREIGN LANGUAGE FILM')]
dfw_flm.head()

Unnamed: 0_level_0,year_ceremony,ceremony,category,name,film,winner,filmy
year_film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1956,1957,29,FOREIGN LANGUAGE FILM,"Italy; Dino De Laurentiis and Carlo Ponti, Pro...",La Strada,True,La Strada (1957)
1957,1958,30,FOREIGN LANGUAGE FILM,Italy,The Nights of Cabiria,True,The Nights of Cabiria (1958)
1958,1959,31,FOREIGN LANGUAGE FILM,France,My Uncle,True,My Uncle (1959)
1959,1960,32,FOREIGN LANGUAGE FILM,France,Black Orpheus,True,Black Orpheus (1960)
1960,1961,33,FOREIGN LANGUAGE FILM,Sweden,The Virgin Spring,True,The Virgin Spring (1961)


In [13]:
# display the winners of Foreign Language Film movies (FlM). 
# There were 63 FLM out of 95 years (66%) of these. It looks well distributed.
dfw_flm.groupby(['name']).count().sort_values('winner',ascending=False)

Unnamed: 0_level_0,year_ceremony,ceremony,category,film,winner,filmy
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Italy,10,10,10,10,10,10
France,9,9,9,9,9,9
Spain,4,4,4,4,4,4
Denmark,3,3,3,3,3,3
The Netherlands,3,3,3,3,3,3
Sweden,3,3,3,3,3,3
Union of Soviet Socialist Republics,3,3,3,3,3,3
Czechoslovakia,2,2,2,2,2,2
Austria,2,2,2,2,2,2
Germany,2,2,2,2,2,2


In [14]:
# display the count of winners of Foreign Language Film (dfw_flm). 
# There were 63 FLM out of 95 years (66%) of these. It looks well distributed.
dfw_flm.groupby(['name']).count().sort_values('winner',ascending=False).sum()

year_ceremony    63
ceremony         63
category         63
film             63
winner           63
filmy            63
dtype: int64

# 2. Which actor/actress has received the most awards overall or in a single year?

In [15]:
# only want records where the category is ACTOR or ACTRESS. There are 468 records.
df=df[df['category'].isin(['ACTOR',"ACTRESS"])]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 468 entries, 1927 to 1975
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year_ceremony  468 non-null    int64 
 1   ceremony       468 non-null    int64 
 2   category       468 non-null    object
 3   name           468 non-null    object
 4   film           468 non-null    object
 5   winner         468 non-null    object
 6   filmy          468 non-null    object
dtypes: int64(2), object(5)
memory usage: 29.2+ KB


In [16]:
# Summarize the awards/nominations of Actor/Actress winners for all 94 years. 
# Katherine Hepburn is the winner with 3.
name_oscars=df[['name','winner']].groupby(by='name').sum().sort_values(by='winner',ascending=False)
name_oscars['Awards']=name_oscars['winner'].apply(lambda x:int(x))
del name_oscars['winner']
name_oscars['Nominations']=df[['name','winner']].groupby(by='name').count()

print(name_oscars.head(15))

                     Awards  Nominations
name                                    
Katharine Hepburn         3           11
Gary Cooper               2            5
Marlon Brando             2            7
Ingrid Bergman            2            5
Olivia de Havilland       2            4
Bette Davis               2           11
Glenda Jackson            2            4
Vivien Leigh              2            2
Spencer Tracy             2            9
Elizabeth Taylor          2            5
Luise Rainer              2            2
Fredric March             2            5
Gene Hackman              1            1
Grace Kelly               1            1
Norma Shearer             1            6
