# Exploratory Data Analysis

In [5]:
import pandas as pd
import seaborn as sns
import numpy as np 
import matplotlib.pyplot as plt

file_path = "../data/processed/transformed_data.csv"

df = pd.read_csv(file_path)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269731 entries, 0 to 269730
Data columns (total 16 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   id         269731 non-null  int64  
 1   name       269731 non-null  object 
 2   sex        269731 non-null  object 
 3   age        269731 non-null  float64
 4   height_cm  269731 non-null  float64
 5   weight_kg  269731 non-null  float64
 6   team       269731 non-null  object 
 7   noc        269731 non-null  object 
 8   games      269731 non-null  object 
 9   year       269731 non-null  int64  
 10  season     269731 non-null  object 
 11  city       269731 non-null  object 
 12  sport      269731 non-null  object 
 13  event      269731 non-null  object 
 14  medal      269731 non-null  object 
 15  country    269731 non-null  object 
dtypes: float64(3), int64(2), object(11)
memory usage: 32.9+ MB


In [7]:
# top_5_bottom_5_age_by_sport = pd.concat([age_by_sport.head(), age_by_sport.tail()])

# sns.barplot(top_5_bottom_5_age_by_sport)

In [8]:
df["year"].min()

np.int64(1896)

In [9]:
first_year_per_sport = df.groupby("sport").min("year")["year"]
last_year_per_sport = df.groupby("sport").max("year")["year"]
duration_per_sport = (last_year_per_sport - first_year_per_sport).sort_values(ascending=False)
duration_per_sport

# Some sports have only been played once
# Historic sports: 
    # Wrestling, Weightlifting, Shooting, Fencing Cycling, Gymnastics, Swimming, Athletics, etc
    # (Tennis not continuous, upon research)

# Idea: investigate Olympic record progression across historic sports 

sport
Wrestling        120
Weightlifting    120
Shooting         120
Tennis           120
Fencing          120
                ... 
Jeu de paume       0
Croquet            0
Cricket            0
Basque pelota      0
Aeronautics        0
Name: year, Length: 66, dtype: int64

In [10]:
years = df["year"].unique()

for year in range(1896, 2020, 4):
    if year not in years:
        print(f"Year missed: {year}")

for year in years:
    if year % 2 != 0:
        print(f"Odd year: {year}")

for i in range(1, len(years)):
    if years[i] - years[i - 1] != 4:
        print(f"Gap devation between: {years[i - 1]} and {years[i]}")

# The gap deviations are a mix of delayed/cancelled winter/summer Olympics and the eventual change of winter Olympics to 2 years apart from summer Olympics 

Year missed: 1916
Year missed: 1940
Year missed: 1944
Gap devation between: 1992 and 2012
Gap devation between: 2012 and 1920
Gap devation between: 1920 and 1900
Gap devation between: 1900 and 1988
Gap devation between: 1988 and 1994
Gap devation between: 1994 and 1932
Gap devation between: 1932 and 2002
Gap devation between: 2002 and 1952
Gap devation between: 1952 and 1980
Gap devation between: 1980 and 2000
Gap devation between: 2000 and 1996
Gap devation between: 1996 and 1912
Gap devation between: 1912 and 1924
Gap devation between: 1924 and 2014
Gap devation between: 2014 and 1948
Gap devation between: 1948 and 1998
Gap devation between: 1998 and 2006
Gap devation between: 2006 and 2008
Gap devation between: 2008 and 2016
Gap devation between: 2016 and 2004
Gap devation between: 2004 and 1960
Gap devation between: 1964 and 1984
Gap devation between: 1984 and 1968
Gap devation between: 1972 and 1936
Gap devation between: 1936 and 1956
Gap devation between: 1956 and 1928
Gap devati

In [11]:
df["season"].unique()

array(['Summer', 'Winter'], dtype=object)

In [12]:
sports = df["sport"].unique()
sports.sort()
sports


array(['Aeronautics', 'Alpine skiing', 'Alpinism', 'Archery',
       'Art competitions', 'Athletics', 'Badminton', 'Baseball',
       'Basketball', 'Basque pelota', 'Beach volleyball', 'Biathlon',
       'Bobsleigh', 'Boxing', 'Canoeing', 'Cricket', 'Croquet',
       'Cross country skiing', 'Curling', 'Cycling', 'Diving',
       'Equestrianism', 'Fencing', 'Figure skating', 'Football',
       'Freestyle skiing', 'Golf', 'Gymnastics', 'Handball', 'Hockey',
       'Ice hockey', 'Jeu de paume', 'Judo', 'Lacrosse', 'Luge',
       'Military ski patrol', 'Modern pentathlon', 'Motorboating',
       'Nordic combined', 'Polo', 'Racquets', 'Rhythmic gymnastics',
       'Roque', 'Rowing', 'Rugby', 'Rugby sevens', 'Sailing', 'Shooting',
       'Short track speed skating', 'Skeleton', 'Ski jumping',
       'Snowboarding', 'Softball', 'Speed skating', 'Swimming',
       'Synchronized swimming', 'Table tennis', 'Taekwondo', 'Tennis',
       'Trampolining', 'Triathlon', 'Tug-of-war', 'Volleyball',
   