# Data Cleaning For Our Main Dataset

In [1]:
import pandas as pd

### Import the relevant .csv files.

In [2]:
main_dataset = pd.read_csv("data/athlete_events.csv")
noc_dataset = pd.read_csv("data/noc_regions.csv")

Exploratory data analysis for main dataset.

In [3]:
main_dataset.shape

(271116, 15)

In [4]:
main_dataset.dtypes

ID          int64
Name       object
Sex        object
Age       float64
Height    float64
Weight    float64
Team       object
NOC        object
Games      object
Year        int64
Season     object
City       object
Sport      object
Event      object
Medal      object
dtype: object

In [5]:
main_dataset.describe()

Unnamed: 0,ID,Age,Height,Weight,Year
count,271116.0,261642.0,210945.0,208241.0,271116.0
mean,68248.954396,25.556898,175.33897,70.702393,1978.37848
std,39022.286345,6.393561,10.518462,14.34802,29.877632
min,1.0,10.0,127.0,25.0,1896.0
25%,34643.0,21.0,168.0,60.0,1960.0
50%,68205.0,24.0,175.0,70.0,1988.0
75%,102097.25,28.0,183.0,79.0,2002.0
max,135571.0,97.0,226.0,214.0,2016.0


### Clean the NOC regions dataset due to missing values in "region" (replace with values under "notes" instead).

In [6]:
null_rows = noc_dataset[noc_dataset["region"].isnull()]
print(null_rows, "\n\nMissing values replaced:\n")

for index in null_rows.index:
    noc_dataset["region"][index] = noc_dataset["notes"][index]
    print(noc_dataset.loc[index])

     NOC region                 notes
168  ROT    NaN  Refugee Olympic Team
208  TUV    NaN                Tuvalu
213  UNK    NaN               Unknown 

Missing values replaced:

NOC                        ROT
region    Refugee Olympic Team
notes     Refugee Olympic Team
Name: 168, dtype: object
NOC          TUV
region    Tuvalu
notes     Tuvalu
Name: 208, dtype: object
NOC           UNK
region    Unknown
notes     Unknown
Name: 213, dtype: object


### Add a country name column (use NOC regions dataset; code to name conversion), then rename the columns for readability.

In [7]:
main_dataset = main_dataset.merge(noc_dataset, left_on = "NOC", right_on = "NOC")
main_dataset = main_dataset.rename(columns = {"NOC": "CountryCode", "region": "CountryName"})
main_dataset.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,CountryCode,Games,Year,Season,City,Sport,Event,Medal,CountryName,notes
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,,China,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,China,
2,602,Abudoureheman,M,22.0,182.0,75.0,China,CHN,2000 Summer,2000,Summer,Sydney,Boxing,Boxing Men's Middleweight,,China,
3,1463,Ai Linuer,M,25.0,160.0,62.0,China,CHN,2004 Summer,2004,Summer,Athina,Wrestling,"Wrestling Men's Lightweight, Greco-Roman",,China,
4,1464,Ai Yanhan,F,14.0,168.0,54.0,China,CHN,2016 Summer,2016,Summer,Rio de Janeiro,Swimming,Swimming Women's 200 metres Freestyle,,China,


### Weed out the only columns we need.

In [8]:
main_dataset = main_dataset[["CountryCode", "CountryName", "Year", "Season", "Medal"]]
main_dataset.head()

Unnamed: 0,CountryCode,CountryName,Year,Season,Medal
0,CHN,China,1992,Summer,
1,CHN,China,2012,Summer,
2,CHN,China,2000,Summer,
3,CHN,China,2004,Summer,
4,CHN,China,2016,Summer,


### Check if there are any null values in the columns.

In [9]:
main_dataset.isnull().any()

CountryCode    False
CountryName    False
Year           False
Season         False
Medal           True
dtype: bool

##### Medal column has null values. We will be replacing it with the value 0.

In [10]:
main_dataset["Medal"].fillna(value = 0, inplace = True)
main_dataset.head()

Unnamed: 0,CountryCode,CountryName,Year,Season,Medal
0,CHN,China,1992,Summer,0
1,CHN,China,2012,Summer,0
2,CHN,China,2000,Summer,0
3,CHN,China,2004,Summer,0
4,CHN,China,2016,Summer,0


### Sort our dataset by year (numerical order), country (alphabetical order), and season (summer first, then winter), and reset indexing to 0.
Also there are some faulty country code values in the original dataset. This is not unexpected seeing as the csv file is over 270,000 rows long...

In [11]:
#change faulty country code values.
faulty_codes_dict = {"MAS": "MAL", "NBO": "MAL",
                    "TCH": "CZE",
                    "YUG": "SRB", "SCG": "SRB",
                    "TTO": "TRI", "WIF": "TRI",
                    "SAA": "GER", "GDR": "GER",
                    "URS": "RUS", "EUN": "RUS",
                    "VNM": "VIE",
                    "RHO": "ZIM",
                    "UAR": "SYR",
                    "YAR": "YEM", "YMD": "YEM",
                    "SSD": "SUD"}

main_dataset = main_dataset.replace(to_replace = faulty_codes_dict)

#sort by year and country
main_dataset = main_dataset.sort_values(by = ["Year", "CountryCode", "Season"])
main_dataset = main_dataset.reset_index(drop = True)
main_dataset.head()

Unnamed: 0,CountryCode,CountryName,Year,Season,Medal
0,AUS,Australia,1896,Summer,0
1,AUS,Australia,1896,Summer,Bronze
2,AUS,Australia,1896,Summer,Gold
3,AUS,Australia,1896,Summer,Gold
4,AUS,Australia,1896,Summer,0


In [13]:
main_dataset

Unnamed: 0,CountryCode,CountryName,Year,Season,Medal
0,AUS,Australia,1896,Summer,0
1,AUS,Australia,1896,Summer,Bronze
2,AUS,Australia,1896,Summer,Gold
3,AUS,Australia,1896,Summer,Gold
4,AUS,Australia,1896,Summer,0
5,AUT,Austria,1896,Summer,Silver
6,AUT,Austria,1896,Summer,Gold
7,AUT,Austria,1896,Summer,0
8,AUT,Austria,1896,Summer,Bronze
9,AUT,Austria,1896,Summer,0


Done cleaning main dataset.

# Create our 2 Ultimate Datasets: Summer and Winter

### Brainstorming
- Sort primarily by year, then secondarily by country. 
- A reminder that later than 1992, Summer and Winter olympics no longer play the same year. Instead, they alternate 2 years.

### Columns we need for Summer Dataset and Winter Dataset
1. Country Code
- Year
- Season of Olympics
- Total Participants
- Gold Medals
- Silver Medals
- Bronze Medals
- Total Medals
### Columns for Predictors; data only from 1960s and above

9. GDP Per Capita
- Population Size
- Host City
- Medal Count from Same Season's Games (Winter if Summer_Dataset, and Summer if Winter_Dataset)
- Medal Count from Previous Season's Games

### Importing relevant .csv files from our data cleaning notebook for external excel files (GDP Per Capita, Host City, Population Size)

In [14]:
summer_gdp_dataset = pd.read_csv("predictordata/summer_gdp_dataset.csv")
summer_pop_dataset = pd.read_csv("predictordata/summer_pop_dataset.csv")
summer_host_dataset = pd.read_csv("predictordata/summer_host_dataset.csv")
winter_gdp_dataset = pd.read_csv("predictordata/winter_gdp_dataset.csv")
winter_pop_dataset = pd.read_csv("predictordata/winter_pop_dataset.csv")
winter_host_dataset = pd.read_csv("predictordata/winter_host_dataset.csv")

### Create the 3 datasets.

In [32]:
olympics_cols = ["CountryCode", "Year", "OlympicsSeason", "TotalParticipants", "GoldMedals", "SilverMedals", "BronzeMedals", "TotalMedals", "Win%", "GDPPerCapita", "PopulationSize", "HostCity"]

summer_dataset = pd.DataFrame(columns = olympics_cols)
winter_dataset = pd.DataFrame(columns = olympics_cols)

summer_dataset.columns

Index(['CountryCode', 'Year', 'OlympicsSeason', 'TotalParticipants',
       'GoldMedals', 'SilverMedals', 'BronzeMedals', 'TotalMedals', 'Win%',
       'GDPPerCapita', 'PopulationSize', 'HostCity'],
      dtype='object')

### Get unique values for country and year and store them in arrays.

In [16]:
countries_list = main_dataset.CountryCode.unique()
countries_list.sort()
years_list = main_dataset.Year.unique()

print(countries_list, "\n")
print(years_list)

['AFG' 'AHO' 'ALB' 'ALG' 'AND' 'ANG' 'ANT' 'ANZ' 'ARG' 'ARM' 'ARU' 'ASA'
 'AUS' 'AUT' 'AZE' 'BAN' 'BAR' 'BDI' 'BEL' 'BEN' 'BHU' 'BIH' 'BIZ' 'BLR'
 'BOH' 'BOL' 'BOT' 'BRA' 'BRN' 'BRU' 'BUL' 'BUR' 'CAF' 'CAM' 'CAN' 'CAY'
 'CGO' 'CHA' 'CHI' 'CHN' 'CIV' 'CMR' 'COD' 'COK' 'COL' 'COM' 'CPV' 'CRC'
 'CRO' 'CRT' 'CUB' 'CYP' 'CZE' 'DEN' 'DJI' 'DMA' 'DOM' 'ECU' 'EGY' 'ERI'
 'ESA' 'ESP' 'EST' 'ETH' 'FIJ' 'FIN' 'FRA' 'FSM' 'GAB' 'GAM' 'GBR' 'GBS'
 'GEO' 'GEQ' 'GER' 'GHA' 'GRE' 'GRN' 'GUA' 'GUI' 'GUM' 'GUY' 'HAI' 'HKG'
 'HON' 'HUN' 'INA' 'IND' 'IOA' 'IRI' 'IRL' 'IRQ' 'ISL' 'ISR' 'ISV' 'ITA'
 'IVB' 'JAM' 'JOR' 'JPN' 'KAZ' 'KEN' 'KGZ' 'KIR' 'KOR' 'KOS' 'KSA' 'KUW'
 'LAO' 'LAT' 'LBA' 'LBR' 'LCA' 'LES' 'LIB' 'LIE' 'LTU' 'LUX' 'MAD' 'MAR'
 'MAS' 'MAW' 'MDA' 'MDV' 'MEX' 'MGL' 'MHL' 'MKD' 'MLI' 'MLT' 'MNE' 'MON'
 'MOZ' 'MRI' 'MTN' 'MYA' 'NAM' 'NCA' 'NED' 'NEP' 'NFL' 'NGR' 'NIG' 'NOR'
 'NRU' 'NZL' 'OMA' 'PAK' 'PAN' 'PAR' 'PER' 'PHI' 'PLE' 'PLW' 'PNG' 'POL'
 'POR' 'PRK' 'PUR' 'QAT' 'ROT' 'ROU' 'RSA' 'RUS' 'R

### Clean the main dataset for the summer dataset.

In [17]:
summer_main_dataset = main_dataset
summer_main_dataset = summer_main_dataset[summer_main_dataset.Season == "Summer"]
summer_main_dataset = summer_main_dataset.reset_index(drop = True)
summer_years_list = [year for year in years_list if year%4 == 0 or year == 1906]

print(summer_years_list)
summer_main_dataset.head()

[1896, 1900, 1904, 1906, 1908, 1912, 1920, 1924, 1928, 1932, 1936, 1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016]


Unnamed: 0,CountryCode,CountryName,Year,Season,Medal
0,AUS,Australia,1896,Summer,0
1,AUS,Australia,1896,Summer,Bronze
2,AUS,Australia,1896,Summer,Gold
3,AUS,Australia,1896,Summer,Gold
4,AUS,Australia,1896,Summer,0


# Making the summer dataset.

In [18]:
i = 0
j = 0

for year in summer_years_list:
    for country in countries_list:
        total_participants = 0
        gold = 0
        silver = 0
        bronze = 0

        try: 
            while summer_main_dataset["CountryCode"][i] == country:
                if summer_main_dataset["Medal"][i] == "Gold":
                    gold += 1
                elif summer_main_dataset["Medal"][i] == "Silver":
                    silver += 1
                elif summer_main_dataset["Medal"][i] == "Bronze":
                    bronze += 1
                total_participants += 1
                i += 1
        except KeyError:
            pass
            
        if total_participants == 0:
            continue
        
        summer_dict = {"CountryCode": country, 
                       "Year": year, 
                       "OlympicsSeason": (year-1896)//4 + 1, 
                       "TotalParticipants": total_participants, 
                       "GoldMedals": gold, 
                       "SilverMedals": silver,
                       "BronzeMedals": bronze, 
                       "TotalMedals": gold + silver + bronze,
                       "Win%": round((gold+silver+bronze)/total_participants*100, 2)}
        
        #predictor data available
        if year >= 1960:
            if country in ["AHO", "TPE", "COK", "IOA", "ERI", "KOS", "ROT"]: #countries with no predictor data
                continue
                
            if summer_host_dataset["CountryCode"][j] == country and summer_host_dataset["Year"][j] == year:
                is_host_city = 1
                j += 1
            else:
                is_host_city = 0
            
            summer_dict.update({"GDPPerCapita": summer_gdp_dataset.loc[summer_gdp_dataset["CountryCode"] == country, str(year)].iloc[0],
                                "PopulationSize": summer_pop_dataset.loc[summer_pop_dataset["CountryCode"] == country, str(year)].iloc[0],
                                "HostCity": is_host_city})
        
        summer_dataset = summer_dataset.append(summer_dict, ignore_index = True)

summer_dataset.head()

Unnamed: 0,CountryCode,Year,OlympicsSeason,TotalParticipants,GoldMedals,SilverMedals,BronzeMedals,TotalMedals,Win%,GDPPerCapita,PopulationSize,HostCity
0,AUS,1896,1,5,2,0,1,3,60.0,,,
1,AUT,1896,1,8,2,1,2,5,62.5,,,
2,DEN,1896,1,15,1,2,3,6,40.0,,,
3,FRA,1896,1,26,5,4,2,11,42.31,,,
4,GBR,1896,1,25,3,3,3,9,36.0,,,


### Clean the main dataset for the winter dataset.

In [19]:
winter_main_dataset = main_dataset
winter_main_dataset = winter_main_dataset[winter_main_dataset.Season == "Winter"]
winter_main_dataset = winter_main_dataset.reset_index(drop = True)
winter_years_list = [year for year in years_list if (1924<=year<=1992 and year%4==0) or (year>1992 and (year+2)%4==0)]

print(winter_years_list)
winter_main_dataset.head()

[1924, 1928, 1932, 1936, 1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1994, 1998, 2002, 2006, 2010, 2014]


Unnamed: 0,CountryCode,CountryName,Year,Season,Medal
0,AUS,Australia,1924,Winter,Gold
1,AUT,Austria,1924,Winter,Gold
2,AUT,Austria,1924,Winter,Silver
3,AUT,Austria,1924,Winter,Gold
4,AUT,Austria,1924,Winter,Gold


# Making the winter dataset.

In [20]:
i = 0
j = 0

for year in winter_years_list:
      
    for country in countries_list:
        total_participants = 0
        gold = 0
        silver = 0
        bronze = 0

        try:
            while winter_main_dataset["CountryCode"][i] == country:
                if winter_main_dataset["Medal"][i] == "Gold":
                    gold += 1
                elif winter_main_dataset["Medal"][i] == "Silver":
                    silver += 1
                elif winter_main_dataset["Medal"][i] == "Bronze":
                    bronze += 1
                total_participants += 1
                i += 1
        except KeyError:
            pass
        
        if total_participants == 0:
            continue
        
        winter_dict = {"CountryCode": country, 
                       "Year": year, 
                       "OlympicsSeason": (year-1924)//4 + 8, #standardize with summer olympics
                       "TotalParticipants": total_participants, 
                       "GoldMedals": gold, 
                       "SilverMedals": silver,
                       "BronzeMedals": bronze, 
                       "TotalMedals": gold + silver + bronze,
                       "Win%": round((gold+silver+bronze)/total_participants*100, 2)}
        
        #predictor data available
        if year >= 1960:
            if country in ["AHO", "TPE", "COK", "IOA", "ERI", "KOS", "ROT"]: #countries with no predictor data
                continue
                
            if winter_host_dataset["CountryCode"][j] == country and winter_host_dataset["Year"][j] == year:
                is_host_city = 1
                j += 1
            else:
                is_host_city = 0
            
            winter_dict.update({"GDPPerCapita": winter_gdp_dataset.loc[winter_gdp_dataset["CountryCode"] == country, str(year)].iloc[0],
                                "PopulationSize": winter_pop_dataset.loc[winter_pop_dataset["CountryCode"] == country, str(year)].iloc[0],
                                "HostCity": is_host_city})
            
        winter_dataset = winter_dataset.append(winter_dict, ignore_index = True)

winter_dataset.head()

Unnamed: 0,CountryCode,Year,OlympicsSeason,TotalParticipants,GoldMedals,SilverMedals,BronzeMedals,TotalMedals,Win%,GDPPerCapita,PopulationSize,HostCity
0,AUS,1924,8,1,1,0,0,1,100.0,,,
1,AUT,1924,8,4,3,1,0,4,100.0,,,
2,BEL,1924,8,32,0,0,5,5,15.62,,,
3,CAN,1924,8,17,9,0,0,9,52.94,,,
4,CZE,1924,8,31,0,0,0,0,0.0,,,


### For the summer dataset, add the medals earned in the previous season.

In [21]:
temp_summer_dataset = summer_dataset[["OlympicsSeason", "CountryCode", "TotalParticipants", 
                                      "GoldMedals", "SilverMedals", "BronzeMedals", "TotalMedals"]]
temp_summer_dataset = temp_summer_dataset.rename(columns = {"TotalParticipants": "PrevTotalParticipants",
                                                            "GoldMedals": "PrevGoldMedals",
                                                            "SilverMedals": "PrevSilverMedals",
                                                            "BronzeMedals": "PrevBronzeMedals",
                                                            "TotalMedals": "PrevTotalMedals"})
temp_summer_dataset["OlympicsSeason"] += 1

summer_dataset = pd.merge(summer_dataset, temp_summer_dataset, on = ["OlympicsSeason", "CountryCode"], how = "left")

summer_dataset.head()

Unnamed: 0,CountryCode,Year,OlympicsSeason,TotalParticipants,GoldMedals,SilverMedals,BronzeMedals,TotalMedals,Win%,GDPPerCapita,PopulationSize,HostCity,PrevTotalParticipants,PrevGoldMedals,PrevSilverMedals,PrevBronzeMedals,PrevTotalMedals
0,AUS,1896,1,5,2,0,1,3,60.0,,,,,,,,
1,AUT,1896,1,8,2,1,2,5,62.5,,,,,,,,
2,DEN,1896,1,15,1,2,3,6,40.0,,,,,,,,
3,FRA,1896,1,26,5,4,2,11,42.31,,,,,,,,
4,GBR,1896,1,25,3,3,3,9,36.0,,,,,,,,


### For the winter dataset, add the medals earned in the previous season.

In [22]:
temp_winter_dataset = winter_dataset[["OlympicsSeason", "CountryCode", "TotalParticipants", 
                                      "GoldMedals", "SilverMedals", "BronzeMedals", "TotalMedals"]]
temp_winter_dataset = temp_winter_dataset.rename(columns = {"TotalParticipants": "PrevTotalParticipants",
                                                            "GoldMedals": "PrevGoldMedals",
                                                            "SilverMedals": "PrevSilverMedals",
                                                            "BronzeMedals": "PrevBronzeMedals",
                                                            "TotalMedals": "PrevTotalMedals"})
temp_winter_dataset["OlympicsSeason"] += 1

winter_dataset = pd.merge(winter_dataset, temp_winter_dataset, on = ["OlympicsSeason", "CountryCode"], how = "left")

winter_dataset.head()

Unnamed: 0,CountryCode,Year,OlympicsSeason,TotalParticipants,GoldMedals,SilverMedals,BronzeMedals,TotalMedals,Win%,GDPPerCapita,PopulationSize,HostCity,PrevTotalParticipants,PrevGoldMedals,PrevSilverMedals,PrevBronzeMedals,PrevTotalMedals
0,AUS,1924,8,1,1,0,0,1,100.0,,,,,,,,
1,AUT,1924,8,4,3,1,0,4,100.0,,,,,,,,
2,BEL,1924,8,32,0,0,5,5,15.62,,,,,,,,
3,CAN,1924,8,17,9,0,0,9,52.94,,,,,,,,
4,CZE,1924,8,31,0,0,0,0,0.0,,,,,,,,


### For summer dataset, add the winter medals (gold, silver, bronze, total) earned in the same season.

In [23]:
temp_winter_dataset = winter_dataset[["OlympicsSeason", "CountryCode", "TotalParticipants", 
                                      "GoldMedals", "SilverMedals", "BronzeMedals", "TotalMedals"]]
temp_winter_dataset = temp_winter_dataset.rename(columns = {"TotalParticipants": "PrevWinterTotalParticipants",
                                                           "GoldMedals": "PrevWinterGoldMedals",
                                                           "SilverMedals": "PrevWinterSilverMedals",
                                                           "BronzeMedals": "PrevWinterBronzeMedals",
                                                           "TotalMedals": "PrevWinterTotalMedals"})

summer_dataset = pd.merge(summer_dataset, temp_winter_dataset, on = ["OlympicsSeason", "CountryCode"], how = "left")

print(summer_dataset.shape)
summer_dataset.head()

(2812, 22)


Unnamed: 0,CountryCode,Year,OlympicsSeason,TotalParticipants,GoldMedals,SilverMedals,BronzeMedals,TotalMedals,Win%,GDPPerCapita,...,PrevTotalParticipants,PrevGoldMedals,PrevSilverMedals,PrevBronzeMedals,PrevTotalMedals,PrevWinterTotalParticipants,PrevWinterGoldMedals,PrevWinterSilverMedals,PrevWinterBronzeMedals,PrevWinterTotalMedals
0,AUS,1896,1,5,2,0,1,3,60.0,,...,,,,,,,,,,
1,AUT,1896,1,8,2,1,2,5,62.5,,...,,,,,,,,,,
2,DEN,1896,1,15,1,2,3,6,40.0,,...,,,,,,,,,,
3,FRA,1896,1,26,5,4,2,11,42.31,,...,,,,,,,,,,
4,GBR,1896,1,25,3,3,3,9,36.0,,...,,,,,,,,,,


### For winter dataset, add the summer medals (gold, silver, bronze, total) earned in the same season.

In [24]:
temp_summer_dataset = summer_dataset[["OlympicsSeason", "CountryCode", "TotalParticipants", "GoldMedals", 
                                      "SilverMedals", "BronzeMedals", "TotalMedals"]]
temp_summer_dataset = temp_summer_dataset.rename(columns = {"TotalParticipants": "PrevSummerTotalParticipants",
                                                           "GoldMedals": "PrevSummerGoldMedals",
                                                           "SilverMedals": "PrevSummerSilverMedals",
                                                           "BronzeMedals": "PrevSummerBronzeMedals",
                                                           "TotalMedals": "PrevSummerTotalMedals"})

winter_dataset = pd.merge(winter_dataset, temp_summer_dataset, on = ["OlympicsSeason", "CountryCode"], how = "left")

print(winter_dataset.shape)
winter_dataset.head()

(1236, 22)


Unnamed: 0,CountryCode,Year,OlympicsSeason,TotalParticipants,GoldMedals,SilverMedals,BronzeMedals,TotalMedals,Win%,GDPPerCapita,...,PrevTotalParticipants,PrevGoldMedals,PrevSilverMedals,PrevBronzeMedals,PrevTotalMedals,PrevSummerTotalParticipants,PrevSummerGoldMedals,PrevSummerSilverMedals,PrevSummerBronzeMedals,PrevSummerTotalMedals
0,AUS,1924,8,1,1,0,0,1,100.0,,...,,,,,,60,3,5,2,10
1,AUT,1924,8,4,3,1,0,4,100.0,,...,,,,,,56,0,3,1,4
2,BEL,1924,8,32,0,0,5,5,15.62,,...,,,,,,234,3,30,6,39
3,CAN,1924,8,17,9,0,0,9,52.94,,...,,,,,,83,0,19,1,20
4,CZE,1924,8,31,0,0,0,0,0.0,,...,,,,,,242,1,4,5,10


### Again, check for missing values.

In [25]:
print("Summer Dataset Missing Values:", "\n\n", summer_dataset.isnull().any(), "\n")
print("Winter Dataset Missing Values:", "\n\n", winter_dataset.isnull().any())

Summer Dataset Missing Values: 

 CountryCode                    False
Year                           False
OlympicsSeason                 False
TotalParticipants              False
GoldMedals                     False
SilverMedals                   False
BronzeMedals                   False
TotalMedals                    False
Win%                           False
GDPPerCapita                    True
PopulationSize                  True
HostCity                        True
PrevTotalParticipants           True
PrevGoldMedals                  True
PrevSilverMedals                True
PrevBronzeMedals                True
PrevTotalMedals                 True
PrevWinterTotalParticipants     True
PrevWinterGoldMedals            True
PrevWinterSilverMedals          True
PrevWinterBronzeMedals          True
PrevWinterTotalMedals           True
dtype: bool 

Winter Dataset Missing Values: 

 CountryCode                    False
Year                           False
OlympicsSeason                

##### Fill in the NaN values for HostCity & all columns starting with "Prev", with 0.

In [26]:
summer_dataset.fillna(value = 0, inplace = True)
winter_dataset.fillna(value = 0, inplace = True)

##### Fill in the NaN values for GDP and Population Size, with the median values of the entire column.

In [27]:
summer_dataset["GDPPerCapita"].replace(0, summer_dataset["GDPPerCapita"].median(), inplace = True)
summer_dataset["PopulationSize"].replace(0, summer_dataset["PopulationSize"].median(), inplace = True)

winter_dataset["GDPPerCapita"].replace(0, winter_dataset["GDPPerCapita"].median(), inplace = True)
winter_dataset["PopulationSize"].replace(0, winter_dataset["PopulationSize"].median(), inplace = True)

### Import summer_dataset and winter_dataset as .csv files

In [28]:
summer_dataset.to_csv("summer_dataset.csv", index = 0)
winter_dataset.to_csv("winter_dataset.csv", index = 0)

# Final Summer Dataset

In [29]:
summer_dataset

Unnamed: 0,CountryCode,Year,OlympicsSeason,TotalParticipants,GoldMedals,SilverMedals,BronzeMedals,TotalMedals,Win%,GDPPerCapita,...,PrevTotalParticipants,PrevGoldMedals,PrevSilverMedals,PrevBronzeMedals,PrevTotalMedals,PrevWinterTotalParticipants,PrevWinterGoldMedals,PrevWinterSilverMedals,PrevWinterBronzeMedals,PrevWinterTotalMedals
0,AUS,1896,1,5,2,0,1,3,60.00,921.791327,...,0,0,0,0,0,0,0,0,0,0
1,AUT,1896,1,8,2,1,2,5,62.50,921.791327,...,0,0,0,0,0,0,0,0,0,0
2,DEN,1896,1,15,1,2,3,6,40.00,921.791327,...,0,0,0,0,0,0,0,0,0,0
3,FRA,1896,1,26,5,4,2,11,42.31,921.791327,...,0,0,0,0,0,0,0,0,0,0
4,GBR,1896,1,25,3,3,3,9,36.00,921.791327,...,0,0,0,0,0,0,0,0,0,0
5,GER,1896,1,94,25,5,2,32,34.04,921.791327,...,0,0,0,0,0,0,0,0,0,0
6,GRE,1896,1,148,10,18,20,48,32.43,921.791327,...,0,0,0,0,0,0,0,0,0,0
7,HUN,1896,1,18,2,1,3,6,33.33,921.791327,...,0,0,0,0,0,0,0,0,0,0
8,ITA,1896,1,1,0,0,0,0,0.00,921.791327,...,0,0,0,0,0,0,0,0,0,0
9,SUI,1896,1,8,1,2,0,3,37.50,921.791327,...,0,0,0,0,0,0,0,0,0,0


# Final Winter Dataset

In [31]:
winter_dataset

Unnamed: 0,CountryCode,Year,OlympicsSeason,TotalParticipants,GoldMedals,SilverMedals,BronzeMedals,TotalMedals,Win%,GDPPerCapita,...,PrevTotalParticipants,PrevGoldMedals,PrevSilverMedals,PrevBronzeMedals,PrevTotalMedals,PrevSummerTotalParticipants,PrevSummerGoldMedals,PrevSummerSilverMedals,PrevSummerBronzeMedals,PrevSummerTotalMedals
0,AUS,1924,8,1,1,0,0,1,100.00,3844.951150,...,0,0,0,0,0,60,3,5,2,10
1,AUT,1924,8,4,3,1,0,4,100.00,3844.951150,...,0,0,0,0,0,56,0,3,1,4
2,BEL,1924,8,32,0,0,5,5,15.62,3844.951150,...,0,0,0,0,0,234,3,30,6,39
3,CAN,1924,8,17,9,0,0,9,52.94,3844.951150,...,0,0,0,0,0,83,0,19,1,20
4,CZE,1924,8,31,0,0,0,0,0.00,3844.951150,...,0,0,0,0,0,242,1,4,5,10
5,FIN,1924,8,33,4,8,3,15,45.45,3844.951150,...,0,0,0,0,0,253,24,13,15,52
6,FRA,1924,8,68,0,0,10,10,14.71,3844.951150,...,0,0,0,0,0,636,37,51,22,110
7,GBR,1924,8,50,16,4,11,31,62.00,3844.951150,...,0,0,0,0,0,428,15,30,19,64
8,HUN,1924,8,6,0,0,0,0,0.00,3844.951150,...,0,0,0,0,0,140,2,12,8,22
9,IND,1924,8,7,7,0,0,7,100.00,3844.951150,...,0,0,0,0,0,24,0,0,0,0
