# Data Cleaning Final

In [1]:
import pandas as pd
import numpy as np

In [2]:
data_original = pd.read_csv("Dataset_Preparation.csv")
noc_to_countries = pd.read_csv("noc_regions.csv")

In [3]:
# only extract data after 1960 since the data before 1960 is too few
data = data_original[3220:]
data = data.reset_index(drop = True)
data = data.drop(columns = ["City Summer", "City Winter"])
data.head()

Unnamed: 0,NOCs,Years,Gold Summer,Silver Summer,Bronze Summer,Total Medals Summer,NaN Summer,Total Summer,Gold Winter,Silver Winter,Bronze Winter,Total Medals Winter,NaN Winter,Total Winter
0,AFG,1960,0,0,0,0,16,16,0,0,0,0,0,0
1,AHO,1960,0,0,0,0,5,5,0,0,0,0,0,0
2,ALB,1960,0,0,0,0,0,0,0,0,0,0,0,0
3,ALG,1960,0,0,0,0,0,0,0,0,0,0,0,0
4,AND,1960,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
summer = pd.DataFrame(data[["NOCs", "Years", "Gold Summer", "Silver Summer", "Bronze Summer", "Total Medals Summer", "NaN Summer", "Total Summer"]])
summer = pd.DataFrame(summer[summer["Years"].isin([1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016])])
summer = summer.reset_index(drop = True)
summer = summer.dropna()
# only these years hold summer olymnpics in the given dataset

In [5]:
print(summer.shape)
summer.head()

(3450, 8)


Unnamed: 0,NOCs,Years,Gold Summer,Silver Summer,Bronze Summer,Total Medals Summer,NaN Summer,Total Summer
0,AFG,1960,0,0,0,0,16,16
1,AHO,1960,0,0,0,0,5,5
2,ALB,1960,0,0,0,0,0,0
3,ALG,1960,0,0,0,0,0,0
4,AND,1960,0,0,0,0,0,0


In [6]:
winter = pd.DataFrame(data[["NOCs", "Years", "Gold Winter", "Silver Winter", "Bronze Winter", "Total Medals Winter", "NaN Winter", "Total Winter"]])
winter = pd.DataFrame(winter[winter["Years"].isin([1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1994, 1998, 2002, 2006, 2010, 2014])])
winter = winter.reset_index(drop = True)
winter = winter.dropna()
# only these years hold winter olymnpics in the given dataset

In [7]:
print(winter.shape)
winter.head()

(3450, 8)


Unnamed: 0,NOCs,Years,Gold Winter,Silver Winter,Bronze Winter,Total Medals Winter,NaN Winter,Total Winter
0,AFG,1960,0,0,0,0,0,0
1,AHO,1960,0,0,0,0,0,0
2,ALB,1960,0,0,0,0,0,0
3,ALG,1960,0,0,0,0,0,0
4,AND,1960,0,0,0,0,0,0


In [8]:
noc_to_countries = pd.read_csv("noc_regions.csv")
countries = list(noc_to_countries["region"])

# prepare regions for summer
def season_dataframe(season): # the input should be a string
    if season == "Summer":
        dataset = summer
    elif season == "Winter":
        dataset = winter
    season_year = list(dataset["Years"].unique())
    countries_season = countries * len(season_year)
    return pd.DataFrame(countries_season, columns = ["Country " + season])

# prepare regions for summer
#winter_year = list(winter["Years"].unique())
#countries_winter = countries * len(winter_year)
#countries_winter = pd.DataFrame(countries_winter, columns = ["Country Winter"])

countries_summer = season_dataframe("Summer")
countries_winter = season_dataframe("Winter")

In [9]:
# merge region into summer dataframe
summer = pd.concat([countries_summer, summer], axis = 1)

In [10]:
print(summer.shape)
summer.head()

(3450, 9)


Unnamed: 0,Country Summer,NOCs,Years,Gold Summer,Silver Summer,Bronze Summer,Total Medals Summer,NaN Summer,Total Summer
0,Afghanistan,AFG,1960,0,0,0,0,16,16
1,Curacao,AHO,1960,0,0,0,0,5,5
2,Albania,ALB,1960,0,0,0,0,0,0
3,Algeria,ALG,1960,0,0,0,0,0,0
4,Andorra,AND,1960,0,0,0,0,0,0


In [11]:
# merge region into winter dataframe
winter = pd.concat([countries_winter, winter], axis = 1)

In [12]:
winter.head()

Unnamed: 0,Country Winter,NOCs,Years,Gold Winter,Silver Winter,Bronze Winter,Total Medals Winter,NaN Winter,Total Winter
0,Afghanistan,AFG,1960,0,0,0,0,0,0
1,Curacao,AHO,1960,0,0,0,0,0,0
2,Albania,ALB,1960,0,0,0,0,0,0
3,Algeria,ALG,1960,0,0,0,0,0,0
4,Andorra,AND,1960,0,0,0,0,0,0


In [13]:
# make every countries repeat more times
import itertools
def country_repeat(season, dataset): # input should be a string and a dataframe
    #if season == "Summer":
    #    length = summer_year
    #elif season == "Winter":
    #    length = winter_year
    country = list(dataset["Country"].unique())
    country = list(itertools.chain.from_iterable(itertools.repeat(x, 230) for x in country)) # 230 countries in total
    df = pd.DataFrame(country)
    df.columns = ["Host Country " + season]
    return df

In [14]:
summer_host = pd.read_csv("summer_host_table.csv")
summer_host = summer_host[:-1] # truncate the last year which is 2020
summer_host_df = country_repeat("Summer", summer_host)

In [15]:
summer = pd.concat([summer_host_df, summer], axis = 1)

In [16]:
summer.shape

(3450, 10)

In [17]:
summer.head(10)

Unnamed: 0,Host Country Summer,Country Summer,NOCs,Years,Gold Summer,Silver Summer,Bronze Summer,Total Medals Summer,NaN Summer,Total Summer
0,Italy,Afghanistan,AFG,1960,0,0,0,0,16,16
1,Italy,Curacao,AHO,1960,0,0,0,0,5,5
2,Italy,Albania,ALB,1960,0,0,0,0,0,0
3,Italy,Algeria,ALG,1960,0,0,0,0,0,0
4,Italy,Andorra,AND,1960,0,0,0,0,0,0
5,Italy,Angola,ANG,1960,0,0,0,0,0,0
6,Italy,Antigua,ANT,1960,0,0,0,0,0,0
7,Italy,Australia,ANZ,1960,0,0,0,0,0,0
8,Italy,Argentina,ARG,1960,0,3,1,4,112,116
9,Italy,Armenia,ARM,1960,0,0,0,0,0,0


In [18]:
winter_host = pd.read_csv("winter_host_table.csv")
winter_host = winter_host[:-2] # truncate the last two rows which are 2018 and 2020
winter_host_df = country_repeat("Winter", winter_host)

In [19]:
winter = pd.concat([winter_host_df, winter], axis = 1)

In [20]:
print(winter.shape)
winter.head(20)

(3450, 10)


Unnamed: 0,Host Country Winter,Country Winter,NOCs,Years,Gold Winter,Silver Winter,Bronze Winter,Total Medals Winter,NaN Winter,Total Winter
0,United States,Afghanistan,AFG,1960,0,0,0,0,0,0
1,United States,Curacao,AHO,1960,0,0,0,0,0,0
2,United States,Albania,ALB,1960,0,0,0,0,0,0
3,United States,Algeria,ALG,1960,0,0,0,0,0,0
4,United States,Andorra,AND,1960,0,0,0,0,0,0
5,United States,Angola,ANG,1960,0,0,0,0,0,0
6,United States,Antigua,ANT,1960,0,0,0,0,0,0
7,United States,Australia,ANZ,1960,0,0,0,0,0,0
8,United States,Argentina,ARG,1960,0,0,0,0,14,14
9,United States,Armenia,ARM,1960,0,0,0,0,0,0


In [21]:
def pre_medals(season): #input should be a string
    if season == "Summer":
        dataset = summer
    elif season == "Winter":
        dataset = winter
    pre_season = pd.DataFrame(data_original[data_original["Years"] == 1956]["Total Medals " + season])
    pre_season.columns = ["Previous Year Medals " + season]
    temp = pd.DataFrame(dataset["Total Medals " + season][:-230])
    temp.columns = ["Previous Year Medals " + season] # so that the two dataframes can be concatenated
    pre_season = pd.concat([pre_season, temp], sort = True)
    pre_season = pre_season.reset_index(drop = True)
    return pre_season

In [22]:
pre_summer = pre_medals("Summer")

In [23]:
pre_summer.head(10)

Unnamed: 0,Previous Year Medals Summer
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,2
9,0


In [24]:
pre_winter = pre_medals("Winter")

In [25]:
pre_winter.head(10)

Unnamed: 0,Previous Year Medals Winter
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [26]:
summer = pd.concat([summer, pre_summer], axis = 1)

In [27]:
def host_or_not(season): #input should be a string:
    if season == "Summer":
        dataset = summer
    elif season == "Winter":
        dataset = winter
        
    season_host_y_list = []
    
    for i in range(len(dataset)):
        if dataset["Host Country " + season][i] == dataset["Country " + season][i]:
            season_host_y_list.append(True)
        else:
            season_host_y_list.append(False)
    return pd.DataFrame(season_host_y_list, columns = ["host_y"])

In [28]:
summer_host_y = host_or_not("Summer")

In [29]:
summer = pd.concat([summer, summer_host_y], axis = 1)

In [30]:
print(summer.shape)
summer.head(10)

(3450, 12)


Unnamed: 0,Host Country Summer,Country Summer,NOCs,Years,Gold Summer,Silver Summer,Bronze Summer,Total Medals Summer,NaN Summer,Total Summer,Previous Year Medals Summer,host_y
0,Italy,Afghanistan,AFG,1960,0,0,0,0,16,16,0,False
1,Italy,Curacao,AHO,1960,0,0,0,0,5,5,0,False
2,Italy,Albania,ALB,1960,0,0,0,0,0,0,0,False
3,Italy,Algeria,ALG,1960,0,0,0,0,0,0,0,False
4,Italy,Andorra,AND,1960,0,0,0,0,0,0,0,False
5,Italy,Angola,ANG,1960,0,0,0,0,0,0,0,False
6,Italy,Antigua,ANT,1960,0,0,0,0,0,0,0,False
7,Italy,Australia,ANZ,1960,0,0,0,0,0,0,0,False
8,Italy,Argentina,ARG,1960,0,3,1,4,112,116,2,False
9,Italy,Armenia,ARM,1960,0,0,0,0,0,0,0,False


summer.to_csv("Summer DataFrame.csv", index = 0)

In [31]:
winter = pd.concat([winter, pre_winter], axis = 1)

In [32]:
winter_host_y = host_or_not("Winter")

In [33]:
winter = pd.concat([winter, winter_host_y], axis = 1)

In [34]:
print(winter.shape)
winter.head(20)

(3450, 12)


Unnamed: 0,Host Country Winter,Country Winter,NOCs,Years,Gold Winter,Silver Winter,Bronze Winter,Total Medals Winter,NaN Winter,Total Winter,Previous Year Medals Winter,host_y
0,United States,Afghanistan,AFG,1960,0,0,0,0,0,0,0,False
1,United States,Curacao,AHO,1960,0,0,0,0,0,0,0,False
2,United States,Albania,ALB,1960,0,0,0,0,0,0,0,False
3,United States,Algeria,ALG,1960,0,0,0,0,0,0,0,False
4,United States,Andorra,AND,1960,0,0,0,0,0,0,0,False
5,United States,Angola,ANG,1960,0,0,0,0,0,0,0,False
6,United States,Antigua,ANT,1960,0,0,0,0,0,0,0,False
7,United States,Australia,ANZ,1960,0,0,0,0,0,0,0,False
8,United States,Argentina,ARG,1960,0,0,0,0,14,14,0,False
9,United States,Armenia,ARM,1960,0,0,0,0,0,0,0,False


winter.to_csv("Winter DataFrame.csv", index = 0)

### GDP

In [35]:
countryLst = pd.read_html("http://www.ibiblio.org/units/codes/country.htm")
countryData = pd.concat(countryLst, ignore_index=True)
countryData = countryData.rename(columns=countryData.iloc[0])
countryData = countryData.drop(countryData.index[0])
countryData = countryData[countryData["Country Name"] != "Country Name"]
countryData.head()

Unnamed: 0,Country Name,ISO 2-alpha,ISO 3-alpha,IANA Internet,UN Vehicle,IOC Olympic,UN/ISO numeric,ITU calling
1,AFGHANISTAN,AF,AFG,.af,AFG,AFG,4.0,93.0
2,ÅLAND ISLANDS,AX,ALA,.ax,,,248.0,
3,ALBANIA,AL,ALB,.al,AL,ALB,8.0,355.0
4,ALDERNEY,,,,GBA,,,
5,ALGERIA (El Djazaïr),DZ,DZA,.dz,DZ,ALG,12.0,213.0


In [36]:
# IOC Olympic to olympic dataset, ISO 3-alpha to GDP data set
NOC_GDP = pd.DataFrame(countryData[["ISO  3-alpha", "IOC  Olympic"]]) # Do note the default column name there’s double spacing between words
NOC_GDP.shape

(261, 2)

In [37]:
GDP_summer = pd.read_csv("summer_gdp_table.csv")
GDP_summer

Unnamed: 0,Country Name,Country Code,1960,1964,1968,1972,1976,1980,1984,1988,1992,1996,2000,2004,2008,2012,2016
0,Aruba,ABW,,,,,,,,9764.79,14046.50,16586.07,20620.70,22569.97,27086.04,24709.60,25251.64
1,Afghanistan,AFG,59.78,82.21,129.51,136.12,199.03,274.88,,,,,,216.71,370.38,648.51,549.58
2,Angola,AGO,,,,,,664.12,596.60,761.62,640.62,512.62,555.30,1248.40,4068.98,5102.49,3509.60
3,Albania,ALB,,,,,,,639.48,652.77,200.85,1009.98,1126.68,2373.58,4370.54,4247.61,4131.87
4,Andorra,AND,,,,4217.17,7152.38,12377.41,7728.91,14304.36,20547.71,19017.17,21936.53,38503.48,47785.66,38391.08,37231.82
5,Arab World,ARB,,,221.32,338.77,996.62,2041.46,1655.73,1458.76,2006.80,2217.42,2590.22,3119.16,6116.58,7465.14,6185.06
6,United Arab Emirates,ARE,,,,,29698.17,41826.00,31709.25,21907.60,25993.56,28615.58,33071.27,36161.17,45758.91,42086.69,38517.80
7,Argentina,ARG,,1166.32,1136.52,1401.49,1932.59,2738.28,2643.37,3969.33,6798.03,7683.57,7669.27,4251.57,8953.36,12969.71,12654.35
8,Armenia,ARM,,,,,,,,,369.63,504.06,622.74,1191.96,4010.03,3684.80,3605.74
9,American Samoa,ASM,,,,,,,,,,,,8639.31,9872.00,11660.33,11744.82


In [38]:
for i in range(len(GDP_summer)):
    if GDP_summer["Country Code"][i] in NOC_GDP["ISO  3-alpha"].tolist():
        GDP_summer.loc[i, "Country Code"] = NOC_GDP.loc[NOC_GDP["ISO  3-alpha"] == GDP_summer.loc[i, "Country Code"], "IOC  Olympic"].tolist()[0]
            #GDP_summer["Country Code"][0] in NOC_GDP["ISO  3-alpha"].tolist()
#if GDP_summer["Country Code"][0] in NOC_GDP["ISO  3-alpha"].tolist():
#    GDP_summer.loc[0, "Country Code"] = NOC_GDP.loc[NOC_GDP["ISO  3-alpha"] == GDP_summer.loc[0, "Country Code"], "IOC  Olympic"].tolist()[0]
#GDP_summer["Country Code"][0]
#temp = NOC_GDP.loc[NOC_GDP["ISO  3-alpha"] == GDP_summer.loc[0, "Country Code"], "IOC  Olympic"].tolist()
#temp[0]

In [39]:
GDP_summer

Unnamed: 0,Country Name,Country Code,1960,1964,1968,1972,1976,1980,1984,1988,1992,1996,2000,2004,2008,2012,2016
0,Aruba,ARU,,,,,,,,9764.79,14046.50,16586.07,20620.70,22569.97,27086.04,24709.60,25251.64
1,Afghanistan,AFG,59.78,82.21,129.51,136.12,199.03,274.88,,,,,,216.71,370.38,648.51,549.58
2,Angola,ANG,,,,,,664.12,596.60,761.62,640.62,512.62,555.30,1248.40,4068.98,5102.49,3509.60
3,Albania,ALB,,,,,,,639.48,652.77,200.85,1009.98,1126.68,2373.58,4370.54,4247.61,4131.87
4,Andorra,AND,,,,4217.17,7152.38,12377.41,7728.91,14304.36,20547.71,19017.17,21936.53,38503.48,47785.66,38391.08,37231.82
5,Arab World,ARB,,,221.32,338.77,996.62,2041.46,1655.73,1458.76,2006.80,2217.42,2590.22,3119.16,6116.58,7465.14,6185.06
6,United Arab Emirates,UAE,,,,,29698.17,41826.00,31709.25,21907.60,25993.56,28615.58,33071.27,36161.17,45758.91,42086.69,38517.80
7,Argentina,ARG,,1166.32,1136.52,1401.49,1932.59,2738.28,2643.37,3969.33,6798.03,7683.57,7669.27,4251.57,8953.36,12969.71,12654.35
8,Armenia,ARM,,,,,,,,,369.63,504.06,622.74,1191.96,4010.03,3684.80,3605.74
9,American Samoa,ASA,,,,,,,,,,,,8639.31,9872.00,11660.33,11744.82


In [40]:
GDP_summer.shape

(263, 17)

In [41]:
# extract the information of the countries that participates in the olympic games
# recall: countries = list(noc_to_countries["region"])
NOC = list(noc_to_countries["NOC"])
GDP_games_summer = pd.DataFrame(GDP_summer[GDP_summer["Country Code"].isin(NOC)])
GDP_games_summer = GDP_games_summer.set_index("Country Code")
print(GDP_games_summer.shape)
GDP_games_summer

(203, 16)


Unnamed: 0_level_0,Country Name,1960,1964,1968,1972,1976,1980,1984,1988,1992,1996,2000,2004,2008,2012,2016
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ARU,Aruba,,,,,,,,9764.79,14046.50,16586.07,20620.70,22569.97,27086.04,24709.60,25251.64
AFG,Afghanistan,59.78,82.21,129.51,136.12,199.03,274.88,,,,,,216.71,370.38,648.51,549.58
ANG,Angola,,,,,,664.12,596.60,761.62,640.62,512.62,555.30,1248.40,4068.98,5102.49,3509.60
ALB,Albania,,,,,,,639.48,652.77,200.85,1009.98,1126.68,2373.58,4370.54,4247.61,4131.87
AND,Andorra,,,,4217.17,7152.38,12377.41,7728.91,14304.36,20547.71,19017.17,21936.53,38503.48,47785.66,38391.08,37231.82
UAE,United Arab Emirates,,,,,29698.17,41826.00,31709.25,21907.60,25993.56,28615.58,33071.27,36161.17,45758.91,42086.69,38517.80
ARG,Argentina,,1166.32,1136.52,1401.49,1932.59,2738.28,2643.37,3969.33,6798.03,7683.57,7669.27,4251.57,8953.36,12969.71,12654.35
ARM,Armenia,,,,,,,,,369.63,504.06,622.74,1191.96,4010.03,3684.80,3605.74
ASA,American Samoa,,,,,,,,,,,,8639.31,9872.00,11660.33,11744.82
ANT,Antigua and Barbuda,,,,,,1789.59,2946.24,5944.67,7296.55,8379.58,9932.03,10419.31,14797.37,12517.56,14506.60


In [43]:
summer = pd.concat([index, summer], axis = 1)

In [45]:
print(summer.shape)
summer.head(10)

(3450, 12)


Unnamed: 0_level_0,Host Country Summer,Country Summer,NOCs,Years,Gold Summer,Silver Summer,Bronze Summer,Total Medals Summer,NaN Summer,Total Summer,Previous Year Medals Summer,host_y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AFG,Italy,Afghanistan,AFG,1960,0,0,0,0,16,16,0,False
AHO,Italy,Curacao,AHO,1960,0,0,0,0,5,5,0,False
ALB,Italy,Albania,ALB,1960,0,0,0,0,0,0,0,False
ALG,Italy,Algeria,ALG,1960,0,0,0,0,0,0,0,False
AND,Italy,Andorra,AND,1960,0,0,0,0,0,0,0,False
ANG,Italy,Angola,ANG,1960,0,0,0,0,0,0,0,False
ANT,Italy,Antigua,ANT,1960,0,0,0,0,0,0,0,False
ANZ,Italy,Australia,ANZ,1960,0,0,0,0,0,0,0,False
ARG,Italy,Argentina,ARG,1960,0,3,1,4,112,116,2,False
ARM,Italy,Armenia,ARM,1960,0,0,0,0,0,0,0,False


In [70]:
GDP_summer_column = pd.DataFrame(columns = ["GDP Summer", "Years"]) 
#list(GDP_summer.columns)[2:]
for year in list(GDP_games_summer.columns)[2:]:
    NOC1 = pd.DataFrame(list(GDP_games_summer.index))
    NOC2 = NOC1.copy()
    NOC1.columns = ["index"]
    NOC2.columns = ["NOCs"]
    Year = [year] * len(GDP_games_summer)
    Year = pd.DataFrame(Year)
    Year.columns = ["Years"]
    temp1 = pd.concat([NOC1, NOC2, Year], axis = 1)
    
    temp1 = temp1.set_index("index") # so that the two dataframe have the same index, easier to concatenate together
    summer_GDP = pd.DataFrame(GDP_games_summer[year])
    summer_GDP.columns = ["GDP Summer"]
    temp = pd.concat([summer_GDP, temp1], axis = 1)
    GDP_summer_column = pd.concat([GDP_summer_column, temp], sort = False)

In [75]:
print(GDP_summer_column.shape)
GDP_summer_column.head(10)

(2842, 3)


Unnamed: 0,GDP Summer,Years,NOCs
ARU,,1964,ARU
AFG,82.21,1964,AFG
ANG,,1964,ANG
ALB,,1964,ALB
AND,,1964,AND
UAE,,1964,UAE
ARG,1166.32,1964,ARG
ARM,,1964,ARM
ASA,,1964,ASA
ANT,,1964,ANT


# 将GDP_summer_column弄成和summer一样的size

In [76]:
yearblock = pd.DataFrame(GDP_summer_column[GDP_summer_column["Years"] == 1964])
yearblock

Unnamed: 0,GDP Summer,Years,NOCs


In [None]:
print(summer.shape)
summer

In [None]:
summer = pd.concat([summer, GDP_summer_column], axis = 1, sort = False)