In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Numpy

In [2]:
arr = np.array([1, 2, 3, 4, 5])
arr[0]

np.int64(1)

In [3]:
array1 = np.array([[1, 2, 3], [4, 5, 6]])
array2 = np.array([[7, 8, 9], [10, 11, 12]])

print(array1[0, 1])
print(array1[1])

2
[4 5 6]


In [4]:
array3 = array1 * array2
print(array3)

[[ 7 16 27]
 [40 55 72]]


## Series

In [5]:
series1 = pd.Series([1, 2, 3, 4, 5])
series1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
series2 = pd.Series({'a': 1, 'b': 2, 'c': 3})
series2

a    1
b    2
c    3
dtype: int64

In [7]:
# Series methods
print(series1.mean())
print(series1.sum())
print(series1.max())
print(series1.min())
print(series1.std())

3.0
15
5
1
1.5811388300841898


In [8]:
series1.describe()

count    5.000000
mean     3.000000
std      1.581139
min      1.000000
25%      2.000000
50%      3.000000
75%      4.000000
max      5.000000
dtype: float64

In [9]:
series1.name = 'My Series'
series1.index = ['a', 'b', 'c', 'd', 'e']
print(series1)

a    1
b    2
c    3
d    4
e    5
Name: My Series, dtype: int64


## DataFrame

In [10]:
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9],
    'D': [10, 11, 12]
})

df

Unnamed: 0,A,B,C,D
0,1,4,7,10
1,2,5,8,11
2,3,6,9,12


In [11]:
df_from_series = pd.DataFrame([series1, series2])
print(df_from_series)

             a    b    c    d    e
My Series  1.0  2.0  3.0  4.0  5.0
Unnamed 0  1.0  2.0  3.0  NaN  NaN


In [12]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [13]:
df.shape

(3, 4)

## Athletes Event

In [14]:
# Read a CSV file
df = pd.read_csv('athlete_events.csv', index_col='ID')
# df = pd.read_excel('athlete_events.xlsx')

### Data inspection



In [15]:
df

Unnamed: 0_level_0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135569,Andrzej ya,M,29.0,179.0,89.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,
135570,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",
135570,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",
135571,Tomasz Ireneusz ya,M,30.0,185.0,96.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,


In [16]:
df.head()  # Display the first few rows of the DataFrame

Unnamed: 0_level_0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [17]:
df.tail(3)  # Display the last few rows of the DataFrame

Unnamed: 0_level_0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
135570,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",
135571,Tomasz Ireneusz ya,M,30.0,185.0,96.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,
135571,Tomasz Ireneusz ya,M,34.0,185.0,96.0,Poland,POL,2002 Winter,2002,Winter,Salt Lake City,Bobsleigh,Bobsleigh Men's Four,


In [18]:
df.sample(3)

Unnamed: 0_level_0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
78762,Mary Mensah (-Afriyie),F,21.0,165.0,50.0,Ghana,GHA,1984 Summer,1984,Summer,Los Angeles,Athletics,Athletics Women's 4 x 400 metres Relay,
131233,"Leopold ""Poldl"" Wohlrab",M,24.0,,,Austria,AUT,1936 Summer,1936,Summer,Berlin,Handball,Handball Men's Handball,Silver
128600,Wang Xiaoyan,F,20.0,156.0,45.0,China,CHN,1988 Summer,1988,Summer,Seoul,Gymnastics,Gymnastics Women's Team All-Around,


In [19]:
df.shape

(271116, 14)

In [20]:
df.columns

Index(['Name', 'Sex', 'Age', 'Height', 'Weight', 'Team', 'NOC', 'Games',
       'Year', 'Season', 'City', 'Sport', 'Event', 'Medal'],
      dtype='object')

In [21]:
df.describe()  # Get a summary of the DataFrame

Unnamed: 0,Age,Height,Weight,Year
count,261642.0,210945.0,208241.0,271116.0
mean,25.556898,175.33897,70.702393,1978.37848
std,6.393561,10.518462,14.34802,29.877632
min,10.0,127.0,25.0,1896.0
25%,21.0,168.0,60.0,1960.0
50%,24.0,175.0,70.0,1988.0
75%,28.0,183.0,79.0,2002.0
max,97.0,226.0,214.0,2016.0


In [22]:
df.value_counts()

Name                                  Sex  Age   Height  Weight  Team           NOC  Games        Year  Season  City         Sport                 Event                                       Medal 
A. Joshua "Josh" West                 M    31.0  207.0   105.0   Great Britain  GBR  2008 Summer  2008  Summer  Beijing      Rowing                Rowing Men's Coxed Eights                   Silver    1
Nicole Elaine "Nikki" Hudson (Mott-)  F    24.0  165.0   54.0    Australia      AUS  2000 Summer  2000  Summer  Sydney       Hockey                Hockey Women's Hockey                       Gold      1
Nicole Marie Davis                    F    30.0  167.0   66.0    United States  USA  2012 Summer  2012  Summer  London       Volleyball            Volleyball Women's Volleyball               Silver    1
                                           26.0  167.0   66.0    United States  USA  2008 Summer  2008  Summer  Beijing      Volleyball            Volleyball Women's Volleyball               Si

### Indexing

In [23]:
df['Age']

ID
1         24.0
2         23.0
3         24.0
4         34.0
5         21.0
          ... 
135569    29.0
135570    27.0
135570    27.0
135571    30.0
135571    34.0
Name: Age, Length: 271116, dtype: float64

In [24]:
df[['Name', 'Age', 'Height']]

Unnamed: 0_level_0,Name,Age,Height
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,A Dijiang,24.0,180.0
2,A Lamusi,23.0,170.0
3,Gunnar Nielsen Aaby,24.0,
4,Edgar Lindenau Aabye,34.0,
5,Christine Jacoba Aaftink,21.0,185.0
...,...,...,...
135569,Andrzej ya,29.0,179.0
135570,Piotr ya,27.0,176.0
135570,Piotr ya,27.0,176.0
135571,Tomasz Ireneusz ya,30.0,185.0


 ### Label-based indexing

In [25]:
df = pd.read_csv('athlete_events.csv', index_col='Name')

In [26]:
df.loc['A Dijiang'] # Access a specific row by index label

ID                                  1
Sex                                 M
Age                              24.0
Height                          180.0
Weight                           80.0
Team                            China
NOC                               CHN
Games                     1992 Summer
Year                             1992
Season                         Summer
City                        Barcelona
Sport                      Basketball
Event     Basketball Men's Basketball
Medal                             NaN
Name: A Dijiang, dtype: object

In [27]:
df.loc[['A Dijiang', 'A Lamusi']]  # Access multiple rows by index labels

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A Dijiang,1,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
A Lamusi,2,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,


In [28]:
df.loc['A Dijiang':'Edgar Lindenau Aabye']  # Access a range of rows by index labels

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A Dijiang,1,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
A Lamusi,2,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
Gunnar Nielsen Aaby,3,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
Edgar Lindenau Aabye,4,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold


In [29]:
df.loc[['A Dijiang'], 'Age': 'Team']  # Access specific columns for a specific row

Unnamed: 0_level_0,Age,Height,Weight,Team
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A Dijiang,24.0,180.0,80.0,China


In [30]:
df.index

Index(['A Dijiang', 'A Lamusi', 'Gunnar Nielsen Aaby', 'Edgar Lindenau Aabye',
       'Christine Jacoba Aaftink', 'Christine Jacoba Aaftink',
       'Christine Jacoba Aaftink', 'Christine Jacoba Aaftink',
       'Christine Jacoba Aaftink', 'Christine Jacoba Aaftink',
       ...
       'Fernando scar Zylberberg', 'James Francis "Jim" Zylker',
       'Aleksandr Viktorovich Zyuzin', 'Aleksandr Viktorovich Zyuzin',
       'Olga Igorevna Zyuzkova', 'Andrzej ya', 'Piotr ya', 'Piotr ya',
       'Tomasz Ireneusz ya', 'Tomasz Ireneusz ya'],
      dtype='object', name='Name', length=271116)

In [31]:
df.index.str.contains(' ya') # Filter rows where index contains ' ya'

array([False, False, False, ...,  True,  True,  True], shape=(271116,))

In [32]:
df.index.str.contains(' ya').sum() # Count how many index labels contain ' ya'

np.int64(6)

In [33]:
df.loc[df.index.str.contains(' ya')].head() # Filter rows based on index label condition

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Jrme yana,33566,M,23.0,185.0,80.0,France,FRA,2000 Summer,2000,Summer,Sydney,Athletics,Athletics Men's 4 x 100 metres Relay,
Andrzej ya,135569,M,29.0,179.0,89.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,
Piotr ya,135570,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",
Piotr ya,135570,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",
Tomasz Ireneusz ya,135571,M,30.0,185.0,96.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,


In [34]:
df.loc[df.index.str.contains(' ya') & ~df.index.str.contains('yana')] # Filter rows where index contains ' ya' but not 'yana'

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Andrzej ya,135569,M,29.0,179.0,89.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,
Piotr ya,135570,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",
Piotr ya,135570,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",
Tomasz Ireneusz ya,135571,M,30.0,185.0,96.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,
Tomasz Ireneusz ya,135571,M,34.0,185.0,96.0,Poland,POL,2002 Winter,2002,Winter,Salt Lake City,Bobsleigh,Bobsleigh Men's Four,


### Integer-based indexing

In [35]:
df.iloc[0]  # Access the first row by integer index

ID                                  1
Sex                                 M
Age                              24.0
Height                          180.0
Weight                           80.0
Team                            China
NOC                               CHN
Games                     1992 Summer
Year                             1992
Season                         Summer
City                        Barcelona
Sport                      Basketball
Event     Basketball Men's Basketball
Medal                             NaN
Name: A Dijiang, dtype: object

In [36]:
df.iloc[0, 0]  # Access the first element by integer index

np.int64(1)

In [37]:
df.iloc[0:3]  # Access the first three rows by integer index

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A Dijiang,1,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
A Lamusi,2,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
Gunnar Nielsen Aaby,3,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,


In [38]:
df.iloc[:, :1] # Access the first column by integer index

Unnamed: 0_level_0,ID
Name,Unnamed: 1_level_1
A Dijiang,1
A Lamusi,2
Gunnar Nielsen Aaby,3
Edgar Lindenau Aabye,4
Christine Jacoba Aaftink,5
...,...
Andrzej ya,135569
Piotr ya,135570
Piotr ya,135570
Tomasz Ireneusz ya,135571


### Boolean indexing

In [39]:
df[df['Age'] > 30]  # Filter rows where Age is greater than 30

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Edgar Lindenau Aabye,4,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
Per Knut Aaland,6,M,31.0,188.0,75.0,United States,USA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 10 kilometres,
Per Knut Aaland,6,M,31.0,188.0,75.0,United States,USA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 50 kilometres,
Per Knut Aaland,6,M,31.0,188.0,75.0,United States,USA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 10/15 kilometres Pu...,
Per Knut Aaland,6,M,31.0,188.0,75.0,United States,USA,1992 Winter,1992,Winter,Albertville,Cross Country Skiing,Cross Country Skiing Men's 4 x 10 kilometres R...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ukasz Tomasz ygado,135558,M,32.0,200.0,89.0,Poland,POL,2012 Summer,2012,Summer,London,Volleyball,Volleyball Men's Volleyball,
Pawe Jan Zygmunt,135559,M,33.0,182.0,79.0,Poland,POL,2006 Winter,2006,Winter,Torino,Speed Skating,"Speed Skating Men's 5,000 metres",
Stavroula Zygouri,135560,F,36.0,171.0,63.0,Greece,GRE,2004 Summer,2004,Summer,Athina,Wrestling,"Wrestling Women's Middleweight, Freestyle",
Olga Igorevna Zyuzkova,135568,F,33.0,171.0,69.0,Belarus,BLR,2016 Summer,2016,Summer,Rio de Janeiro,Basketball,Basketball Women's Basketball,


In [40]:
df[df['Age'] == 30]  # Filter rows where Age is equal to 30

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Minna Maarit Aalto,13,F,30.0,159.0,55.5,Finland,FIN,1996 Summer,1996,Summer,Atlanta,Sailing,Sailing Women's Windsurfer,
Arvo Ossian Aaltonen,15,M,30.0,,,Finland,FIN,1920 Summer,1920,Summer,Antwerpen,Swimming,Swimming Men's 200 metres Breaststroke,Bronze
Arvo Ossian Aaltonen,15,M,30.0,,,Finland,FIN,1920 Summer,1920,Summer,Antwerpen,Swimming,Swimming Men's 400 metres Breaststroke,Bronze
Kjetil Andr Aamodt,20,M,30.0,176.0,85.0,Norway,NOR,2002 Winter,2002,Winter,Salt Lake City,Alpine Skiing,Alpine Skiing Men's Downhill,
Kjetil Andr Aamodt,20,M,30.0,176.0,85.0,Norway,NOR,2002 Winter,2002,Winter,Salt Lake City,Alpine Skiing,Alpine Skiing Men's Super G,Gold
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Jules Alexis ""Louis"" Zutter",135481,M,30.0,,,Switzerland,SUI,1896 Summer,1896,Summer,Athina,Gymnastics,Gymnastics Men's Parallel Bars,Silver
"Jules Alexis ""Louis"" Zutter",135481,M,30.0,,,Switzerland,SUI,1896 Summer,1896,Summer,Athina,Gymnastics,Gymnastics Men's Horizontal Bar,
"Jules Alexis ""Louis"" Zutter",135481,M,30.0,,,Switzerland,SUI,1896 Summer,1896,Summer,Athina,Gymnastics,Gymnastics Men's Pommelled Horse,Gold
Claudia Antoinette Zwiers,135535,F,30.0,181.0,78.0,Netherlands,NED,2004 Summer,2004,Summer,Athina,Judo,Judo Women's Half-Heavyweight,


In [41]:
df[(df['Age'] > 30) & (df['Team'] == 'Finland')]  # Filter rows where Age is greater than 30 and Team is 'Finland'

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Jyri Tapani Aalto,12,M,31.0,172.0,70.0,Finland,FIN,2000 Summer,2000,Summer,Sydney,Badminton,Badminton Men's Singles,
Minna Maarit Aalto,13,F,34.0,159.0,55.5,Finland,FIN,2000 Summer,2000,Summer,Sydney,Sailing,Sailing Women's Windsurfer,
Pirjo Hannele Aalto (Mattila-),14,F,32.0,171.0,65.0,Finland,FIN,1994 Winter,1994,Winter,Lillehammer,Biathlon,Biathlon Women's 7.5 kilometres Sprint,
Arvo Ossian Aaltonen,15,M,34.0,,,Finland,FIN,1924 Summer,1924,Summer,Paris,Swimming,Swimming Men's 200 metres Breaststroke,
Paavo Johannes Aaltonen,17,M,32.0,175.0,64.0,Finland,FIN,1952 Summer,1952,Summer,Helsinki,Gymnastics,Gymnastics Men's Individual All-Around,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vilho Ilmari Ylnen,133021,M,38.0,169.0,67.0,Finland,FIN,1956 Summer,1956,Summer,Melbourne,Shooting,"Shooting Men's Small-Bore Rifle, Prone, 50 metres",
Vilho Ilmari Ylnen,133021,M,42.0,169.0,67.0,Finland,FIN,1960 Summer,1960,Summer,Roma,Shooting,"Shooting Men's Free Rifle, Three Positions, 30...",
Vilho Ilmari Ylnen,133021,M,42.0,169.0,67.0,Finland,FIN,1960 Summer,1960,Summer,Roma,Shooting,"Shooting Men's Small-Bore Rifle, Prone, 50 metres",
Vilho Ilmari Ylnen,133021,M,46.0,169.0,67.0,Finland,FIN,1964 Summer,1964,Summer,Tokyo,Shooting,"Shooting Men's Small-Bore Rifle, Three Positio...",


In [42]:
df[(df['Team'] == 'Finland') | (df['Team'] == 'Sweden')]  # Filter rows where Team is 'Finland' or 'Sweden'

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Antti Sami Aalto,9,M,26.0,186.0,96.0,Finland,FIN,2002 Winter,2002,Winter,Salt Lake City,Ice Hockey,Ice Hockey Men's Ice Hockey,
"Einar Ferdinand ""Einari"" Aalto",10,M,26.0,,,Finland,FIN,1952 Summer,1952,Summer,Helsinki,Swimming,Swimming Men's 400 metres Freestyle,
Jorma Ilmari Aalto,11,M,22.0,182.0,76.5,Finland,FIN,1980 Winter,1980,Winter,Lake Placid,Cross Country Skiing,Cross Country Skiing Men's 30 kilometres,
Jyri Tapani Aalto,12,M,31.0,172.0,70.0,Finland,FIN,2000 Summer,2000,Summer,Sydney,Badminton,Badminton Men's Singles,
Minna Maarit Aalto,13,F,30.0,159.0,55.5,Finland,FIN,1996 Summer,1996,Summer,Atlanta,Sailing,Sailing Women's Windsurfer,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Carl Henrik Zetterberg,134391,M,21.0,183.0,86.0,Sweden,SWE,2002 Winter,2002,Winter,Salt Lake City,Ice Hockey,Ice Hockey Men's Ice Hockey,
Carl Henrik Zetterberg,134391,M,25.0,183.0,86.0,Sweden,SWE,2006 Winter,2006,Winter,Torino,Ice Hockey,Ice Hockey Men's Ice Hockey,Gold
Carl Henrik Zetterberg,134391,M,29.0,183.0,86.0,Sweden,SWE,2010 Winter,2010,Winter,Vancouver,Ice Hockey,Ice Hockey Men's Ice Hockey,
Carl Henrik Zetterberg,134391,M,33.0,183.0,86.0,Sweden,SWE,2014 Winter,2014,Winter,Sochi,Ice Hockey,Ice Hockey Men's Ice Hockey,Silver


## Data cleaning

In [43]:
df.isna().sum()  # Count missing values in each column
# df.isnull().sum()  # Count missing values in each column

ID             0
Sex            0
Age         9474
Height     60171
Weight     62875
Team           0
NOC            0
Games          0
Year           0
Season         0
City           0
Sport          0
Event          0
Medal     231333
dtype: int64

In [44]:
df = pd.read_csv('athlete_events.csv', index_col='Name')
df.shape

(271116, 14)

In [45]:
df.dropna(axis='columns')  # Drop rows with any missing values
# df.dropna(axis='columns', inplace=True)


Unnamed: 0_level_0,ID,Sex,Team,NOC,Games,Year,Season,City,Sport,Event
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A Dijiang,1,M,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball
A Lamusi,2,M,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight
Gunnar Nielsen Aaby,3,M,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football
Edgar Lindenau Aabye,4,M,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War
Christine Jacoba Aaftink,5,F,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres
...,...,...,...,...,...,...,...,...,...,...
Andrzej ya,135569,M,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles
Piotr ya,135570,M,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual"
Piotr ya,135570,M,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team"
Tomasz Ireneusz ya,135571,M,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four


In [46]:
df.dropna()  # Drop rows with any missing values

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Juhamatti Tapio Aaltonen,16,M,28.0,184.0,85.0,Finland,FIN,2014 Winter,2014,Winter,Sochi,Ice Hockey,Ice Hockey Men's Ice Hockey,Bronze
Paavo Johannes Aaltonen,17,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Individual All-Around,Bronze
Paavo Johannes Aaltonen,17,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Team All-Around,Gold
Paavo Johannes Aaltonen,17,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Horse Vault,Gold
Paavo Johannes Aaltonen,17,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Pommelled Horse,Gold
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Galina Ivanovna Zybina (-Fyodorova),135553,F,25.0,168.0,80.0,Soviet Union,URS,1956 Summer,1956,Summer,Melbourne,Athletics,Athletics Women's Shot Put,Silver
Galina Ivanovna Zybina (-Fyodorova),135553,F,33.0,168.0,80.0,Soviet Union,URS,1964 Summer,1964,Summer,Tokyo,Athletics,Athletics Women's Shot Put,Bronze
Bogusaw Zych,135554,M,28.0,182.0,82.0,Poland,POL,1980 Summer,1980,Summer,Moskva,Fencing,"Fencing Men's Foil, Team",Bronze
Olesya Nikolayevna Zykina,135563,F,19.0,171.0,64.0,Russia,RUS,2000 Summer,2000,Summer,Sydney,Athletics,Athletics Women's 4 x 400 metres Relay,Bronze


In [47]:
df.fillna(0)  # Fill missing values with 0

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A Dijiang,1,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,0
A Lamusi,2,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,0
Gunnar Nielsen Aaby,3,M,24.0,0.0,0.0,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,0
Edgar Lindenau Aabye,4,M,34.0,0.0,0.0,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
Christine Jacoba Aaftink,5,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Andrzej ya,135569,M,29.0,179.0,89.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,0
Piotr ya,135570,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",0
Piotr ya,135570,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",0
Tomasz Ireneusz ya,135571,M,30.0,185.0,96.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,0


In [48]:
df['Medal'].fillna('No Medal', inplace=True)  # Fill missing values in 'Medal' column with 'No Medal'
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Medal'].fillna('No Medal', inplace=True)  # Fill missing values in 'Medal' column with 'No Medal'


Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A Dijiang,1,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,No Medal
A Lamusi,2,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,No Medal
Gunnar Nielsen Aaby,3,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,No Medal
Edgar Lindenau Aabye,4,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
Christine Jacoba Aaftink,5,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,No Medal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Andrzej ya,135569,M,29.0,179.0,89.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,No Medal
Piotr ya,135570,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",No Medal
Piotr ya,135570,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",No Medal
Tomasz Ireneusz ya,135571,M,30.0,185.0,96.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,No Medal


In [49]:
df['Height'].fillna(df['Height'].mean(), inplace=True)  # Fill missing values in 'Height' column with the mean height
df['Weight'].fillna(df['Weight'].mean(), inplace=True)  # Fill missing values in 'Weight' column with the mean weight

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Height'].fillna(df['Height'].mean(), inplace=True)  # Fill missing values in 'Height' column with the mean height
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Weight'].fillna(df['Weight'].mean(), inplace=True)  # Fill missing values in 'Weight' column with the mean wei

In [50]:
df.head()  # Display the first few rows of the DataFrame

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A Dijiang,1,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,No Medal
A Lamusi,2,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,No Medal
Gunnar Nielsen Aaby,3,M,24.0,175.33897,70.702393,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,No Medal
Edgar Lindenau Aabye,4,M,34.0,175.33897,70.702393,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
Christine Jacoba Aaftink,5,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,No Medal


In [51]:
df.duplicated()  # Check for duplicate rows

Name
A Dijiang                   False
A Lamusi                    False
Gunnar Nielsen Aaby         False
Edgar Lindenau Aabye        False
Christine Jacoba Aaftink    False
                            ...  
Andrzej ya                  False
Piotr ya                    False
Piotr ya                    False
Tomasz Ireneusz ya          False
Tomasz Ireneusz ya          False
Length: 271116, dtype: bool

In [52]:
df.drop_duplicates(inplace=True)  # Drop duplicate rows in place