In [1]:
# Load the pandas package
import pandas as pd
# Load the numpy package
import numpy as np

### Working with a DataFrame <*creation*>
---

In [2]:
# Create a DataFrame

structure ={'Name':['Juan','Miguel','Jose','Mario'],'Marks':['10','8.5','8.9','9'],'Sports':['Baseball','Football','Basketball','Golf']}

df = pd.DataFrame(structure)

In [3]:
# Print new DataFrame
df

Unnamed: 0,Name,Marks,Sports
0,Juan,10.0,Baseball
1,Miguel,8.5,Football
2,Jose,8.9,Basketball
3,Mario,9.0,Golf


In [4]:
# Request info about DataFrame

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Marks   4 non-null      object
 2   Sports  4 non-null      object
dtypes: object(3)
memory usage: 224.0+ bytes


In [5]:
# Request description

df.describe()

Unnamed: 0,Name,Marks,Sports
count,4,4,4
unique,4,4,4
top,Miguel,10,Basketball
freq,1,1,1


In [6]:
# DataFrame with NaN values

structure2 ={'Name':['Juan','Miguel','Jose','Mario'],'Marks':[np.nan,'8.5','8.9','9'],'Sports':['Baseball','Football','N/A','Golf']}

df2 = pd.DataFrame(structure2)

In [7]:
df2

Unnamed: 0,Name,Marks,Sports
0,Juan,,Baseball
1,Miguel,8.5,Football
2,Jose,8.9,
3,Mario,9.0,Golf


In [8]:
# Request info about DataFrame

df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Marks   3 non-null      object
 2   Sports  4 non-null      object
dtypes: object(3)
memory usage: 224.0+ bytes


In [9]:
# Request description

df2.describe()

Unnamed: 0,Name,Marks,Sports
count,4,3.0,4
unique,4,3.0,4
top,Miguel,8.9,Football
freq,1,1.0,1


In [10]:
# Create new data frame to work with it 

new_df2 = pd.DataFrame(df2)

In [11]:
# Replace the nan values for 0

new_df2 = new_df2.replace(np.nan,"0")

In [12]:
# Verify the change

new_df2

Unnamed: 0,Name,Marks,Sports
0,Juan,0.0,Baseball
1,Miguel,8.5,Football
2,Jose,8.9,
3,Mario,9.0,Golf


In [13]:
# Drop all N/A values from dataframe

drop_df2 = pd.DataFrame(df2)
drop_df2

Unnamed: 0,Name,Marks,Sports
0,Juan,,Baseball
1,Miguel,8.5,Football
2,Jose,8.9,
3,Mario,9.0,Golf


In [14]:
drop_df2.dropna(how = 'any', inplace = True)

In [15]:
drop_df2

Unnamed: 0,Name,Marks,Sports
1,Miguel,8.5,Football
2,Jose,8.9,
3,Mario,9.0,Golf


In [16]:
# Compare the both previous dataframes

print(new_df2)
print("\n")
print(drop_df2)

     Name Marks    Sports
0    Juan     0  Baseball
1  Miguel   8.5  Football
2    Jose   8.9       N/A
3   Mario     9      Golf


     Name Marks    Sports
1  Miguel   8.5  Football
2    Jose   8.9       N/A
3   Mario     9      Golf


In [17]:
# Remove the N/A by column method 

drop_df2 = drop_df2[drop_df2['Sports'] != 'N/A']

In [18]:
# Verify it 

drop_df2

Unnamed: 0,Name,Marks,Sports
1,Miguel,8.5,Football
3,Mario,9.0,Golf


In [19]:
# Fill up DataFrame with NaN values

fill_df = pd.DataFrame(df2)

In [20]:
fill_df.fillna(0, inplace = True)

In [21]:
fill_df

Unnamed: 0,Name,Marks,Sports
0,Juan,0.0,Baseball
1,Miguel,8.5,Football
2,Jose,8.9,
3,Mario,9.0,Golf


In [22]:
# Verify the DataFrame

fill_df.describe()

Unnamed: 0,Name,Marks,Sports
count,4,4.0,4
unique,4,4.0,4
top,Miguel,8.5,Football
freq,1,1.0,1


In [23]:
# Convert the numbers to int values ()

fill_df['Marks'] = fill_df.Marks.astype(float)

In [24]:
# Request stats

fill_df.describe()

Unnamed: 0,Marks
count,4.0
mean,6.6
std,4.4053
min,0.0
25%,6.375
50%,8.7
75%,8.925
max,9.0


### Working with a DataFrame <*loading, exploration*>
---

In [27]:
# Load the dataset using pandas

atp_df = pd.read_csv("ATP_Men's_Tour.csv",encoding = "latin-1")

In [29]:
# Check the first 5 rows of DataFrame

atp_df.head(5)

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Dosedel S.,...,,,,,,,,,,
1,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Enqvist T.,...,,,,,,,,,,
2,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Escude N.,...,,,,,,,,,,
3,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Federer R.,...,,,,,,,,,,
4,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Fromberg R.,...,,,,,,,,,,


In [31]:
# Check the columns that Dataframe has

atp_df.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'W1', 'L1',
       'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets',
       'Comment', 'CBW', 'CBL', 'GBW', 'GBL', 'IWW', 'IWL', 'SBW', 'SBL',
       'B365W', 'B365L', 'B&WW', 'B&WL', 'EXW', 'EXL', 'PSW', 'PSL', 'WPts',
       'LPts', 'UBW', 'UBL', 'LBW', 'LBL', 'SJW', 'SJL', 'MaxW', 'MaxL',
       'AvgW', 'AvgL'],
      dtype='object')

In [33]:
# requets info about DataFrame

atp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46652 entries, 0 to 46651
Data columns (total 54 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ATP         46652 non-null  int64  
 1   Location    46652 non-null  object 
 2   Tournament  46652 non-null  object 
 3   Date        46652 non-null  object 
 4   Series      46652 non-null  object 
 5   Court       46652 non-null  object 
 6   Surface     46652 non-null  object 
 7   Round       46652 non-null  object 
 8   Best of     46652 non-null  int64  
 9   Winner      46652 non-null  object 
 10  Loser       46652 non-null  object 
 11  WRank       46637 non-null  object 
 12  LRank       46581 non-null  object 
 13  W1          46423 non-null  float64
 14  L1          46423 non-null  float64
 15  W2          45962 non-null  float64
 16  L2          45963 non-null  float64
 17  W3          21365 non-null  float64
 18  L3          21365 non-null  float64
 19  W4          4226 non-null

In [32]:
# Quick stats

atp_df.describe()

Unnamed: 0,ATP,Best of,W1,L1,W2,L2,W3,L3,W4,L4,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
count,46652.0,46652.0,46423.0,46423.0,45962.0,45963.0,21365.0,21365.0,4226.0,4226.0,...,10672.0,10672.0,23230.0,23239.0,15572.0,15579.0,17104.0,17104.0,17104.0,17104.0
mean,32.931,3.373746,5.792667,4.043211,5.772856,3.853621,6.018254,3.592745,5.777094,3.87151,...,1.815797,3.542452,1.795751,3.510855,1.795808,3.558671,1.991862,9.824993,1.82006,3.706502
std,17.953268,0.780315,1.239261,1.847833,1.253846,1.878397,0.936149,1.767542,1.264913,1.891446,...,0.996191,3.646138,1.023604,3.175995,1.003732,3.272284,1.68459,458.068834,1.117214,3.455871
min,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.01,1.02,1.0,1.0,1.0,1.01,1.01,1.01,1.01,1.01
25%,19.0,3.0,6.0,3.0,6.0,2.0,6.0,2.0,6.0,2.0,...,1.24,1.75,1.222,1.73,1.22,1.73,1.27,1.86,1.22,1.75
50%,33.0,3.0,6.0,4.0,6.0,4.0,6.0,4.0,6.0,4.0,...,1.5,2.5,1.5,2.62,1.5,2.63,1.56,2.85,1.48,2.6
75%,49.0,3.0,6.0,6.0,6.0,6.0,6.0,5.0,6.0,6.0,...,2.03,3.85,2.0,4.0,2.0,4.0,2.2,4.74,2.03,4.09
max,69.0,5.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,18.0,60.0,26.0,51.0,19.0,81.0,76.0,42586.0,23.45,36.44


In [34]:
# Change the NaN values from DataFrame and become them to 0

cpy_atp_df = pd.DataFrame(atp_df)

In [35]:
cpy_atp_df = cpy_atp_df.replace(np.nan,"0")

In [37]:
# Compare the both DataFrame

print(cpy_atp_df.info())
print("\n")
print(atp_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46652 entries, 0 to 46651
Data columns (total 54 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ATP         46652 non-null  int64 
 1   Location    46652 non-null  object
 2   Tournament  46652 non-null  object
 3   Date        46652 non-null  object
 4   Series      46652 non-null  object
 5   Court       46652 non-null  object
 6   Surface     46652 non-null  object
 7   Round       46652 non-null  object
 8   Best of     46652 non-null  int64 
 9   Winner      46652 non-null  object
 10  Loser       46652 non-null  object
 11  WRank       46652 non-null  object
 12  LRank       46652 non-null  object
 13  W1          46652 non-null  object
 14  L1          46652 non-null  object
 15  W2          46652 non-null  object
 16  L2          46652 non-null  object
 17  W3          46652 non-null  object
 18  L3          46652 non-null  object
 19  W4          46652 non-null  object
 20  L4    

In [38]:
# Quick stats of copy DataFrame

cpy_atp_df.describe()

Unnamed: 0,ATP,Best of
count,46652.0,46652.0
mean,32.931,3.373746
std,17.953268,0.780315
min,1.0,-1.0
25%,19.0,3.0
50%,33.0,3.0
75%,49.0,3.0
max,69.0,5.0


In [39]:
# Check the 10 first rows 

cpy_atp_df.head(10)

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,UBW,UBL,LBW,LBL,SJW,SJL,MaxW,MaxL,AvgW,AvgL
0,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Dosedel S.,...,0,0,0,0,0,0,0,0,0,0
1,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Enqvist T.,...,0,0,0,0,0,0,0,0,0,0
2,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Escude N.,...,0,0,0,0,0,0,0,0,0,0
3,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Federer R.,...,0,0,0,0,0,0,0,0,0,0
4,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Fromberg R.,...,0,0,0,0,0,0,0,0,0,0
5,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Gambill J.M.,...,0,0,0,0,0,0,0,0,0,0
6,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Grosjean S.,...,0,0,0,0,0,0,0,0,0,0
7,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Henman T.,...,0,0,0,0,0,0,0,0,0,0
8,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Hewitt L.,...,0,0,0,0,0,0,0,0,0,0
9,1,Adelaide,Australian Hardcourt Championships,3/01/2000,International,Outdoor,Hard,1st Round,3,Lisnard J.,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Remove all NaN values and NR vales from copy DataFrame 

cpy_atp_df = cpy_atp_df.replace("N/A","0")

cpy_atp_df = cpy_atp_df.replace("NR","0")

In [41]:
# Check the stats again

cpy_atp_df.describe()

Unnamed: 0,ATP,Best of
count,46652.0,46652.0
mean,32.931,3.373746
std,17.953268,0.780315
min,1.0,-1.0
25%,19.0,3.0
50%,33.0,3.0
75%,49.0,3.0
max,69.0,5.0


In [None]:
# Convert the float values to int values

