## Working with Summary Statistics

In [1]:
import pandas as pd

In [2]:
# https://trends.google.com/trends/explore?date=all&geo=US&q=Golf,Soccer,Tennis,Hockey,Baseball
# Read in file
sports = pd.read_csv("/datasets/sport/sport.csv", skiprows=2)

# Reset column names
col_names = ['Month', 'Golf', 'Soccer', 'Tennis', 'Hockey', 'Baseball']
sports.columns = col_names

sports.head()

Unnamed: 0,Month,Golf,Soccer,Tennis,Hockey,Baseball
0,2004-01,45,21,13,21,25
1,2004-02,49,23,13,22,32
2,2004-03,64,27,16,22,44
3,2004-04,81,29,16,16,54
4,2004-05,82,29,17,14,53


In [3]:
! ls /datasets/sport

sport.csv


In [4]:
# Set index
sports.set_index('Month', inplace=True)

In [5]:
# Gives you the rows and columns
sports.shape

(207, 5)

In [6]:
# Obtain the descriptive (summary) statistics
sports.describe()

Unnamed: 0,Golf,Soccer,Tennis,Hockey,Baseball
count,207.0,207.0,207.0,207.0,207.0
mean,48.63285,27.714976,13.183575,14.371981,33.125604
std,17.792234,9.20642,4.109558,5.969322,14.171751
min,24.0,9.0,6.0,4.0,12.0
25%,34.0,22.0,10.0,10.0,21.5
50%,47.0,27.0,12.0,14.0,29.0
75%,59.0,31.0,16.0,18.0,45.5
max,100.0,83.0,27.0,55.0,62.0


In [7]:
# Transpose data to flip rows and columns
sports.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Golf,207.0,48.63285,17.792234,24.0,34.0,47.0,59.0,100.0
Soccer,207.0,27.714976,9.20642,9.0,22.0,27.0,31.0,83.0
Tennis,207.0,13.183575,4.109558,6.0,10.0,12.0,16.0,27.0
Hockey,207.0,14.371981,5.969322,4.0,10.0,14.0,18.0,55.0
Baseball,207.0,33.125604,14.171751,12.0,21.5,29.0,45.5,62.0


In [8]:
# Give you the unique values, excludes NaNs
sports['Golf'].unique()

array([ 45,  49,  64,  81,  82, 100,  93,  85,  66,  48,  40,  51,  63,
        97,  92,  86,  42,  44,  47,  57,  79,  78,  90,  89,  41,  56,
        71,  77,  62,  46,  35,  36,  39,  43,  52,  70,  75,  38,  31,
        33,  73,  67,  37,  32,  29,  60,  61,  59,  27,  30,  34,  58,
        55,  25,  26,  54,  53,  24,  28,  50])

In [9]:
# Minimum values
sports.min()

Golf        24
Soccer       9
Tennis       6
Hockey       4
Baseball    12
dtype: int64

In [10]:
# Maximum values
sports.max()

Golf        100
Soccer       83
Tennis       27
Hockey       55
Baseball     62
dtype: int64

### Measures of Central Tendency

In [11]:
# Mean is the average value of the data set.  It represents the typical value.
sports.mean()

Golf        48.632850
Soccer      27.714976
Tennis      13.183575
Hockey      14.371981
Baseball    33.125604
dtype: float64

In [12]:
# Median value is the middle value after you reorganize the data set in ascending order.
# If there are an even number of observations, then we take the average of the 2 middle values.
sports.median()

Golf        47.0
Soccer      27.0
Tennis      12.0
Hockey      14.0
Baseball    29.0
dtype: float64

In [13]:
# Mode is defined as the value most frequent in our data
# If a value appears repeatedly in the data, it will influence the average
# towards that frequent value
# Modal value is like a highly weighted contributing factor for the mean value
sports['Golf'].mode()

0    29
dtype: int64

In [14]:
sports['Tennis'].mode()

0    12
dtype: int64

### Ranges and Percentiles

In [15]:
# Range is one indicator of spread
# To calculate range, you subtract the smallest value from the largest value
range = sports.max() - sports.min()
range

Golf        76
Soccer      74
Tennis      21
Hockey      51
Baseball    50
dtype: int64

In [16]:
# Percentile values in summary, include 10% and 90%
sports.describe(percentiles=[.1, .25, .5, .75, .9])

Unnamed: 0,Golf,Soccer,Tennis,Hockey,Baseball
count,207.0,207.0,207.0,207.0,207.0
mean,48.63285,27.714976,13.183575,14.371981,33.125604
std,17.792234,9.20642,4.109558,5.969322,14.171751
min,24.0,9.0,6.0,4.0,12.0
10%,28.0,18.6,8.6,8.0,16.0
25%,34.0,22.0,10.0,10.0,21.5
50%,47.0,27.0,12.0,14.0,29.0
75%,59.0,31.0,16.0,18.0,45.5
90%,75.8,36.0,19.0,20.0,54.4
max,100.0,83.0,27.0,55.0,62.0


In [17]:
from scipy.stats import iqr

# Calculate Interquartile Range (IQR), 75% - 25%
IQR = iqr(sports['Golf'])
IQR

25.0

### Measures of Spread

In [18]:
# Standard deviation tells you how much your data point deviates from the mean.
# Standard deviation is a measure of spread. It has the same units as your data. 
# How to calculate standard deviation

# step 1:  For each observations, subtract away the mean value
diff = sports['Golf'] - sports['Golf'].mean()

# step 2: Square each difference
square_diff = diff**2

# step 3: Sum up all of these squared differences
sum_squared = square_diff.sum()

# step 4: Divide sum by the total number of observations minus 1
total_divided = sum_squared/(191 - 1)

# step 5: Square root of the result
squareroot_result = total_divided**0.5

# standard deviation, units are the same as the original data
squareroot_result

18.52623980059769

In [19]:
# Variance is a measure of spread, similar to standard deviation.
# To calculate variance, take steps 1-4 from standard deviation
# Units are interpreted in terms of squared units, which is not easy to understand or to interpret
# How to calculate variance

# step 1:  For each observations, subtract away the mean value
diff = sports['Golf'] - sports['Golf'].mean()

# step 2: Square each difference
square_diff = diff**2

# step 3: Sum up all of these squared differences
sum_squared = square_diff.sum()

# step 4: Divide sum by the total number of observations minus 1
variance = sum_squared/(191 - 1)

print(variance)

343.22156114924996


In [20]:
# end

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0be58e23-60c5-40af-9cb7-633ba8900837' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>