In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

# Import PCA from scikit-learn
from sklearn.decomposition import PCA
import hvplot.pandas

In [None]:
# Load the 2019 marathon results from the Resources folder and preview the first rows
results_path = './Resources/marathon_results_2019.csv'
df = pd.read_csv(results_path)
df.head()

In [None]:
# Keep only the columns used in this analysis: "Age", "M/F", every split time
# ("5K" through "40K", including "Half"), "Pace", and "Official Time".
keep_columns = [
    'Age', 'M/F',
    '5K', '10K', '15K', '20K', 'Half', '25K', '30K', '35K', '40K',
    'Pace', 'Official Time',
]
df = df.loc[:, keep_columns]
df.head()

In [None]:
# Parse the split times, "Pace", and "Official Time" into timedeltas, one column at a time
time_columns = ['5K', '10K', '15K', '20K', 'Half','25K', '30K', '35K', '40K', 'Pace', 'Official Time']
for column in time_columns:
    df[column] = pd.to_timedelta(df[column])
df.head()

In [None]:
# Replace each timedelta column with its duration expressed in seconds
seconds_by_column = {name: df[name].dt.total_seconds() for name in time_columns}
df = df.assign(**seconds_by_column)
df.head()

In [None]:
# Keep only the rows in which every split value is non-zero.
# The check is limited to the split columns, so zeros in other columns do not remove a row.
split_columns = ['5K', '10K', '15K', '20K', 'Half', '25K', '30K', '35K', '40K']
all_splits_nonzero = (df[split_columns] != 0).all(axis=1)
df = df[all_splits_nonzero]

df.head()

In [None]:
# Turn the 'M/F' column into integer labels: fit a LabelEncoder on the column,
# then overwrite the column with the encoded values
encoder = LabelEncoder()
encoder.fit(df['M/F'])
df['M/F'] = encoder.transform(df['M/F'])

df.head()

In [None]:
# Make sure the 'Age' column holds numeric values
age_numeric = pd.to_numeric(df['Age'])
df['Age'] = age_numeric

In [None]:
# Drop every row that still contains a missing value.
# df is rebound to the cleaned frame instead of being mutated in place, so the
# step does not modify a frame that was itself produced by slicing.
df = df.dropna()

In [None]:
# 'Pace' is expected to track 'Official Time' exactly, so it is left out of the X data below
# (the column stays in df for later analysis).
# Scatter 'Pace' against 'Official Time' to check that relationship visually.
df.plot.scatter(x='Pace', y='Official Time')

In [None]:
# Build the training set 'X' from every column of df except 'Pace'
X = df.drop(columns='Pace')

In [None]:
# Scale the dataset with a MinMaxScaler, fitting and transforming in one call
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

In [None]:
# Elbow method: fit KMeans on the scaled features for k = 1..9 and record each
# model's inertia (the SSE). random_state is fixed so that the centroid
# initialisation, and therefore the curve, is the same on every run.
sse = {}
K = range(1,10)
for k in K:
    kmeanmodel = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    sse[k] = kmeanmodel.inertia_

# Plot SSE against k; the bend in the curve suggests a reasonable number of clusters
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel('k')
plt.ylabel('SSE')
plt.title('Elbow Method')
plt.show()

In [None]:
# Build a 3-cluster KMeans estimator with a fixed seed, then fit it to the scaled features
kmeans_3 = KMeans(n_clusters=3, random_state=42)
model = kmeans_3.fit(X_scaled)

In [None]:
# Calculate predicted values: the cluster label the fitted model assigns to each row of X_scaled.
y_pred = model.predict(X_scaled)

In [None]:
# Add predicted values onto the original dataframe.
# The labels are given df's own index before joining: rows were removed from df
# earlier without resetting the index, so a default 0..n-1 index on the labels
# would attach clusters to the wrong runners and the inner join would drop rows.
df_y = pd.DataFrame(y_pred, columns=['Cluster'], index=df.index)
combined = df.join(df_y, how='inner')
combined.head()

In [None]:
# Draw one boxplot of 'Official Time' per cluster
combined.boxplot(column=['Official Time'], by=['Cluster'])

In [None]:
# Summary statistics of 'Age' for every combination of M/F and Cluster
combined.groupby(['M/F','Cluster'])['Age'].describe()

In [None]:
# Create a function that takes in gender and age and assigns an age group based on the following break points for each gender:
# * The lowest 1st quartile
# * Each median
# * The highest 3rd quartile
def age_group(gender, age):
    """Map a runner's gender label and age to an age-group index from 0 to 5.

    Each gender label has five ascending age cut-offs. The result is the
    position of the first cut-off the age falls below, or 5 when the age is
    below none of them.

    Parameters:
        gender: Gender label; 0 and 1 are the recognised values.
        age: The runner's age.

    Returns:
        An int from 0 to 5, or None when gender is neither 0 nor 1.
    """
    if gender == 0:
        cutoffs = (29, 36, 41, 45, 51)
    elif gender == 1:
        cutoffs = (33, 40, 48, 53, 60)
    else:
        # Unrecognised gender label: no age group is assigned
        return None

    for group, upper_bound in enumerate(cutoffs):
        if age < upper_bound:
            return group
    # Not below any cut-off, so the age belongs to the last group
    return len(cutoffs)

In [None]:
# Work out the age group of every row from its 'M/F' and 'Age' values and
# store the result on the original data frame in the column 'Age Group'
def _row_age_group(row):
    """Return the age group for one row of df."""
    return age_group(row['M/F'], row['Age'])

df['Age Group'] = df.apply(_row_age_group, axis=1)
df.head()

### Part 3: PCA

In [None]:
# Build a new training set 'X' from df without the 'Age' and 'Pace' columns
X = df.drop(columns=['Age', 'Pace'])

In [None]:
# Scale the new training set with a MinMaxScaler, fitting and transforming in one call
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

In [None]:
# Reduce the number of components in X to 2 using PCA.
# random_state is fixed so the projection is reproducible even if scikit-learn
# selects its randomized SVD solver for this data.
pca = PCA(n_components=2, random_state=42)
pca.fit(X_scaled)
# Share of the total variance captured by each of the two components
print(pca.explained_variance_ratio_)

X_pca = pca.transform(X_scaled)

In [None]:
# Wrap the two PCA components in a DataFrame with descriptive column names
component_names = ['principal component 1', 'principal component 2']
df_pca = pd.DataFrame(data=X_pca, columns=component_names)

In [None]:
# Scatter plot of the second principal component against the first
df_pca.hvplot.scatter("principal component 1", "principal component 2")

In [None]:
# Elbow method on the PCA-reduced data: fit KMeans for k = 1..9 and record each
# model's inertia (the SSE). random_state is fixed so that the centroid
# initialisation, and therefore the curve, is the same on every run.
sse = {}
K = range(1,10)
for k in K:
    kmeanmodel = KMeans(n_clusters=k, random_state=42).fit(X_pca)
    sse[k] = kmeanmodel.inertia_

# Plot SSE against k; the bend in the curve suggests a reasonable number of clusters
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel('k')
plt.ylabel('SSE')
plt.title('Elbow Method')
plt.show()

In [None]:
# Build a 4-cluster KMeans estimator with a fixed seed, then fit it to the PCA components
kmeans_4 = KMeans(n_clusters=4, random_state=42)
model = kmeans_4.fit(X_pca)

In [None]:
# Calculate predicted values.
# The model was already fitted on X_pca when it was created, so only predict()
# is needed here; fit_predict() would train the same model a second time.
y_pred = model.predict(X_pca)

In [None]:
# Attach the predicted cluster to df_pca, then redraw the scatter plot of the
# PCA components with the points grouped by predicted cluster
df_pca = df_pca.assign(cluster=y_pred)
df_pca.hvplot.scatter("principal component 1", "principal component 2", by="cluster")

In [None]:
# Add predicted values onto the original dataframe.
# The labels are given df's own index before joining: rows were removed from df
# earlier without resetting the index, so a default 0..n-1 index on the labels
# would attach clusters to the wrong runners and the inner join would drop rows.
df_y = pd.DataFrame(y_pred, columns=['Cluster'], index=df.index)
combined = df.join(df_y, how='inner')
combined.head()

In [None]:
# Draw one boxplot of 'Pace' for each combination of M/F and cluster
combined.boxplot(column=['Pace'], by=['M/F', 'Cluster'])

In [None]:
# Summary statistics of 'Pace' for every combination of M/F and cluster
combined.groupby(['M/F','Cluster'])['Pace'].describe()