In [2]:
import pandas as pd
import numpy as np
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as pyo

# Load data
# csv.DictReader yields every field as a string; numeric columns are converted
# explicitly below. newline='' is what the csv module expects so that quoted
# fields containing line breaks are parsed correctly.
with open('GK.csv', 'r', encoding='utf-8', newline='') as f:
    content = list(csv.DictReader(f))

# Convert to DataFrame
df = pd.DataFrame(content)

# Replace commas (thousands separators) and convert to numeric.
# The last three columns are used as PCA features further down; they were
# previously left as raw strings, so they are converted here as well.
numeric_columns = ['ShotsOnTargetAgainst', 'Saves', 'ShortPassesAttempted', 'ThrowsAttempted', 'GKAttempted',
                    'PostShotExpectedGoals-AllowedGoals/90minutes', 'Save%', 'GoalsAgainst/90', 'LongPassesCompleted%',
                    'MatchesPlayed', 'Minutes', 'CrossesStopped', 'Cmp', 'DefenseOutsidePenalyArea/90min',
                    'PostShotExpectedGoals/90min', 'PostShotExpectedGoals/ShotOnTarget', 'AveragePassLength']
df[numeric_columns] = df[numeric_columns].replace({',': ''}, regex=True).apply(pd.to_numeric, errors='coerce')

# Filter goalkeepers based on matches played, minutes played and a positive
# post-shot-xG-minus-goals-allowed value.
filtered_df = df[(df['MatchesPlayed'] > 30) & (df['Minutes'] > 2400) & (df['PostShotExpectedGoals-AllowedGoals/90minutes'] > 0)]

# Work on an explicit copy so the derived columns do not write into a view of df.
filtered_df_copy = filtered_df.copy()

# Derived features
filtered_df_copy['ShotsOnTargetAgainst/Saves'] = filtered_df_copy['ShotsOnTargetAgainst'] / filtered_df_copy['Saves']
filtered_df_copy['TotalAttempts'] = (
    filtered_df_copy['ShortPassesAttempted']
    + filtered_df_copy['ThrowsAttempted']
    + filtered_df_copy['GKAttempted']
)

# Define features and include player names
features = ['TotalAttempts', 'PostShotExpectedGoals/90min', 'ShotsOnTargetAgainst/Saves', 'CrossesStopped',
            'Save%', 'LongPassesCompleted%', 'Cmp', 'DefenseOutsidePenalyArea/90min', 'GoalsAgainst/90',
            'PostShotExpectedGoals/ShotOnTarget', 'AveragePassLength']

# Select from filtered_df_copy: the derived columns ('TotalAttempts',
# 'ShotsOnTargetAgainst/Saves') exist only there, not on filtered_df.
dfskills = filtered_df_copy[features + ['Player']].copy()

# PCA cannot handle NaN/inf. NaN comes from errors='coerce' above, inf from a
# division by zero saves. Drop those rows and say so instead of failing later.
dfskills[features] = dfskills[features].replace([np.inf, -np.inf], np.nan)
rows_before = len(dfskills)
# reset_index gives a clean 0..n-1 index so that frames built from the PCA /
# t-SNE arrays (which get a fresh RangeIndex) line up with dfskills row by row.
dfskills = dfskills.dropna(subset=features).reset_index(drop=True)
rows_dropped = rows_before - len(dfskills)
if rows_dropped:
    print(f"Dropped {rows_dropped} goalkeeper(s) with missing or non-finite feature values")
if dfskills.empty:
    raise ValueError("No goalkeepers left after filtering; cannot run PCA")

# Perform PCA
# NOTE(review): the features are on very different scales and StandardScaler is
# imported but never applied, so the unscaled PCA is dominated by the
# largest-magnitude columns. Confirm whether scaling was intended.
pca = PCA().fit(dfskills[features])
pcaratio = pca.explained_variance_ratio_
cumulative_ratio = np.cumsum(pcaratio)

# x is the number of components (1-based) so the curve can be read directly as
# "variance explained by the first k components".
trace = go.Scatter(x=np.arange(1, len(pcaratio) + 1), y=cumulative_ratio)
data = [trace]

# Report the variance actually reached at 5 components instead of a hard-coded figure.
n_reported = min(5, len(cumulative_ratio))
layout = dict(
    title=(
        "Player Skills Dataset - PCA Explained Variance || "
        f"{cumulative_ratio[n_reported - 1]:.0%} achieved at {n_reported} components"
    ),
    xaxis=dict(title="Number of components"),
    yaxis=dict(title="Cumulative explained variance ratio"),
)
fig = dict(data=data, layout=layout)
pyo.iplot(fig)


FileNotFoundError: [Errno 2] No such file or directory: 'GK.csv'

In [3]:
# Seed for KMeans and t-SNE so cluster ids and embeddings are reproducible
# across kernel restarts.
RANDOM_STATE = 42

# Player names as a plain array: the frames below are built from NumPy arrays
# and get a fresh 0..n-1 index, whereas dfskills may carry the filtered
# (non-contiguous) index. Assigning a Series would align on index and yield
# NaN / mismatched names; assigning an array is purely positional.
player_names = dfskills['Player'].to_numpy()

# t-SNE requires perplexity < n_samples; the default (30) fails on small
# filtered samples, so cap it. For larger samples the default is unchanged.
tsne_perplexity = min(30, len(dfskills) - 1)


def _labelled_cluster_scatter(frame, ax, title, alpha=0.5):
    """Scatter frame's x1/x2 on ax coloured by 'cluster', labelling each point with 'Player'."""
    sns.scatterplot(data=frame, x='x1', y='x2', hue='cluster', legend="full", alpha=alpha, ax=ax)
    ax.set_title(title)
    for x1, x2, player in zip(frame['x1'], frame['x2'], frame['Player']):
        ax.text(x1, x2, player, fontsize=8)


# Apply PCA with 5 components
pca = PCA(n_components=5)
skillsPCA = pca.fit_transform(dfskills[features])
# n_init is set explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=RANDOM_STATE)
skillsPCA_labels = kmeans.fit_predict(skillsPCA)

# Create DataFrame for PCA and KMeans
dfskillsPCA = pd.DataFrame(skillsPCA, columns=[f'PC{i+1}' for i in range(skillsPCA.shape[1])])
dfskillsPCA['cluster'] = skillsPCA_labels
dfskillsPCA['Player'] = player_names

# Apply t-SNE on the PCA-reduced data
Xtsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=RANDOM_STATE).fit_transform(skillsPCA)
dftsne = pd.DataFrame(Xtsne, columns=['x1', 'x2'])
dftsne['cluster'] = skillsPCA_labels
dftsne['Player'] = player_names

# Apply PCA for 2D visualization
pca2 = PCA(n_components=2)
skillsPCA2 = pca2.fit_transform(dfskills[features])
dfskillsPCA2 = pd.DataFrame(skillsPCA2, columns=['x1', 'x2'])
dfskillsPCA2['cluster'] = skillsPCA_labels
dfskillsPCA2['Player'] = player_names

# Visualize using TSNE and PCA
fig, ax = plt.subplots(1, 2, figsize=(12,6))
_labelled_cluster_scatter(dftsne, ax[0], 'Visualized on TSNE 2D')
_labelled_cluster_scatter(dfskillsPCA2, ax[1], 'Visualized on PCA 2D')
fig.suptitle('Comparing clustering result when visualized using TSNE2D vs. PCA2D')
plt.show()

# Compare clustering results: cluster on the full feature set instead of the
# PCA-reduced one, then colour both t-SNE embeddings with those labels.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=RANDOM_STATE)
clustering_ori = kmeans.fit_predict(dfskills[features])
dftsne2D = dftsne.copy()
dftsne2D['cluster'] = clustering_ori

XtsneFull = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=RANDOM_STATE).fit_transform(dfskills[features])
dftsneFull = pd.DataFrame(XtsneFull, columns=['x1', 'x2'])
dftsneFull['cluster'] = clustering_ori
dftsneFull['Player'] = player_names

# Visualize using TSNE
# Dimension labels are derived from the data: the original titles said "34D",
# but the feature list used here has len(features) columns.
reduced_dims = skillsPCA.shape[1]
full_dims = len(features)
fig, ax = plt.subplots(1, 2, figsize=(12,6))
_labelled_cluster_scatter(dftsne2D, ax[0], f'Visualized on TSNE {reduced_dims}D>2D')
_labelled_cluster_scatter(dftsneFull, ax[1], f'Visualized on TSNE {full_dims}D>2D', alpha=0.7)
fig.suptitle(
    f'Comparing clustering result when visualized using TSNE {reduced_dims}D>2D vs. TSNE {full_dims}D>2D'
)
plt.show()


NameError: name 'dfskills' is not defined