In [80]:
# Load the cleaned, combined player table from disk.
# pandas is imported in this cell because it runs before the notebook's main import
# cell; without it a fresh-kernel "Run All" stops here with NameError on `pd`.
import pandas as pd

combined_df = pd.read_csv('clean_combined_df.csv')

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline

# Work on a copy so the frame loaded into `combined_df` is left untouched.
data = combined_df.copy()

# Keep the identifier ('Player') and group label ('ps') columns aside; they are not
# model features.
player_names = data.loc[:, ['Player', 'ps']]

# Candidate feature columns by dtype, before the non-feature columns are removed.
object_columns = data.select_dtypes(include=['object']).columns
numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns

# Final feature lists: object columns minus 'Player', numeric columns minus 'ps'.
categorical_features = object_columns.drop(['Player'])
numerical_features = numeric_columns.drop(['ps'])

# Preprocessing pipelines for the numeric and categorical feature columns.

# Numeric columns: fill missing values with the column mean, then scale.
# with_mean=False scales to unit variance without centering the values.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler(with_mean=False))
])

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. Categories not seen during fit are ignored (encoded as all zeros).
# The encoder's default output is already a sparse matrix, so no sparsity keyword is
# passed: `sparse=` was deprecated in scikit-learn 1.2 and removed in 1.4 (where it
# raises TypeError), while its replacement `sparse_output=` does not exist before 1.2.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each feature list to its pipeline; columns in neither list are dropped
# (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Feature-only frame: everything except the identifier and group-label columns.
data_features = data.drop(columns=['Player', 'ps'])

# Row masks for the two groups (ps == 0 -> professional, ps == 1 -> school).
is_professional = data['ps'].eq(0)
is_school = data['ps'].eq(1)

professional_players = data.loc[is_professional]
school_players = data.loc[is_school]

# Learn the imputation / scaling / encoding on the professionals only ...
X_prof = preprocessor.fit_transform(data_features.loc[is_professional])

# ... and apply that already-fitted preprocessing to the school players.
X_school = preprocessor.transform(data_features.loc[is_school])

# Project both groups onto 32 SVD components fitted on the professionals.
svd = TruncatedSVD(n_components=32, random_state=42)
X_prof_svd = svd.fit_transform(X_prof)
X_school_svd = svd.transform(X_school)

# Index the professionals' SVD coordinates, then look up the single nearest
# professional for every school player.
nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto')
nbrs.fit(X_prof_svd)
distances, indices = nbrs.kneighbors(X_school_svd)

# With n_neighbors=1 `indices` has one column; flatten it to 1-D row positions
# into `professional_players`.
nearest_professional_rows = indices.ravel()
professional_names = professional_players['Player'].values

# One row per school player, paired with the name of the closest professional.
matched_pairs = pd.DataFrame({
    'school_player': school_players['Player'].values,
    'matched_professional_player': professional_names[nearest_professional_rows],
})

# Preview the first few matches.
print(matched_pairs.head())


In [None]:
# Leaving the frame as the cell's last expression displays the
# school-player -> matched-professional table as this cell's output.
matched_pairs

In [None]:
# Average distance between each school player and their matched professional.
# `distances` is returned by `nbrs.kneighbors(X_school_svd)` with n_neighbors=1, so it
# has one row per school player and a single column holding the distance to the
# matched professional. The previous version indexed it like a full school x
# professional matrix through `closest_professional_indices` / `school_features`,
# names that no cell in this notebook defines.
avg_distance = np.mean(distances[:, 0])
print(f'Average Distance to Matched Professional Players: {avg_distance}')


In [None]:
def _row_mean_squared_error(original, reconstructed):
    """Return the mean of the squared element-wise differences along axis 1."""
    return np.mean(np.square(original - reconstructed), axis=1)


# NOTE(review): `autoencoder` is not created in any cell visible here — confirm it is
# defined (and trained) before this cell runs.
# NOTE(review): X_prof / X_school come straight from the ColumnTransformer and may be
# SciPy sparse matrices — confirm `autoencoder.predict` accepts them.

# --- Professional players: reconstruct, then score each row and average ---
reconstructed_prof = autoencoder.predict(X_prof)
professional_reconstruction_error = _row_mean_squared_error(X_prof, reconstructed_prof)
avg_professional_reconstruction_error = np.mean(professional_reconstruction_error)
print(f'Average Reconstruction Error for Professional Players: {avg_professional_reconstruction_error:.4f}')

# --- School players: same steps with the same model ---
reconstructed_school = autoencoder.predict(X_school)
school_reconstruction_error = _row_mean_squared_error(X_school, reconstructed_school)
avg_school_reconstruction_error = np.mean(school_reconstruction_error)
print(f'Average Reconstruction Error for School Players: {avg_school_reconstruction_error:.4f}')

# Compare the two averages and report which case applies.
school_error_is_higher = avg_school_reconstruction_error > avg_professional_reconstruction_error
print(
    "School players have a higher reconstruction error, indicating they are different from professional players in the learned feature space."
    if school_error_is_higher
    else "School players have a similar reconstruction error to professional players, indicating good representation in the learned feature space."
)
