# In [5]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from umap import UMAP
from sklearn.preprocessing import QuantileTransformer

# Read the input table from CSV into a DataFrame.
source_csv = 'Drumsticks - CleanedGroundIDWithData.csv'
df = pd.read_csv(source_csv)

# Stage 1: Group-wise PCA
# Number of principal components to keep for each feature group.
group_component_counts = {
    'venue': 3,     # Home/Away/Neutral stats
    'toss': 4,      # Toss-related metrics
    'innings': 2,   # Innings progression
    'lighting': 2,  # Match timing
}

# One unfitted PCA estimator per group, keyed by group name.
group_pipelines = {
    group_name: PCA(n_components=n_kept)
    for group_name, n_kept in group_component_counts.items()
}

# Feature Engineering
# Each derived column is written back onto `df` in place.

# HomeAdvantageIndex: weighted sum of three home-side columns (0.4 / 0.3 / 0.3).
home_ratio_term = df['HomeWinLossRatio'] * 0.4
home_average_term = df['HomeAverage'] * 0.3
home_rpo_term = df['HomeRPO'] * 0.3
df['HomeAdvantageIndex'] = home_ratio_term + home_average_term + home_rpo_term

# TossImpactQuotient: gap between the two toss win/loss ratios, multiplied by
# log(1 + WonTossBattedAverage).
toss_ratio_gap = df['WonTossBattedWinLossRatio'] - df['LostTossFieldedWinLossRatio']
toss_average_log = np.log1p(df['WonTossBattedAverage'])
df['TossImpactQuotient'] = toss_ratio_gap * toss_average_log

# LightingDelta: day-night RPO minus the mean of the day and night RPO columns.
day_and_night_mean = (df['DayMatchRPO'] + df['NightMatchRPO']) / 2
df['LightingDelta'] = df['DayNightMatchRPO'] - day_and_night_mean

# Scaling the data
# Every column except 'ID' is mapped to a normal output distribution.
features = df.drop('ID', axis=1)
qt = QuantileTransformer(output_distribution='normal')
scaled_data = qt.fit_transform(features)

# Re-attach the column labels to the scaled matrix so that each PCA group can
# select its own columns by name.
scaled_df = pd.DataFrame(scaled_data, columns=features.columns, index=features.index)

# Regular expressions (searched anywhere in the column name) that assign the
# scaled features to each PCA group; keys mirror those of `group_pipelines`.
# NOTE(review): these patterns are inferred from the column names used in this
# script and from the group descriptions -- verify them against the CSV header,
# in particular 'innings', for which no column name is visible in this script.
group_column_patterns = {
    'venue': 'Home|Away|Neutral',
    'toss': 'Toss',
    'innings': 'Innings',
    'lighting': 'Day|Night|Lighting',
}

# Apply PCA for each group and concatenate results.
# Each PCA is fitted only on its own group's columns. Fitting every PCA on the
# full matrix would return the same leading components once per group, so the
# concatenated output would contain duplicated columns and no grouping at all.
pca_results = []
for key, pipeline in group_pipelines.items():
    group_features = scaled_df.filter(regex=group_column_patterns[key])
    if group_features.shape[1] == 0:
        # Fail loudly instead of silently producing embeddings without this group.
        raise ValueError(
            f"No columns matched pattern {group_column_patterns[key]!r} "
            f"for PCA group {key!r}; update group_column_patterns."
        )
    pca_result = pipeline.fit_transform(group_features.to_numpy())
    pca_results.append(pca_result)

# Concatenate PCA results column-wise: one row per input row, one column per
# retained component across all groups.
pca_concatenated = np.concatenate(pca_results, axis=1)

# Stage 2: Unified UMAP
# A single UMAP model over the concatenated PCA features: 8 output dimensions,
# correlation metric, 15 neighbours.
# NOTE(review): no random_state is passed, so the embeddings presumably differ
# from run to run -- confirm whether reproducibility is required.
final_umap = UMAP(n_neighbors=15, metric='correlation', n_components=8)

# Fit the model and obtain the final embeddings in one step.
embeddings = final_umap.fit_transform(pca_concatenated)

# Wrap the embedding matrix in a DataFrame whose columns are named
# Embedding_1 .. Embedding_N; no other columns are carried over.
n_embedding_dims = embeddings.shape[1]
embedding_columns = [f'Embedding_{i+1}' for i in range(n_embedding_dims)]
embeddings_df = pd.DataFrame(embeddings, columns=embedding_columns)

# Write the embeddings to CSV without the row index.
embeddings_df.to_csv('GroundIDWithData_Embeddings.csv', index=False)

