In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

# Load Data
# The path below is a Kaggle-style input location; adjust it when running elsewhere.
dataset_path = '/kaggle/input/odi-prediction/ODI Cricket Data new.csv'
df = pd.read_csv(dataset_path)

# Display basic information about the dataset
print("Dataset Overview:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()
print(df.head())

# Data Cleaning
# Convert problematic columns to numeric; values that cannot be parsed become NaN
df['strike_rate'] = pd.to_numeric(df['strike_rate'], errors='coerce')
df['average'] = pd.to_numeric(df['average'], errors='coerce')

# Ensure 'percentage' column is properly converted.
# Only non-numeric (text) data needs the character stripping: if the column was
# already parsed as numeric, round-tripping it through str would corrupt values
# such as 1e-05 (-> "105") or -3.2 (-> "3.2"), because the regex removes every
# character that is not a digit or a dot.
# NOTE(review): for text values the regex still drops a leading '-', so a
# negative percentage written as text would lose its sign - confirm that none
# are expected in this dataset.
if not pd.api.types.is_numeric_dtype(df['percentage']):
    df['percentage'] = df['percentage'].astype(str).str.replace(r'[^0-9.]', '', regex=True)
df['percentage'] = pd.to_numeric(df['percentage'], errors='coerce')

# Handle missing values: NaNs in numeric columns are replaced by that column's
# median. Non-numeric columns have no entry in the median Series and are left
# untouched here.
df.fillna(df.median(numeric_only=True), inplace=True)

# Collect the names of all object-dtype columns (typically text data)
categorical_columns = df.select_dtypes(include=['object']).columns

# Create one LabelEncoder per categorical column, keyed by column name, then
# overwrite each column with the integer codes produced by its encoder.
# The fitted encoders stay available in `label_encoders`.
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    # Cast to str first so every value (including NaN) is encoded as a string
    df[col] = le.fit_transform(df[col].astype(str))

# Standardize numerical features.
# Every column with a numeric dtype is selected, which includes the
# label-encoded categorical columns produced earlier in this script.
numeric_columns = df.select_dtypes(include=['number']).columns
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_columns])
df[numeric_columns] = scaled_values

# Compute the pairwise correlation matrix of all columns
correlation_matrix = df.corr()

# Draw the matrix as an annotated heatmap (two decimals per cell)
plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
)
plt.title('Feature Correlation Matrix')
plt.show()

# Apply PCA
# PCA cannot extract more components than min(n_samples, n_features), so the
# requested count (5 - adjust as needed) is capped to what the data supports
# instead of raising a ValueError on small or narrow datasets.
n_components = min(5, len(numeric_columns), len(df))
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df[numeric_columns])
pca_df = pd.DataFrame(
    principal_components,
    # Label the components PC1..PCn, matching the number actually computed
    columns=[f'PC{i}' for i in range(1, n_components + 1)],
    # Reuse df's index so the component rows stay aligned with the source rows
    index=df.index,
)

# Explained Variance Ratio
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

# Plot PCA explained variance: cumulative share of variance captured by the
# first k components, for k = 1..n_components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(8, 5))
plt.plot(range(1, n_components + 1), cumulative_variance, marker='o', linestyle='--')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.show()