### Consistency Score = w1 * (Inns / Mat) + w2 * (NO / Inns) + w3 * (Runs / BF)  + w4 * Ave + w5 * ((50s + 2*100s) / Inns) - w6 * (Zeros / Inns)


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def analyze_and_visualize(df):
    # Select Features for Analysis
    features = ['Mat', 'Inns', 'NO', 'HS', 'Runs', 
                'BF', 'Ave', "100's", "50's", "0's", '4s', '6s']
    df_analysis = df[features]
    
    # Replace infinities and NaN with 0
    df_analysis = df_analysis.replace([np.inf, -np.inf], 0)
    df_analysis = df_analysis.fillna(0)
    

    # Correlation Analysis
    correlation_matrix = df_analysis.corr()
    
    # Visualize Correlation Matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix of Features')
    plt.show()
    
    return df_analysis

In [3]:
def calculate_batting_consistency(df):
    df_components = df.compy()
    df_components["Inns_Per_Match"] = df_components["Inns"] / df_components["Mat"]
    df_components["No_Outs_Per_Inns"] = df_components["NO"] / df_components["Inns"]
    df_components["Runs_Per_Ball"] = df_components["Runs"] / df_components["BF"]
    df_components["Ave"] = df_components["Runs"] / df_components["Inns"]
    df_components["Hundreds_Per_Inns"] = df_components["100's"] / df_components["Inns"]
    df_components["Fifties_Per_Inns"] = df_components["50's"] / df_components["Inns"]
    
    return df_components

In [4]:
data = pd.read_excel("../../all seasons/BattingDataset.xlsx")
df = data.copy()
df.replace('-', 0, inplace=True)
print(df.dtypes)

Player     object
Span       object
Mat         int64
Inns        int64
NO          int64
Runs        int64
HS          int64
Ave       float64
BF          int64
SR        float64
100’s       int64
50’s        int64
0’s         int64
4s          int64
6s          int64
dtype: object


  df.replace('-', 0, inplace=True)


In [5]:
numeric_cols = ['Inns', 'Mat', 'Overs', 'NO', 'Runs', 'Wkts', 'Ave', 'Econ', 'SR', '4W', '5W'] # Added Mdns
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce') # Use df here

# Verify data types AFTER explicit conversion
print("\nData types of your columns AFTER explicit numeric conversion:")
print(df.dtypes) # Use df here

# --- Correctly Remove Rows with NaN Values ---
cols_to_check_nan = ['Inns', 'Mat', 'Overs', 'Wkts', 'Ave', 'Econ', 'SR', '4W', '5W'] # Added Mdns if relevant, keep consistent
print("\nNumber of NaN values BEFORE removing:")
print(df[cols_to_check_nan].isna().sum()) # Check NaN counts in relevant columns before removal

df_cleaned = df.dropna(subset=cols_to_check_nan, how='any') # Use how='any' to drop row if ANY of these cols is NaN
print("\nNumber of rows BEFORE removing NaNs:", len(df))
print("Number of rows AFTER removing NaNs:", len(df_cleaned)) # Print length of df_cleaned


# Calculate bowler formula components using the CLEANED data (df_cleaned)
calculations_with_data = calculate_batting_consistency(df_cleaned) # Pass df_cleaned

# Select the component columns for correlation analysis
component_columns = [
    'Inns_per_Mat',
    'Overs_per_Mat',
    'Wkts_per_Overs',
    'Ave_Score',
    'Econ_Rate',
    'SR_per_100',
    '4W5W_per_Mat'
]

# Calculate the correlation matrix for the components
correlation_matrix = calculations_with_data[component_columns].corr() # Use calculations_with_data

# Display the correlation matrix
print("\nCorrelation Matrix of Bowler Consistency Formula Components (after NaN removal):")
print(correlation_matrix)

KeyError: 'Overs'