In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Location of the source CSV; it is fetched over HTTP each time this cell runs (no local cache).
url = 'https://media.githubusercontent.com/media/pAppr69/Win-Factor_yashjeet/refs/heads/main/mw_pw_profiles.csv'
# Read the whole file into `df`; later cells rebind `df` to narrowed/cleaned versions of this frame.
df = pd.read_csv(url)

In [5]:
def get_info(df):
    """Summarise every column of ``df``: name, dtype and share of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (indexed by column name) with the
        columns ``Column_Name``, ``Data_Type`` and ``NaN_Percentage``;
        the last one is the percentage of NaN entries, rounded to 2 decimals.
    """
    row_count = len(df)
    missing_counts = df.isna().sum()
    # Same arithmetic order as count / rows * 100, then rounded for display.
    missing_pct = (missing_counts / row_count * 100).round(2)

    summary = pd.DataFrame({
        "Column_Name": df.columns,
        "Data_Type": df.dtypes,
        "NaN_Percentage": missing_pct,
    })
    return summary
# Show the per-column dtype / missing-value summary of the freshly loaded frame as this cell's output.
get_info(df)

Unnamed: 0,Column_Name,Data_Type,NaN_Percentage
player_id,player_id,object,0.0
match_id,match_id,object,0.0
gender,gender,object,0.0
balls_per_over,balls_per_over,int64,0.0
start_date,start_date,object,0.0
series_name,series_name,object,0.47
match_type,match_type,object,0.0
name_x,name_x,object,0.0
runs_scored,runs_scored,int64,0.0
player_out,player_out,int64,0.0


In [7]:
# Columns kept for modelling: identifiers/context, per-match fielding, bowling and
# batting stats, and the three fantasy-score columns.
selected_columns = [
    'player_id', 'start_date', 'gender', 'series_name',
    'catches_taken', 'run_out_direct', 'run_out_throw', 'stumpings_done',
    'balls_bowled', 'runs_conceded', 'wickets_taken', 'maidens',
    'dot_balls_as_bowler', 'dot_balls_as_batsman',
    'player_team', 'opposition_team',
    'runs_scored', 'balls_faced', 'fours_scored', 'sixes_scored',
    'fantasy_score_bowling', 'fantasy_score_batting', 'fantasy_score_total',
]
# Rebind `df` to the narrowed frame (same column order as listed above).
df = df[selected_columns]

In [9]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')
# Derived rate features, currently disabled:
#   strike_rate = (runs_scored / balls_faced) * 100
#   economy     = runs_conceded / balls_bowled


In [11]:
from sklearn.preprocessing import LabelEncoder

# Columns treated as categorical and replaced by integer codes.
categorical_cols = ['player_id','gender', 'series_name', 'player_team', 'opposition_team']

# One fitted encoder per column, keyed by column name, so any encoded column can
# be mapped back to its original labels with
# label_encoders[col].inverse_transform(...). Re-fitting a single shared encoder
# would keep only the mapping of the last column processed.
label_encoders = {}

# NOTE(review): every column is fitted independently, so the same team name may
# receive different integer codes in 'player_team' and 'opposition_team'.
for col in categorical_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder
# After the loop `label_encoder` is the encoder fitted on the last column in
# `categorical_cols`.

In [13]:
# Parse the match dates, put the rows in chronological order and renumber them.
# reset_index() stores the previous row labels in a new 'index' column, which the
# following cell removes.
df = (
    df.assign(start_date=pd.to_datetime(df['start_date']))
      .sort_values(by='start_date')
      .reset_index()
)

In [15]:
# Remove the helper 'index' column left behind by reset_index().
# errors='ignore' keeps this cell re-runnable: a second execution, when the column
# is already gone, is a no-op instead of raising KeyError.
df = df.drop(columns=['index'], errors='ignore')

In [17]:
def calc_ema(df, span, cols=('run_out_direct','run_out_throw', 'sixes_scored','fours_scored','runs_scored','balls_faced',
                             'stumpings_done','runs_conceded','balls_bowled','wickets_taken','maidens','dot_balls_as_bowler')):
    """Replace stat columns with their per-player exponential moving average.

    For every name in ``cols`` that exists in ``df``, a new column
    ``<name>_ema`` is added holding the exponentially weighted mean
    (``ewm(span=span, adjust=False)``) of that column, computed separately
    within each ``player_id`` group, and the raw column is dropped. Names
    that are not present are reported with a printed warning and skipped.

    The average runs along the frame's existing row order, so sort the frame
    chronologically before calling.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``player_id`` column. It is not modified.
    span : int or float
        Span passed to ``Series.ewm``.
    cols : iterable of str, optional
        Columns to convert. The default is an immutable tuple so the shared
        default value cannot be altered between calls.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with each processed column replaced by ``<name>_ema``
        (the new columns are appended after the untouched ones).
    """
    # NOTE(review): the EMA at each row includes that row's own value. If the
    # prediction target of a row is derived from these same stats, the features
    # leak the target -- consider lagging by one match; confirm with the author.
    df_ema = df.copy()
    # Group once on the untouched input instead of rebuilding the groupby for
    # every column; the result shares df's index, so it aligns with the copy.
    grouped = df.groupby('player_id')
    for col in cols:
        if col in df_ema.columns:
            df_ema[f'{col}_ema'] = grouped[col].transform(
                lambda x: x.ewm(span=span, adjust=False).mean())
            df_ema = df_ema.drop(columns=col)
        else:
            print(f"Warning: Column '{col}' not found in DataFrame.")
    return df_ema

In [19]:
# Build the feature frame: per-player EMAs with a span of 5 rows.
df_ema = calc_ema(df, span=5)

In [27]:
# All three fantasy-score columns are removed from the features; only the total is
# used as the target (the batting/bowling scores are presumably components of the
# total -- TODO confirm).
target_cols = ['fantasy_score_batting','fantasy_score_bowling','fantasy_score_total']
X = df_ema.drop(columns=target_cols)
# Take the target from the same frame as the features so rows cannot drift apart
# if either frame is filtered or reordered later.
y = df_ema['fantasy_score_total']
# NOTE(review): X still contains 'start_date' plus the raw (non-EMA)
# 'catches_taken' and 'dot_balls_as_batsman' columns, since calc_ema's default
# column list does not cover them -- confirm this is intended.

In [29]:
def train_random_forest(X, y, n_estimators=50, test_size=0.2, random_state=1):
    """Fit a random-forest regressor on a random train/test split and report scores.

    The data is split with ``train_test_split`` (which shuffles rows), a
    ``RandomForestRegressor`` is fitted on the training part, and the mean
    squared error and R² on the held-out part are printed.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix; must be in a form scikit-learn accepts (presumably
        all-numeric -- verify against the caller).
    y : array-like or pandas.Series
        Target values, row-aligned with ``X``.
    n_estimators : int, optional
        Number of trees in the forest (default 50).
    test_size : float, optional
        Fraction of rows held out for evaluation (default 0.2).
    random_state : int, optional
        Seed used for both the split and the forest (default 1).

    Returns
    -------
    sklearn.ensemble.RandomForestRegressor
        The fitted model.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split

    # NOTE(review): this is a shuffled split. If the rows are time-ordered match
    # records, a chronological hold-out may give a fairer estimate -- confirm.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    ## evaluating the model on the held-out rows
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("-- Random Forest Evaluation Scores--")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R² Score: {r2:.4f}")

    return model

In [31]:
# Train on every feature except the raw match date, and keep the fitted model
# so it can be inspected or used for predictions in later cells.
rf_model = train_random_forest(X.drop(columns='start_date'), y)

-- Random Forest Evaluation Scores--
Mean Squared Error: 484.1149
R² Score: 0.7430
