In [16]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [17]:
# Read the auction training set from a CSV in the working directory.
df = pd.read_csv('synthetic_auction_training_data_with_market_value.csv')

# Normalise the header: lower-case each column name, spaces become underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Names of the columns pandas stored with the generic object dtype.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# Apply the same lower-case / underscore treatment to the values of those columns.
for c in categorical_columns:
    lowered = df[c].str.lower()
    df[c] = lowered.str.replace(' ', '_')

In [18]:
# Display the column index of df (names were lower-cased / underscored when loading).
df.columns

Index(['team', 'format', 'gender', 'no.', 'name', 'first', 'last', 'mat',
       'runs', 'hs', 'avg', '50', '100', 'balls', 'wkt', 'bbi', 'ave', '5wi',
       'ca', 'st', 'performance_score', 'normalized_score',
       'current_market_value'],
      dtype='object')

In [19]:
# Columns that are converted to a numeric dtype below.
numeric_columns = [
    'runs', 'mat', 'hs', 'avg', '50', '100', 'balls',
    'wkt', 'bbi', 'ave', '5wi', 'ca', 'st',
]

for col in numeric_columns:
    # Cast to text, remove commas and surrounding whitespace, then parse.
    # errors='coerce' turns anything that still is not a number into NaN.
    df[col] = pd.to_numeric(
        df[col].astype(str).str.replace(',', '').str.strip(),
        errors='coerce',
    )

# Discard every row that has a missing value in any of the listed columns.
df = df.dropna(subset=numeric_columns)
print(df.dtypes)



team                     object
format                   object
gender                   object
no.                      object
name                     object
first                    object
last                     object
mat                     float64
runs                    float64
hs                      float64
avg                     float64
50                      float64
100                     float64
balls                   float64
wkt                     float64
bbi                     float64
ave                     float64
5wi                     float64
ca                      float64
st                      float64
performance_score       float64
normalized_score        float64
current_market_value    float64
dtype: object


In [20]:
# Text / identifier columns that are removed before modelling.
non_feature_columns = ["name", "team", "gender", "format", "first", "last", "no."]

# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because df has already lost these columns.
df = df.drop(columns=non_feature_columns, errors="ignore")

# Feature matrix = every remaining column except the target; target = market value.
# NOTE(review): performance_score and normalized_score stay in X. If they are
# computed from current_market_value they leak the target into the features,
# which would explain a near-perfect test score -- confirm how they are derived.
X = df.drop(columns="current_market_value")
y = df["current_market_value"]

In [21]:
# Unfitted random forest with 100 trees; the seed makes the fit repeatable.
rfr = RandomForestRegressor(random_state=0, n_estimators=100)

# Seeded train/test partition; the split proportion is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [22]:
# Fit on the training partition, then display the score on that same data.
# This is a training-set score, so it says nothing about generalisation.
# fit() returns the estimator itself, which is why the calls can be chained.
rfr.fit(X_train, y_train).score(X_train, y_train)

0.9999404503537288

In [23]:
# Predict on the held-out partition.
y_pred = rfr.predict(X_test)

# R^2 and root-mean-squared error of those predictions against the true targets.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE

print("R2 score:", r2)
print("RMSE:", rmse)

R2 score: 0.9998465371478377
RMSE: 11696.952650960655


In [24]:
# Persist the fitted forest to disk (joblib is imported in the notebook's import cell).
joblib.dump(rfr, 'rf_Cric.pkl')

# Save the training column names, in training order, alongside the model --
# presumably so code that loads the model can build inputs with matching columns.
feature_names = X_train.columns.tolist()
joblib.dump(feature_names, 'cric_feature_names.pkl')

# Display the saved list for a quick visual check.
feature_names

['mat',
 'runs',
 'hs',
 'avg',
 '50',
 '100',
 'balls',
 'wkt',
 'bbi',
 'ave',
 '5wi',
 'ca',
 'st',
 'performance_score',
 'normalized_score']