In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [87]:
# Load the career batting statistics CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="data/odb.csv")

In [88]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s
0,0,SR Tendulkar (INDIA),1989-2012,463,452,41,18426,200*,44.83,21368,86.23,49,96,20,2016,195
1,1,V Kohli (INDIA),2008-2021,254,245,39,12169,183,59.07,13061,93.17,43,62,13,1140,125
2,2,RT Ponting (AUS/ICC),1995-2012,375,365,39,13704,164,42.03,17046,80.39,30,82,20,1231,162
3,3,RG Sharma (INDIA),2007-2021,227,220,32,9205,264,48.96,10354,88.9,29,43,13,832,244
4,4,ST Jayasuriya (Asia/SL),1989-2011,445,433,18,13430,189,32.36,14725,91.2,28,68,34,1500,270


In [89]:
# Column dtypes right after loading; object columns are candidates for numeric conversion.
df.dtypes

Unnamed: 0      int64
Player         object
Span           object
Mat             int64
Inns            int64
NO              int64
Runs            int64
HS             object
Ave           float64
BF              int64
SR            float64
100             int64
50              int64
0               int64
4s             object
6s             object
dtype: object

In [90]:
# Count missing values per column in the raw frame.
df.isna().sum()


Unnamed: 0    0
Player        0
Span          0
Mat           0
Inns          0
NO            0
Runs          0
HS            0
Ave           0
BF            0
SR            0
100           0
50            0
0             0
4s            0
6s            0
dtype: int64

In [91]:
# Drop identifier/bookkeeping columns that are not used as model features.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['Span', 'NO', 'Unnamed: 0', 'Player'], errors='ignore')

In [92]:
# Confirm which columns remain after the drop.
df.columns

Index(['Mat', 'Inns', 'Runs', 'HS', 'Ave', 'BF', 'SR', '100', '50', '0', '4s',
       '6s'],
      dtype='object')

In [93]:
# Remove the asterisk that some highest scores carry (e.g. "200*").
df['HS'] = df['HS'].replace(to_replace=r"\*", value="", regex=True)

In [94]:
# Re-check dtypes: stripping the asterisk does not change the column's dtype by itself.
df.dtypes

Mat       int64
Inns      int64
Runs      int64
HS       object
Ave     float64
BF        int64
SR      float64
100       int64
50        int64
0         int64
4s       object
6s       object
dtype: object

In [95]:
# Convert 'HS' (highest score) to float.
# Any "*" suffix (e.g. "200*") is stripped BEFORE the numeric conversion: doing
# it afterwards is a no-op, because to_numeric(errors='coerce') would already
# have turned such values into NaN. Only the asterisk is removed (not every
# non-digit character) so a decimal point or sign is never silently dropped.
# astype(str) makes the cell safe to re-run on an already-numeric column.
df['HS'] = pd.to_numeric(
    df['HS'].astype(str).str.replace('*', '', regex=False).str.strip(),
    errors='coerce',  # anything still unparseable becomes NaN
).astype(float)

In [96]:
# Defensive re-check of 'HS': clean and convert only if the column still holds
# non-numeric data; an already-numeric column is left untouched.
# Only the "*" marker is stripped (rather than every non-digit character) so
# that decimal points survive, e.g. "12.5" must not become "125".
if not pd.api.types.is_numeric_dtype(df['HS']):
    df['HS'] = pd.to_numeric(
        df['HS'].astype(str).str.replace('*', '', regex=False).str.strip(),
        errors='coerce',  # invalid parsing becomes NaN
    )

In [97]:
# The boundary-count columns are read as object dtype; coerce each to numbers.
# Entries that cannot be parsed become NaN.
for count_col in ('4s', '6s'):
    df[count_col] = pd.to_numeric(df[count_col], errors='coerce')

In [98]:
# Verify that HS, 4s and 6s are now numeric.
df.dtypes

Mat       int64
Inns      int64
Runs      int64
HS      float64
Ave     float64
BF        int64
SR      float64
100       int64
50        int64
0         int64
4s      float64
6s      float64
dtype: object

In [None]:
# Preview the first five rows after type conversion.
df.head(n=5)

Unnamed: 0,Mat,Inns,Runs,HS,Ave,BF,SR,100,50,0,4s,6s
0,463,452,18426,200.0,44.83,21368,86.23,49,96,20,2016.0,195.0
1,254,245,12169,183.0,59.07,13061,93.17,43,62,13,1140.0,125.0
2,375,365,13704,164.0,42.03,17046,80.39,30,82,20,1231.0,162.0
3,227,220,9205,264.0,48.96,10354,88.9,29,43,13,832.0,244.0
4,445,433,13430,189.0,32.36,14725,91.2,28,68,34,1500.0,270.0


In [100]:
# Count the NaNs introduced by coercing unparseable entries.
df.isna().sum()

Mat      0
Inns     0
Runs     0
HS       0
Ave      0
BF       0
SR       0
100      0
50       0
0        0
4s      17
6s      16
dtype: int64

In [107]:
# Treat unparseable boundary counts as zero, filling only these two columns.
df = df.fillna(value={'4s': 0, '6s': 0})

In [108]:
# Confirm that no missing values remain.
df.isna().sum()

Mat     0
Inns    0
Runs    0
HS      0
Ave     0
BF      0
SR      0
100     0
50      0
0       0
4s      0
6s      0
dtype: int64

In [111]:
# Inspect the career runs column.
df.loc[:, 'Runs']

0      18426
1      12169
2      13704
3       9205
4      13430
       ...  
114     5615
115     5658
116     6248
117     6083
118     7170
Name: Runs, Length: 119, dtype: int64

In [112]:
# Custom target 'PPS': a weighted sum of career stats. Runs, average, strike
# rate, hundreds and fifties add to the score; ducks (column '0') subtract.
df['PPS'] = (
    df['Runs'].mul(0.4)
    .add(df['Ave'].mul(0.3))
    .add(df['SR'].mul(0.2))
    .add(df['100'].mul(5))
    .add(df['50'].mul(2.5))
    .sub(df['0'].mul(2))
)

In [113]:
# Engineered features derived from the cleaned columns.

# Runs scored from boundaries: four per '4s' entry plus six per '6s' entry.
df['Boundary_Runs'] = df['4s'].mul(4).add(df['6s'].mul(6))

# Approximate dot-ball share: one minus runs per ball faced, floored at zero.
df['Dot_Ball_Percentage'] = df['Runs'].div(df['BF']).rsub(1).clip(lower=0)

# Batting average per match played; infinite ratios are reset to 0.
df['Consistency'] = df['Ave'].div(df['Mat']).replace([np.inf, -np.inf], 0)


In [119]:
from sklearn.model_selection import train_test_split

# Model inputs: the raw batting columns plus the engineered features.
feature_columns = [
    'Mat', 'Inns', 'Runs', 'HS', 'Ave', 'BF', 'SR', '100', '50', '0',
    '4s', '6s', 'Boundary_Runs', 'Dot_Ball_Percentage', 'Consistency',
]
X = df.loc[:, feature_columns]
y = df.loc[:, 'PPS']  # custom target

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [120]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transformation to both splits so test-set statistics never influence it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [121]:
# Display the standardised training matrix (a NumPy array, one column per feature).
X_train_scaled

array([[-0.48513693, -0.44673253, -0.32205039, ..., -0.81711702,
        -0.54988686, -0.01634236],
       [ 0.66563904,  0.73012288,  0.66876153, ...,  0.99879425,
        -0.35406682, -0.67076583],
       [ 2.94440334,  2.88971322,  2.24934246, ...,  0.14809708,
         0.06880313, -1.04578018],
       ...,
       [ 0.09594797, -0.22834699, -0.37408884, ...,  0.58980522,
        -1.25688794, -0.38069349],
       [-1.2143415 , -1.23534698, -0.8906571 , ..., -0.24453238,
         0.48064752,  1.92016532],
       [-0.99785889, -1.0412265 , -1.15952238, ..., -0.50628536,
        -0.3993759 ,  0.31863417]], shape=(95, 15))

In [122]:
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): 'PPS' is computed directly from Runs, Ave, SR, 100, 50 and 0,
# all of which are also model features, so the forest is re-learning a known
# formula — confirm this is intended.
# NOTE(review): the forest is fitted on the unscaled X_train while a
# StandardScaler is fitted separately — confirm which representation the
# consumer of these artifacts is expected to feed the model.

# Fixed seed so the forest (and therefore the ranking) is reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict from the training feature columns only, so re-running this cell still
# works after 'Predicted_PPS' has been attached to X_test.
feature_names = list(X_train.columns)
predictions = model.predict(X_test[feature_names])

# assign() builds a new frame instead of writing into the slice returned by
# train_test_split, which avoids SettingWithCopyWarning.
X_test = X_test.assign(Predicted_PPS=predictions)

# Eleven held-out rows with the highest predicted score; the index still holds
# the original row labels of df.
top_11 = X_test.sort_values(by='Predicted_PPS', ascending=False).head(11)

In [123]:
# Display the fitted scaler and its parameters.
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [126]:
import pickle

# Persist the fitted scaler. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)