In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib
import os

## Data Collection➡️

In [21]:
input_file = r"D:\PYTHON\Projects\Cricket-win-predictor\data\matches.csv"
df = pd.read_csv(input_file)

print(f"Original Data shape: {df.shape}")

Original Data shape: (756, 18)


In [22]:
df.head()

Unnamed: 0,id,Season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,IPL-2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,IPL-2017,Pune,06-04-2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,IPL-2017,Rajkot,07-04-2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,
3,4,IPL-2017,Indore,08-04-2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,
4,5,IPL-2017,Bangalore,08-04-2017,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,


In [23]:
columns_needed = ["Season", "team1", "team2", "toss_winner", "toss_decision", "venue", "winner"]
df = df[columns_needed]

In [24]:
df.head()

Unnamed: 0,Season,team1,team2,toss_winner,toss_decision,venue,winner
0,IPL-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad
1,IPL-2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,Maharashtra Cricket Association Stadium,Rising Pune Supergiant
2,IPL-2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,Saurashtra Cricket Association Stadium,Kolkata Knight Riders
3,IPL-2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,Holkar Cricket Stadium,Kings XI Punjab
4,IPL-2017,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,M Chinnaswamy Stadium,Royal Challengers Bangalore


In [25]:
# Drop rows with missing values
df = df.dropna()

In [26]:
print(f"After Cleaning: {df.shape}")

After Cleaning: (752, 7)


In [27]:
df.head()

Unnamed: 0,Season,team1,team2,toss_winner,toss_decision,venue,winner
0,IPL-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad
1,IPL-2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,Maharashtra Cricket Association Stadium,Rising Pune Supergiant
2,IPL-2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,Saurashtra Cricket Association Stadium,Kolkata Knight Riders
3,IPL-2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,Holkar Cricket Stadium,Kings XI Punjab
4,IPL-2017,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,M Chinnaswamy Stadium,Royal Challengers Bangalore


In [28]:
# Define home venue mapping
home_venues = {
    'Mumbai Indians': 'Wankhede Stadium',
    'Chennai Super Kings': 'MA Chidambaram Stadium, Chepauk',
    'Royal Challengers Bangalore': 'M Chinnaswamy Stadium',
    'Kolkata Knight Riders': 'Eden Gardens',
    'Delhi Capitals': 'Arun Jaitley Stadium',
    'Sunrisers Hyderabad': 'Rajiv Gandhi International Stadium, Uppal',
    'Rajasthan Royals': 'Sawai Mansingh Stadium',
    'Punjab Kings': 'Punjab Cricket Association IS Bindra Stadium, Mohali',
    'Gujarat Titans': 'Narendra Modi Stadium',
    'Lucknow Super Giants': 'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium',
    # Old IPL teams
    'Rising Pune Supergiant': 'Maharashtra Cricket Association Stadium',
    'Gujarat Lions': 'Saurashtra Cricket Association Stadium'
}

# Reverse map: venue → team
venue_to_team = {v: k for k, v in home_venues.items()}

# Calculate home advantage
def calc_home_advantage(row):
    team1 = row['team1']
    venue = row['venue']
    home_team = venue_to_team.get(venue)
    return 1 if home_team == team1 else 0

df['Home_advantage'] = df.apply(calc_home_advantage, axis=1)
print(df['Home_advantage'].value_counts())
df.head()


Home_advantage
0    562
1    190
Name: count, dtype: int64


Unnamed: 0,Season,team1,team2,toss_winner,toss_decision,venue,winner,Home_advantage
0,IPL-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,1
1,IPL-2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,Maharashtra Cricket Association Stadium,Rising Pune Supergiant,0
2,IPL-2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,Saurashtra Cricket Association Stadium,Kolkata Knight Riders,1
3,IPL-2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,Holkar Cricket Stadium,Kings XI Punjab,0
4,IPL-2017,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,M Chinnaswamy Stadium,Royal Challengers Bangalore,1


In [None]:
categorical_cols = ['team1', 'team2', 'toss_winner', 'toss_decision', 'venue', 'winner', 'Season']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

print("Categorical columns encoded successfully.")
df.head()


Categorical columns encoded successfully.


Unnamed: 0,Season,team1,team2,toss_winner,toss_decision,venue,winner,Home_advantage
0,9,14,13,13,1,28,14,1
1,9,8,11,11,1,21,11,0
2,9,4,7,7,1,31,7,1
3,9,11,5,5,1,13,5,0
4,9,13,3,13,0,17,13,1


### Save Processed Data and Encoders📂

In [37]:
# Ensure directories exist
project_root = os.path.dirname(os.path.dirname(input_file))
data_dir = os.path.join(project_root, 'data')
out_dir = os.path.join(project_root, 'out')
os.makedirs(data_dir, exist_ok=True)
os.makedirs(out_dir, exist_ok=True)

# Save processed data
output_file = os.path.join(data_dir, 'processed.csv')
df.to_csv(output_file, index=False)
print(f"Processed data saved to {output_file}")

# Save encoders
encoders_path = os.path.join(out_dir, 'encoders.pkl')
joblib.dump(encoders, encoders_path)
print(f"Encoders saved to {encoders_path}")


Processed data saved to D:\PYTHON\Projects\Cricket-win-predictor\data\processed.csv
Encoders saved to D:\PYTHON\Projects\Cricket-win-predictor\out\encoders.pkl
