In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
import warnings

In [3]:
# Ignore warnings for clean output
warnings.filterwarnings("ignore")

In [4]:
# Load datasets
matches = pd.read_csv("/kaggle/input/ipl-dataset/matches.csv")
deliveries = pd.read_csv("/kaggle/input/ipl-dataset/deliveries.csv")

In [None]:
# Display first few rows of matches datasets
print("Matches Dataset:")
display(matches.head())

In [None]:
# Display first few rows of deliveries datasets
print("Deliveries Dataset:")
display(deliveries.head())

In [None]:
# Check for missing values
print("\nMissing values in Matches dataset:")
print(matches.isnull().sum())

In [None]:
print("\nMissing values in Deliveries dataset:")
print(deliveries.isnull().sum())

In [5]:
# Drop unnecessary columns 
matches.drop(columns=['umpire1', 'umpire2', 'method'], inplace=True)

In [6]:
matches['city'].fillna('Unknown',inplace=True)

In [7]:
matches.dropna(subset=['player_of_match'], inplace=True)

In [8]:
# Standardize team names
team_name_changes = {
    "Delhi Daredevils": "Delhi Capitals",
    "Deccan Chargers": "Sunrisers Hyderabad",
    "Gujarat Lions": "Gujarat Titans",
    "Kings XI Punjab": "Punjab Kings",
    "Royal Challengers Bangalore": "Royal Challengers Bengaluru",
    "Rising Pune Supergiant": "Rising Pune Supergiants"
    
}

In [9]:
matches.replace({"team1": team_name_changes, "team2": team_name_changes, 
                 "winner": team_name_changes, "toss_winner": team_name_changes}, inplace=True)

In [10]:
deliveries.replace({"batting_team": team_name_changes, "bowling_team": team_name_changes}, inplace=True)

In [11]:
# Define teams to remove
teams_to_remove = ["Rising Pune Supergiants", "Kochi Tuskers Kerala", "Pune Warriors"]

# Filter out matches where either team1 or team2 is in teams_to_remove
matches = matches[~matches['team1'].isin(teams_to_remove) & ~matches['team2'].isin(teams_to_remove)]

# Reset index
matches.reset_index(drop=True, inplace=True)

print("Updated dataset shape:", matches.shape)

Updated dataset shape: (1002, 17)


In [12]:
unique_teams = set(matches['team1']).union(set(matches['team2']))
print("Unique Teams:", unique_teams)

Unique Teams: {'Lucknow Super Giants', 'Gujarat Titans', 'Rajasthan Royals', 'Delhi Capitals', 'Sunrisers Hyderabad', 'Chennai Super Kings', 'Mumbai Indians', 'Kolkata Knight Riders', 'Punjab Kings', 'Royal Challengers Bengaluru'}


In [13]:
# Remove matches that had super overs 
matches = matches[matches['result'] != "tie"]

In [17]:
# Check for missing values
print("\nMissing values in Matches dataset:")
print(matches.isnull().sum())


Missing values in Matches dataset:
id                 0
season             0
city               0
date               0
match_type         0
player_of_match    0
venue              0
team1              0
team2              0
toss_winner        0
toss_decision      0
winner             0
result             0
result_margin      0
target_runs        0
target_overs       0
super_over         0
dtype: int64


In [16]:
# Convert date to datetime format for time-based analysis
matches['date'] = pd.to_datetime(matches['date'])

In [None]:
# Save cleaned datasets
matches.to_csv("cleaned_matches.csv", index=False)
deliveries.to_csv("cleaned_deliveries.csv", index=False)

In [18]:
# Compute total matches played by each team
matches_played = pd.concat([matches[['team1', 'winner']], matches[['team2', 'winner']].rename(columns={'team2': 'team1'})])
team_total_matches = matches_played.groupby('team1').size()

# Compute total wins by each team
team_wins = matches_played.groupby('team1')['winner'].apply(lambda x: (x == x.name).sum())

# Compute win percentage for each team
win_percentage = (team_wins / team_total_matches).fillna(0)

# Map win percentage back to matches dataset
matches['team1_win_percentage'] = matches['team1'].map(win_percentage)
matches['team2_win_percentage'] = matches['team2'].map(win_percentage)

In [19]:
# Define home cities for each team
team_home_cities = {
    'Mumbai Indians': ['Mumbai', 'Navi Mumbai'],
    'Chennai Super Kings': ['Chennai'],
    'Kolkata Knight Riders': ['Kolkata'],
    'Royal Challengers Bengaluru': ['Bangalore', 'Bengaluru'],
    'Sunrisers Hyderabad': ['Hyderabad'],
    'Delhi Capitals': ['Delhi'],
    'Punjab Kings': ['Chandigarh', 'Mohali'],
    'Rajasthan Royals': ['Jaipur'],
    'Gujarat Titans': ['Ahmedabad'],
    'Lucknow Super Giants': ['Lucknow'],
}

# Add additional cities where teams have played home games occasionally
extra_home_grounds = {
    'Punjab Kings': ['Dharamsala'],
    'Delhi Capitals': ['Raipur'],
    'Kolkata Knight Riders': ['Ranchi'],
    'Rajasthan Royals': ['Ahmedabad'],
    'Sunrisers Hyderabad': ['Visakhapatnam']
}

# Merge additional home grounds
for team, cities in extra_home_grounds.items():
    team_home_cities[team].extend(cities)

print(team_home_cities)


{'Mumbai Indians': ['Mumbai', 'Navi Mumbai'], 'Chennai Super Kings': ['Chennai'], 'Kolkata Knight Riders': ['Kolkata', 'Ranchi'], 'Royal Challengers Bengaluru': ['Bangalore', 'Bengaluru'], 'Sunrisers Hyderabad': ['Hyderabad', 'Visakhapatnam'], 'Delhi Capitals': ['Delhi', 'Raipur'], 'Punjab Kings': ['Chandigarh', 'Mohali', 'Dharamsala'], 'Rajasthan Royals': ['Jaipur', 'Ahmedabad'], 'Gujarat Titans': ['Ahmedabad'], 'Lucknow Super Giants': ['Lucknow']}


In [20]:
# Function to check if the city is a home city for team1
def is_home_team(team, city):
    return int(city in team_home_cities.get(team, []))

# Apply the function to create the isTeam1Home column
matches['isTeam1Home'] = matches.apply(lambda x: is_home_team(x['team1'], x['city']), axis=1)

In [21]:
unique_teams = set(matches['team1']).union(set(matches['team2']))
print("Unique Teams:", unique_teams)


Unique Teams: {'Lucknow Super Giants', 'Gujarat Titans', 'Rajasthan Royals', 'Delhi Capitals', 'Sunrisers Hyderabad', 'Chennai Super Kings', 'Mumbai Indians', 'Kolkata Knight Riders', 'Punjab Kings', 'Royal Challengers Bengaluru'}


In [None]:
matches.head()

In [None]:
# Count total matches played at each venue by each team
venue_matches = pd.concat([
    matches[['team1', 'venue']].rename(columns={'team1': 'team'}),
    matches[['team2', 'venue']].rename(columns={'team2': 'team'})
]).groupby(['team', 'venue']).size()

# Count total matches played by each team
total_matches = pd.concat([
    matches[['team1']].rename(columns={'team1': 'team'}),
    matches[['team2']].rename(columns={'team2': 'team'})
]).groupby('team').size()

# Compute home advantage as a simple dictionary
home_advantage = (venue_matches / total_matches).fillna(0).to_dict()

# Define a function to fetch home advantage safely
def get_home_advantage(team, venue):
    return home_advantage.get((team, venue), 0)

# Apply the function to both team1 and team2
matches['team1_home_advantage'] = matches.apply(lambda x: get_home_advantage(x['team1'], x['venue']), axis=1)
matches['team2_home_advantage'] = matches.apply(lambda x: get_home_advantage(x['team2'], x['venue']), axis=1)

In [None]:
# Powerplay & Death Overs Performance
powerplay = deliveries[(deliveries['over'] >= 1) & (deliveries['over'] <= 6)]
death_overs = deliveries[(deliveries['over'] >= 16) & (deliveries['over'] <= 20)]

In [None]:
def team_phase_performance(df, phase_name):
    phase_runs = df.groupby('batting_team')['total_runs'].sum().rename(f'{phase_name}_runs')
    return phase_runs

powerplay_runs = team_phase_performance(powerplay, 'powerplay')
death_runs = team_phase_performance(death_overs, 'death')

# Merge into matches ensuring correct alignment
matches = matches.merge(powerplay_runs, left_on='team1', right_index=True, how='left')
matches = matches.merge(death_runs, left_on='team1', right_index=True, how='left')

# Fill missing values with 0 (if some teams had no matches in that phase)
matches['powerplay_runs'].fillna(0, inplace=True)
matches['death_runs'].fillna(0, inplace=True)

In [22]:
# Venue Impact Feature
venue_win_rate = matches.groupby(['venue', 'winner']).size().unstack(fill_value=0)

# Normalize win rates per venue
venue_win_rate = venue_win_rate.div(venue_win_rate.sum(axis=1), axis=0)

# Convert to long format for merging
venue_win_rate = venue_win_rate.reset_index().melt(id_vars=['venue'], var_name='venue_team', value_name='win_rate')

# Merge venue win rate but avoid duplicate columns
matches = matches.merge(venue_win_rate, left_on=['venue', 'team1'], right_on=['venue', 'venue_team'], how='left')

# Drop the extra 'venue_team' column
matches.drop(columns=['venue_team'], inplace=True)

In [23]:
matches.head()

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,...,winner,result,result_margin,target_runs,target_overs,super_over,team1_win_percentage,team2_win_percentage,isTeam1Home,win_rate
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bengaluru,Kolkata Knight Riders,Royal Challengers Bengaluru,...,Kolkata Knight Riders,runs,140.0,223.0,20.0,N,0.470588,0.521186,1,0.410714
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Punjab Kings,Chennai Super Kings,Chennai Super Kings,...,Chennai Super Kings,runs,33.0,241.0,20.0,N,0.445887,0.583333,1,0.5
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Capitals,Rajasthan Royals,Rajasthan Royals,...,Delhi Capitals,wickets,9.0,130.0,20.0,N,0.451064,0.502392,1,0.4
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bengaluru,Mumbai Indians,...,Royal Challengers Bengaluru,wickets,5.0,166.0,20.0,N,0.553279,0.470588,1,0.6
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Sunrisers Hyderabad,Sunrisers Hyderabad,...,Kolkata Knight Riders,wickets,5.0,111.0,20.0,N,0.521186,0.450413,1,0.589041


In [24]:
### Feature 2: Last 5 Matches Win Percentage
match_results = matches[['team1', 'team2', 'winner', 'date']]
match_results['date'] = pd.to_datetime(match_results['date'])

In [25]:
# Function to calculate last 5 matches win percentage
def calculate_win_percentage(team, date):
    recent_matches = match_results[(match_results['date'] < date) & ((match_results['team1'] == team) | (match_results['team2'] == team))].tail(5)
    wins = (recent_matches['winner'] == team).sum()
    return wins / 5 if len(recent_matches) > 0 else 0

matches['team1_last5_win_pct'] = matches.apply(lambda row: calculate_win_percentage(row['team1'], row['date']), axis=1)
matches['team2_last5_win_pct'] = matches.apply(lambda row: calculate_win_percentage(row['team2'], row['date']), axis=1)


In [None]:
# Powerplay & Death Overs DataFrames
powerplay = deliveries[(deliveries['over'] >= 1) & (deliveries['over'] <= 6)]
death_overs = deliveries[(deliveries['over'] >= 16) & (deliveries['over'] <= 20)]

# Function to compute average runs per match & team
def team_phase_avg(df, phase_name):
    phase_avg = df.groupby(['match_id', 'batting_team'])['total_runs'].mean().rename(f'{phase_name}_avg').reset_index()
    return phase_avg

# Compute Average Powerplay & Death Over Runs
powerplay_avg = team_phase_avg(powerplay, 'powerplay')
death_avg = team_phase_avg(death_overs, 'death')

# Rename before merging to prevent column conflicts
powerplay_avg_team1 = powerplay_avg.rename(columns={'batting_team': 'team1', 'powerplay_avg': 'team1_powerplay_avg'})
powerplay_avg_team2 = powerplay_avg.rename(columns={'batting_team': 'team2', 'powerplay_avg': 'team2_powerplay_avg'})

death_avg_team1 = death_avg.rename(columns={'batting_team': 'team1', 'death_avg': 'team1_death_avg'})
death_avg_team2 = death_avg.rename(columns={'batting_team': 'team2', 'death_avg': 'team2_death_avg'})

# Merge into matches dataset
matches = matches.merge(powerplay_avg_team1, on=['id', 'team1'], how='left')
matches = matches.merge(powerplay_avg_team2, on=['id', 'team2'], how='left')
matches = matches.merge(death_avg_team1, on=['id', 'team1'], how='left')
matches = matches.merge(death_avg_team2, on=['id', 'team2'], how='left')

# Fill missing values with 0
matches[['team1_powerplay_avg', 'team2_powerplay_avg', 'team1_death_avg', 'team2_death_avg']] = matches[
    ['team1_powerplay_avg', 'team2_powerplay_avg', 'team1_death_avg', 'team2_death_avg']].fillna(0)


In [26]:
# Toss Impact
matches['toss_win_bat'] = matches['toss_decision'].apply(lambda x: 1 if x == 'bat' else 0)

In [27]:
# Compute head-to-head win rate
head_to_head = matches.groupby(['team1', 'team2'])['winner'].apply(lambda x: (x == x.name).mean()).reset_index(name='head_to_head_win_rate')

# Merge with the matches dataframe
matches = matches.merge(head_to_head, on=['team1', 'team2'], how='left')

# Fill NaN values with 0.5 as default
matches['head_to_head_win_rate'].fillna(0.5, inplace=True)


In [28]:
print(matches.columns)

Index(['id', 'season', 'city', 'date', 'match_type', 'player_of_match',
       'venue', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner',
       'result', 'result_margin', 'target_runs', 'target_overs', 'super_over',
       'team1_win_percentage', 'team2_win_percentage', 'isTeam1Home',
       'win_rate', 'team1_last5_win_pct', 'team2_last5_win_pct',
       'toss_win_bat', 'head_to_head_win_rate'],
      dtype='object')


In [29]:
def add_advanced_features(df):
    # Convert 'winner' to binary flags for both teams
    df['team1_won'] = (df['winner'] == df['team1']).astype(int)
    df['team2_won'] = (df['winner'] == df['team2']).astype(int)

    # Rolling Win Percentage Over Last 10 Matches
    df['team1_recent_form'] = df.groupby('team1')['team1_won'].transform(lambda x: x.rolling(10, min_periods=1).mean())
    df['team2_recent_form'] = df.groupby('team2')['team2_won'].transform(lambda x: x.rolling(10, min_periods=1).mean())

    # Win Streak (Last 5 Matches)
    df['team1_win_streak'] = df.groupby('team1')['team1_won'].transform(lambda x: x.rolling(5, min_periods=1).sum())
    df['team2_win_streak'] = df.groupby('team2')['team2_won'].transform(lambda x: x.rolling(5, min_periods=1).sum())

    # Toss Impact (Win Rate After Toss Win)
    df['team1_toss_win'] = (df['toss_winner'] == df['team1']).astype(int)
    df['team2_toss_win'] = (df['toss_winner'] == df['team2']).astype(int)

    df['team1_toss_win_success'] = df.groupby('team1')['team1_won'].transform(lambda x: x.rolling(10, min_periods=1).mean())
    df['team2_toss_win_success'] = df.groupby('team2')['team2_won'].transform(lambda x: x.rolling(10, min_periods=1).mean())

    # Net Run Rate (NRR) Difference - If NRR values exist in dataset
    if 'team1_nrr' in df.columns and 'team2_nrr' in df.columns:
        df['net_run_rate_diff'] = df['team1_nrr'] - df['team2_nrr']
    
    # Head-to-Head Record
    df['team1_h2h_wins'] = df.groupby(['team1', 'team2'])['team1_won'].cumsum() - df['team1_won']
    df['team2_h2h_wins'] = df.groupby(['team2', 'team1'])['team2_won'].cumsum() - df['team2_won']

    # Home Advantage (if home_team column exists)
    if 'home_team' in df.columns:
        df['team1_home_advantage'] = (df['home_team'] == df['team1']).astype(int)
        df['team2_home_advantage'] = (df['home_team'] == df['team2']).astype(int)
    
    return df

In [30]:
# Apply feature engineering
matches = add_advanced_features(matches)

In [31]:
print(matches.columns)

Index(['id', 'season', 'city', 'date', 'match_type', 'player_of_match',
       'venue', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner',
       'result', 'result_margin', 'target_runs', 'target_overs', 'super_over',
       'team1_win_percentage', 'team2_win_percentage', 'isTeam1Home',
       'win_rate', 'team1_last5_win_pct', 'team2_last5_win_pct',
       'toss_win_bat', 'head_to_head_win_rate', 'team1_won', 'team2_won',
       'team1_recent_form', 'team2_recent_form', 'team1_win_streak',
       'team2_win_streak', 'team1_toss_win', 'team2_toss_win',
       'team1_toss_win_success', 'team2_toss_win_success', 'team1_h2h_wins',
       'team2_h2h_wins'],
      dtype='object')


In [32]:
# Select only the required columns
df_selected = matches[['id', 'season', 'city', 'date', 'match_type', 
                  'venue', 'team1', 'team2', 'toss_winner', 'winner']]

# Save the new dataset
df_selected.to_csv("filtered_matches.csv", index=False)

In [33]:
features = [
    'team1', 'team2', 'team1_win_percentage', 'team2_win_percentage', 'isTeam1Home', 
    'team1_last5_win_pct', 'team2_last5_win_pct', 'team1_recent_form', 'team2_recent_form', 
    'team1_win_streak','team2_win_streak', 'team1_toss_win', 'team2_toss_win',
    'team1_toss_win_success', 'team2_toss_win_success', 'team1_h2h_wins','team2_h2h_wins']

In [34]:
# Feature Engineering for Prediction
X = matches[features]
y = matches['winner']

In [None]:
X = pd.get_dummies(X, drop_first=True)
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [35]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
expected_features = X_train.columns

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the objective function
def objective(trial):
    # Hyperparameters to tune
    n_estimators = trial.suggest_int("n_estimators", 50, 300, step=50)
    max_depth = trial.suggest_int("max_depth", 5, 50, step=5)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])

    # Train model with selected hyperparameters
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1
    )
    
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    # Return accuracy as the objective metric
    return accuracy_score(y_test, y_pred)

# Run Optuna hyperparameter optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Get best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train final model with best parameters
best_rf = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
best_rf.fit(X_train, y_train)
y_pred_best_rf = best_rf.predict(X_test)

# Evaluate the optimized model
print("Optimized Random Forest Accuracy:", accuracy_score(y_test, y_pred_best_rf))


In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the objective function
def objective(trial):
    # Hyperparameters to tune
    n_estimators = trial.suggest_int("n_estimators", 50, 300, step=50)
    max_depth = trial.suggest_int("max_depth", 5, 50, step=5)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])

    # Train model with selected hyperparameters
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1
    )
    
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    # Return accuracy as the objective metric
    return accuracy_score(y_test, y_pred)

# Run Optuna hyperparameter optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Get best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train final model with best parameters
best_rf = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
best_rf.fit(X_train, y_train)
y_pred_best_rf = best_rf.predict(X_test)

# Evaluate the optimized model
print("Optimized Random Forest Accuracy:", accuracy_score(y_test, y_pred_best_rf))


In [38]:
import pandas as pd
import numpy as np
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# Define features
features = [
    'team1', 'team2', 'team1_win_percentage', 'team2_win_percentage', 'isTeam1Home', 
    'team1_last5_win_pct', 'team2_last5_win_pct', 'team1_recent_form', 'team2_recent_form', 
    'team1_win_streak', 'team2_win_streak', 'team1_toss_win', 'team2_toss_win',
    'team1_toss_win_success', 'team2_toss_win_success', 'team1_h2h_wins', 'team2_h2h_wins'
]

# Feature Engineering for Prediction
X = matches[features].copy()
y = matches['winner']

# 1️⃣ Label Encode Categorical Variables (team1, team2)
label_encoder_teams = LabelEncoder()
X['team1'] = label_encoder_teams.fit_transform(X['team1'])
X['team2'] = label_encoder_teams.transform(X['team2'])

# 2️⃣ Standardize Numerical Features (Optional)
num_features = ['team1_win_percentage', 'team2_win_percentage', 'team1_recent_form', 'team2_recent_form']
scaler = StandardScaler()
X[num_features] = scaler.fit_transform(X[num_features])

# 3️⃣ Encode Target Variable (Winner)
label_encoder_winner = LabelEncoder()
y_encoded = label_encoder_winner.fit_transform(y)

# 4️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 3, 20, step=1),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0, step=0.1),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 0.5, step=0.1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1, step=0.1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1, step=0.1)
    }

    # Train model with selected hyperparameters
    xgb = XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric="logloss")
    xgb.fit(X_train, y_train)

    # Predict on validation set
    y_pred = xgb.predict(X_test)
    
    return accuracy_score(y_test, y_pred)

# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Get best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train final XGBoost model with best parameters
best_xgb = XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric="logloss")
best_xgb.fit(X_train, y_train)

# Predict and evaluate
y_pred_best_xgb = best_xgb.predict(X_test)
y_pred_best_xgb_decoded = label_encoder_winner.inverse_transform(y_pred_best_xgb)  # Convert back to team names

print("Optimized XGBoost Accuracy:", accuracy_score(y_test, y_pred_best_xgb))

[I 2025-03-23 16:23:08,407] A new study created in memory with name: no-name-ee921ca5-694c-4996-9865-7d4e0d38da4a
[I 2025-03-23 16:23:10,192] Trial 0 finished with value: 0.6919191919191919 and parameters: {'n_estimators': 500, 'learning_rate': 0.01, 'max_depth': 16, 'subsample': 0.8, 'colsample_bytree': 0.7, 'min_child_weight': 5, 'gamma': 0.2, 'reg_alpha': 0.1, 'reg_lambda': 0.9}. Best is trial 0 with value: 0.6919191919191919.
[I 2025-03-23 16:23:10,803] Trial 1 finished with value: 0.6161616161616161 and parameters: {'n_estimators': 400, 'learning_rate': 0.28, 'max_depth': 20, 'subsample': 0.8, 'colsample_bytree': 0.9, 'min_child_weight': 5, 'gamma': 0.30000000000000004, 'reg_alpha': 0.9, 'reg_lambda': 0.6000000000000001}. Best is trial 0 with value: 0.6919191919191919.
[I 2025-03-23 16:23:11,054] Trial 2 finished with value: 0.6313131313131313 and parameters: {'n_estimators': 100, 'learning_rate': 0.18000000000000002, 'max_depth': 8, 'subsample': 0.5, 'colsample_bytree': 0.7, 'min

Best Hyperparameters: {'n_estimators': 450, 'learning_rate': 0.13, 'max_depth': 20, 'subsample': 0.9, 'colsample_bytree': 1.0, 'min_child_weight': 3, 'gamma': 0.5, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.9}
Optimized XGBoost Accuracy: 0.7070707070707071


In [39]:
print("Best model parameters:", best_xgb.get_params())
print("Number of trees:", best_xgb.n_estimators)
print("Feature importance:", best_xgb.feature_importances_)

Best model parameters: {'objective': 'multi:softprob', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 1.0, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': 'logloss', 'feature_types': None, 'gamma': 0.5, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.13, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 20, 'max_leaves': None, 'min_child_weight': 3, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 450, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': 0.30000000000000004, 'reg_lambda': 0.9, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.9, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'use_label_encoder': False}
Number of trees: 450
Feature importance: [

In [None]:
print(type(best_xgb))

In [None]:
from sklearn.exceptions import NotFittedError

try:
    print(best_xgb.get_booster())
except NotFittedError:
    print("Model is not fitted yet.")

In [None]:
import joblib

# Save the trained model
joblib.dump(best_xgb, "best_xgb_model.pkl")
print("Model saved successfully!")

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Encode target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Define the objective function for Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 3, 20, step=1),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0, step=0.1),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 0.5, step=0.1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1, step=0.1),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1, step=0.1)
    }

    # Train model with selected hyperparameters
    xgb = XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric="logloss")
    xgb.fit(X_train, y_train_encoded)

    # Predict on validation set
    y_pred = xgb.predict(X_test)
    
    return accuracy_score(y_test_encoded, y_pred)

# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Get best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train final XGBoost model with best parameters
best_xgb = XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric="logloss")
best_xgb.fit(X_train, y_train_encoded)

# Predict and evaluate
y_pred_best_xgb = best_xgb.predict(X_test)
y_pred_best_xgb_decoded = label_encoder.inverse_transform(y_pred_best_xgb)  # Convert back to team names

print("Optimized XGBoost Accuracy:", accuracy_score(y_test_encoded, y_pred_best_xgb))


In [None]:
import optuna
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Encode target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Optuna objective function for hyperparameter tuning
def objective(trial):
    # Random Forest parameters
    rf_params = {
        "n_estimators": trial.suggest_int("rf_n_estimators", 50, 500, step=50),
        "max_depth": trial.suggest_int("rf_max_depth", 5, 30, step=1),
        "min_samples_split": trial.suggest_int("rf_min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("rf_min_samples_leaf", 1, 10),
        "bootstrap": trial.suggest_categorical("rf_bootstrap", [True, False]),
    }

    # XGBoost parameters
    xgb_params = {
        "n_estimators": trial.suggest_int("xgb_n_estimators", 50, 500, step=50),
        "learning_rate": trial.suggest_float("xgb_learning_rate", 0.01, 0.3, step=0.01),
        "max_depth": trial.suggest_int("xgb_max_depth", 3, 20, step=1),
        "subsample": trial.suggest_float("xgb_subsample", 0.5, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("xgb_colsample_bytree", 0.5, 1.0, step=0.1),
        "min_child_weight": trial.suggest_int("xgb_min_child_weight", 1, 10),
        "gamma": trial.suggest_float("xgb_gamma", 0, 0.5, step=0.1),
        "reg_alpha": trial.suggest_float("xgb_reg_alpha", 0, 1, step=0.1),
        "reg_lambda": trial.suggest_float("xgb_reg_lambda", 0, 1, step=0.1),
    }

    # Train models with selected hyperparameters
    rf_model = RandomForestClassifier(**rf_params, random_state=42)
    xgb_model = XGBClassifier(**xgb_params, random_state=42, use_label_encoder=False, eval_metric="logloss")

    # Ensemble Voting Classifier
    ensemble_model = VotingClassifier(estimators=[('rf', rf_model), ('xgb', xgb_model)], voting='hard')
    
    # Train the ensemble model
    ensemble_model.fit(X_train, y_train_encoded)

    # Predict on validation set
    y_pred = ensemble_model.predict(X_test)
    
    return accuracy_score(y_test_encoded, y_pred)

# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Get best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train final models with best parameters
best_rf = RandomForestClassifier(
    n_estimators=best_params["rf_n_estimators"],
    max_depth=best_params["rf_max_depth"],
    min_samples_split=best_params["rf_min_samples_split"],
    min_samples_leaf=best_params["rf_min_samples_leaf"],
    bootstrap=best_params["rf_bootstrap"],
    random_state=42
)

best_xgb = XGBClassifier(
    n_estimators=best_params["xgb_n_estimators"],
    learning_rate=best_params["xgb_learning_rate"],
    max_depth=best_params["xgb_max_depth"],
    subsample=best_params["xgb_subsample"],
    colsample_bytree=best_params["xgb_colsample_bytree"],
    min_child_weight=best_params["xgb_min_child_weight"],
    gamma=best_params["xgb_gamma"],
    reg_alpha=best_params["xgb_reg_alpha"],
    reg_lambda=best_params["xgb_reg_lambda"],
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
)

# Final optimized ensemble model
final_ensemble = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='hard')
final_ensemble.fit(X_train, y_train_encoded)

# Predict and evaluate
y_pred_final = final_ensemble.predict(X_test)
print("Optimized Ensemble Model Accuracy:", accuracy_score(y_test_encoded, y_pred_final))


In [None]:
import optuna
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Encode target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Optuna objective function for hyperparameter tuning
def objective(trial):
    # Random Forest parameters
    rf_params = {
        "n_estimators": trial.suggest_int("rf_n_estimators", 50, 500, step=50),
        "max_depth": trial.suggest_int("rf_max_depth", 5, 30, step=1),
        "min_samples_split": trial.suggest_int("rf_min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("rf_min_samples_leaf", 1, 10),
        "bootstrap": trial.suggest_categorical("rf_bootstrap", [True, False]),
    }

    # XGBoost parameters
    xgb_params = {
        "n_estimators": trial.suggest_int("xgb_n_estimators", 50, 500, step=50),
        "learning_rate": trial.suggest_float("xgb_learning_rate", 0.01, 0.3, step=0.01),
        "max_depth": trial.suggest_int("xgb_max_depth", 3, 20, step=1),
        "subsample": trial.suggest_float("xgb_subsample", 0.5, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("xgb_colsample_bytree", 0.5, 1.0, step=0.1),
        "min_child_weight": trial.suggest_int("xgb_min_child_weight", 1, 10),
        "gamma": trial.suggest_float("xgb_gamma", 0, 0.5, step=0.1),
        "reg_alpha": trial.suggest_float("xgb_reg_alpha", 0, 1, step=0.1),
        "reg_lambda": trial.suggest_float("xgb_reg_lambda", 0, 1, step=0.1),
    }

    # Train models with selected hyperparameters
    rf_model = RandomForestClassifier(**rf_params, random_state=42)
    xgb_model = XGBClassifier(**xgb_params, random_state=42, use_label_encoder=False, eval_metric="logloss")

    # Ensemble Voting Classifier
    ensemble_model = VotingClassifier(estimators=[('rf', rf_model), ('xgb', xgb_model)], voting='hard')
    
    # Train the ensemble model
    ensemble_model.fit(X_train, y_train_encoded)

    # Predict on validation set
    y_pred = ensemble_model.predict(X_test)
    
    return accuracy_score(y_test_encoded, y_pred)

# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Get best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train final models with best parameters
best_rf = RandomForestClassifier(
    n_estimators=best_params["rf_n_estimators"],
    max_depth=best_params["rf_max_depth"],
    min_samples_split=best_params["rf_min_samples_split"],
    min_samples_leaf=best_params["rf_min_samples_leaf"],
    bootstrap=best_params["rf_bootstrap"],
    random_state=42
)

best_xgb = XGBClassifier(
    n_estimators=best_params["xgb_n_estimators"],
    learning_rate=best_params["xgb_learning_rate"],
    max_depth=best_params["xgb_max_depth"],
    subsample=best_params["xgb_subsample"],
    colsample_bytree=best_params["xgb_colsample_bytree"],
    min_child_weight=best_params["xgb_min_child_weight"],
    gamma=best_params["xgb_gamma"],
    reg_alpha=best_params["xgb_reg_alpha"],
    reg_lambda=best_params["xgb_reg_lambda"],
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
)

# Final optimized ensemble model
final_ensemble = VotingClassifier(estimators=[('rf', best_rf), ('xgb', best_xgb)], voting='hard')
final_ensemble.fit(X_train, y_train_encoded)

# Predict and evaluate
y_pred_final = final_ensemble.predict(X_test)
print("Optimized Ensemble Model Accuracy:", accuracy_score(y_test_encoded, y_pred_final))


In [None]:
# print("Random Forest Report:")
# print(classification_report(y_test, y_pred_best_rf))
print("\nXGBoost Report:")
print(classification_report(y_test_encoded, y_pred_best_xgb))
print("\nEnsemble Model Report:")
print(classification_report(y_test_encoded, y_pred_final))

In [None]:
import joblib

# Save the trained model
joblib.dump(best_xgb, "best_xgb_model.pkl")
print("Model saved successfully as best_xgb_model.pkl")

In [104]:
# Load historical matches dataset (for feature computation)
matches1 = pd.read_csv("/kaggle/input/jay-shah-supremacy/filtered_matches (1).csv")

In [105]:
# Load upcoming matches schedule
schedule = pd.read_csv("/kaggle/input/jay-shah-supremacy/ipl_schedule_2025.csv")

In [106]:
schedule['match_type'] = 'league'

In [107]:
# Apply the function to create the isTeam1Home column
schedule['isTeam1Home'] = matches.apply(lambda x: is_home_team(x['team1'], x['city']), axis=1)

In [108]:
schedule.head()

Unnamed: 0,id,season,city,date,time,venue,team1,team2,match_type,isTeam1Home
0,1,2025,Kolkata,2025-03-22,19:30,Eden Gardens,Kolkata Knight Riders,Royal Challengers Bengaluru,league,1
1,2,2025,Hyderabad,2025-03-23,15:30,Rajiv Gandhi International Stadium,Sunrisers Hyderabad,Rajasthan Royals,league,1
2,3,2025,Chennai,2025-03-23,19:30,M.A. Chidambaram Stadium,Chennai Super Kings,Mumbai Indians,league,1
3,4,2025,Visakhapatnam,2025-03-24,19:30,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket St...,Delhi Capitals,Lucknow Super Giants,league,1
4,5,2025,Ahmedabad,2025-03-25,19:30,Narendra Modi Stadium,Gujarat Titans,Punjab Kings,league,1


In [109]:
def compute_features(match, matches):
    """
    Compute required input features for a single match.
    `match`: A single match row.
    `matches`: The historical dataset (which is updated after each prediction).
    """
    team1 = match['team1']
    team2 = match['team2']
    isTeam1Home = match['isTeam1Home']
    
    # Compute total matches played by each team
    matches_played = pd.concat([
        matches[['team1', 'winner']], 
        matches[['team2', 'winner']].rename(columns={'team2': 'team1'})
    ])
    team_total_matches = matches_played.groupby('team1').size()

    # Compute total wins by each team
    team_wins = matches_played.groupby('team1')['winner'].apply(lambda x: (x == x.name).sum())

    # Compute win percentage for each team
    win_percentage = (team_wins / team_total_matches).fillna(0)
    
    # Map win percentage back to match teams
    team1_win_percentage = win_percentage.get(team1, 0)
    team2_win_percentage = win_percentage.get(team2, 0)
    
    # Recent Form (Last 10 Matches Rolling Win Percentage)
    matches['team1_won'] = (matches['winner'] == matches['team1']).astype(int)
    matches['team2_won'] = (matches['winner'] == matches['team2']).astype(int)
    
    team1_recent_form = matches.groupby('team1')['team1_won'].transform(lambda x: x.rolling(10, min_periods=1).mean()).iloc[-1]
    team2_recent_form = matches.groupby('team2')['team2_won'].transform(lambda x: x.rolling(10, min_periods=1).mean()).iloc[-1]
    
    # Last 5 Matches Win Percentage
    last5_team1 = matches[(matches['team1'] == team1) | (matches['team2'] == team1)].tail(5)
    last5_team2 = matches[(matches['team1'] == team2) | (matches['team2'] == team2)].tail(5)
    
    team1_last5_win_pct = (last5_team1['winner'] == team1).mean()
    team2_last5_win_pct = (last5_team2['winner'] == team2).mean()
    
    # Win Streak (Last 5 Matches)
    team1_win_streak = last5_team1['team1_won'].sum()
    team2_win_streak = last5_team2['team2_won'].sum()
    
    # Toss Statistics
    team1_toss_win = (matches['toss_winner'] == team1).mean()
    team2_toss_win = (matches['toss_winner'] == team2).mean()

    team1_toss_win_success = matches.groupby('team1')['team1_won'].transform(lambda x: x.rolling(10, min_periods=1).mean()).iloc[-1]
    team2_toss_win_success = matches.groupby('team2')['team2_won'].transform(lambda x: x.rolling(10, min_periods=1).mean()).iloc[-1]
    
    # Head-to-Head Wins
    matches['team1_h2h_wins'] = matches.groupby(['team1', 'team2'])['team1_won'].cumsum() - matches['team1_won']
    matches['team2_h2h_wins'] = matches.groupby(['team2', 'team1'])['team2_won'].cumsum() - matches['team2_won']
    
    team1_h2h_wins = matches['team1_h2h_wins'].iloc[-1] if not matches.empty else 0
    team2_h2h_wins = matches['team2_h2h_wins'].iloc[-1] if not matches.empty else 0
    
    return {
        'team1': team1,
        'team2': team2,
        'team1_win_percentage': team1_win_percentage,
        'team2_win_percentage': team2_win_percentage,
        'isTeam1Home': isTeam1Home,
        'team1_last5_win_pct': team1_last5_win_pct,
        'team2_last5_win_pct': team2_last5_win_pct,
        'team1_recent_form': team1_recent_form,
        'team2_recent_form': team2_recent_form,
        'team1_win_streak': team1_win_streak,
        'team2_win_streak': team2_win_streak,
        'team1_toss_win': team1_toss_win,
        'team2_toss_win': team2_toss_win,
        'team1_toss_win_success': team1_toss_win_success,
        'team2_toss_win_success': team2_toss_win_success,
        'team1_h2h_wins': team1_h2h_wins,
        'team2_h2h_wins': team2_h2h_wins
    }

In [110]:
### Step 2: Sequentially Process Matches ###
predictions = []

In [111]:
# Ensure feature alignment with training data
train_feature_columns = X_train.columns.tolist()  # Get trained feature names

print(train_feature_columns)

['team1', 'team2', 'team1_win_percentage', 'team2_win_percentage', 'isTeam1Home', 'team1_last5_win_pct', 'team2_last5_win_pct', 'team1_recent_form', 'team2_recent_form', 'team1_win_streak', 'team2_win_streak', 'team1_toss_win', 'team2_toss_win', 'team1_toss_win_success', 'team2_toss_win_success', 'team1_h2h_wins', 'team2_h2h_wins']


In [112]:
import random
import pandas as pd
import numpy as np

def predict_matches(schedule, matches1, best_xgb, label_encoder_teams, scaler, train_feature_columns):
    predictions = []
    
    for idx, match in schedule.iterrows():
        print(f"Processing match {idx + 1}/{len(schedule)}: {match['team1']} vs {match['team2']}")
        
        # Compute features dynamically
        match_features = compute_features(match, matches1)
        
        # Convert to DataFrame
        match_df = pd.DataFrame([match_features])
        
        # Encode categorical features using the same LabelEncoder
        match_df['team1'] = label_encoder_teams.transform([match_df['team1'][0]])
        match_df['team2'] = label_encoder_teams.transform([match_df['team2'][0]])
        
        # Ensure all columns from training exist
        for col in train_feature_columns:
            if col not in match_df.columns:
                match_df[col] = 0  # Assign 0 to missing columns

        # Drop any extra columns that were not in training
        match_df = match_df[train_feature_columns]  # Reorder & retain only required columns
        
        # Standardize using the same scaler from training
        match_df[scaler.feature_names_in_] = scaler.transform(match_df[scaler.feature_names_in_])
        
        # Predict winner
        winner_pred_encoded = best_xgb.predict(match_df)[0]
        
        # Convert back to team name
        winner_pred = label_encoder_teams.inverse_transform([winner_pred_encoded])[0]
        
        # Decide toss winner
        if match_features["team1_toss_win"] > match_features["team2_toss_win"]:
            toss_winner = match["team1"]
        elif match_features["team1_toss_win"] < match_features["team2_toss_win"]:
            toss_winner = match["team2"]
        else:
            toss_winner = random.choice([match["team1"], match["team2"]])  # Random selection if tied
        
        # Store the prediction
        predictions.append(winner_pred)
        
        # Append match with predicted winner
        new_match = {
            "id": len(matches1) + 1,
            "season": match["season"],
            "city": match["city"],
            "date": match["date"],
            "match_type": match["match_type"],
            "venue": match["venue"],
            "team1": match["team1"],
            "team2": match["team2"],
            "toss_winner": toss_winner,
            "winner": winner_pred
        }
        
        matches1.loc[len(matches1)] = new_match
    
    return predictions, matches1

In [113]:
predictions1, matches2 = predict_matches(schedule, matches1, best_xgb, label_encoder_teams, scaler, train_feature_columns)

Processing match 1/70: Kolkata Knight Riders vs Royal Challengers Bengaluru
Processing match 2/70: Sunrisers Hyderabad vs Rajasthan Royals
Processing match 3/70: Chennai Super Kings vs Mumbai Indians
Processing match 4/70: Delhi Capitals vs Lucknow Super Giants
Processing match 5/70: Gujarat Titans vs Punjab Kings
Processing match 6/70: Rajasthan Royals vs Kolkata Knight Riders
Processing match 7/70: Sunrisers Hyderabad vs Lucknow Super Giants
Processing match 8/70: Chennai Super Kings vs Royal Challengers Bengaluru
Processing match 9/70: Gujarat Titans vs Mumbai Indians
Processing match 10/70: Delhi Capitals vs Sunrisers Hyderabad
Processing match 11/70: Rajasthan Royals vs Chennai Super Kings
Processing match 12/70: Mumbai Indians vs Kolkata Knight Riders
Processing match 13/70: Lucknow Super Giants vs Punjab Kings
Processing match 14/70: Royal Challengers Bengaluru vs Gujarat Titans
Processing match 15/70: Kolkata Knight Riders vs Sunrisers Hyderabad
Processing match 16/70: Lucknow 

In [114]:
matches2.tail()

Unnamed: 0,id,season,city,date,match_type,venue,team1,team2,toss_winner,winner,team1_won,team2_won,team1_h2h_wins,team2_h2h_wins
1053,1054,2025,Mumbai,2025-05-15,league,Wankhede Stadium,Mumbai Indians,Delhi Capitals,Mumbai Indians,Delhi Capitals,0.0,1.0,11.0,5.0
1054,1055,2025,Jaipur,2025-05-16,league,Sawai Mansingh Stadium,Rajasthan Royals,Punjab Kings,Rajasthan Royals,Rajasthan Royals,1.0,0.0,6.0,3.0
1055,1056,2025,Bengaluru,2025-05-17,league,M. Chinnaswamy Stadium,Royal Challengers Bengaluru,Kolkata Knight Riders,Royal Challengers Bengaluru,Royal Challengers Bengaluru,1.0,0.0,9.0,11.0
1056,1057,2025,Ahmedabad,2025-05-18,league,Narendra Modi Stadium,Gujarat Titans,Chennai Super Kings,Chennai Super Kings,Chennai Super Kings,0.0,1.0,1.0,1.0
1057,1058,2025,Lucknow,2025-05-18,league,Ekana Cricket Stadium,Lucknow Super Giants,Sunrisers Hyderabad,Sunrisers Hyderabad,Sunrisers Hyderabad,,,,


In [115]:
last_70_matches = matches2[['team1', 'team2', 'winner', 'date']].tail(70).copy()

In [116]:
last_70_matches.head()

Unnamed: 0,team1,team2,winner,date
988,Kolkata Knight Riders,Royal Challengers Bengaluru,Kolkata Knight Riders,2025-03-22
989,Sunrisers Hyderabad,Rajasthan Royals,Rajasthan Royals,2025-03-23
990,Chennai Super Kings,Mumbai Indians,Chennai Super Kings,2025-03-23
991,Delhi Capitals,Lucknow Super Giants,Delhi Capitals,2025-03-24
992,Gujarat Titans,Punjab Kings,Gujarat Titans,2025-03-25


In [117]:
print(predictions1)

['Kolkata Knight Riders', 'Rajasthan Royals', 'Chennai Super Kings', 'Delhi Capitals', 'Gujarat Titans', 'Rajasthan Royals', 'Sunrisers Hyderabad', 'Chennai Super Kings', 'Gujarat Titans', 'Sunrisers Hyderabad', 'Chennai Super Kings', 'Kolkata Knight Riders', 'Rajasthan Royals', 'Royal Challengers Bengaluru', 'Sunrisers Hyderabad', 'Mumbai Indians', 'Chennai Super Kings', 'Rajasthan Royals', 'Kolkata Knight Riders', 'Sunrisers Hyderabad', 'Royal Challengers Bengaluru', 'Chennai Super Kings', 'Rajasthan Royals', 'Royal Challengers Bengaluru', 'Chennai Super Kings', 'Rajasthan Royals', 'Sunrisers Hyderabad', 'Rajasthan Royals', 'Delhi Capitals', 'Chennai Super Kings', 'Kolkata Knight Riders', 'Delhi Capitals', 'Sunrisers Hyderabad', 'Royal Challengers Bengaluru', 'Delhi Capitals', 'Rajasthan Royals', 'Royal Challengers Bengaluru', 'Chennai Super Kings', 'Kolkata Knight Riders', 'Delhi Capitals', 'Sunrisers Hyderabad', 'Royal Challengers Bengaluru', 'Chennai Super Kings', 'Kolkata Knight 

In [119]:
# Count the frequency of wins
points_table = pd.DataFrame({'Team': pd.Series(predictions1).value_counts().index, 
                             'Wins': pd.Series(predictions1).value_counts().values})

# Assign 2 points per win
points_table['Points'] = points_table['Wins'] * 2

# Sort by Points in descending order
points_table = points_table.sort_values(by='Points', ascending=False).reset_index(drop=True)

print(points_table)

                          Team  Wins  Points
0             Rajasthan Royals    13      26
1          Chennai Super Kings    12      24
2          Sunrisers Hyderabad    11      22
3               Delhi Capitals    10      20
4  Royal Challengers Bengaluru    10      20
5        Kolkata Knight Riders     9      18
6               Mumbai Indians     3       6
7               Gujarat Titans     2       4


In [124]:
import pandas as pd

# Creating a list of dictionaries for custom values
data = [
    {
        "id": 1, "season": 2025, "city": "Hyderabad", "date": "2025-05-20", "time": "19:30",
        "venue": "Rajiv Gandhi International Stadium", "team1": "Rajasthan Royals", "team2": "Chennai Super Kings",
        "match_type": "T20", "isTeam1Home": False
    },
    {
        "id": 1, "season": 2025, "city": "Hyderabad", "date": "2025-05-21", "time": "19:30",
        "venue": "Rajiv Gandhi International Stadium", "team1": "Sunrisers Hyderabad", "team2": "Delhi Capitals",
        "match_type": "T20", "isTeam1Home": False
    },
]

# Creating DataFrame
df = pd.DataFrame(data)

# Display DataFrame
df.head()


Unnamed: 0,id,season,city,date,time,venue,team1,team2,match_type,isTeam1Home
0,1,2025,Hyderabad,2025-05-20,19:30,Rajiv Gandhi International Stadium,Rajasthan Royals,Chennai Super Kings,T20,False
1,1,2025,Hyderabad,2025-05-21,19:30,Rajiv Gandhi International Stadium,Sunrisers Hyderabad,Delhi Capitals,T20,False


In [125]:
#Qualifier 1 and Eliminator
predictions2, matches3 = predict_matches(df, matches2, best_xgb, label_encoder_teams, scaler, train_feature_columns)

Processing match 1/2: Rajasthan Royals vs Chennai Super Kings
Processing match 2/2: Sunrisers Hyderabad vs Delhi Capitals


In [126]:
print(predictions2)

['Rajasthan Royals', 'Sunrisers Hyderabad']


In [127]:
import pandas as pd

# Creating a list of dictionaries for custom values
data1 = [
    {
        "id": 1, "season": 2025, "city": "Kolkata", "date": "2025-05-23", "time": "19:30",
        "venue": "Eden Gardens", "team1": "Chennai Super Kings", "team2": "Sunrisers Hyderabad",
        "match_type": "T20", "isTeam1Home": False
    },
]

# Creating DataFrame
df1 = pd.DataFrame(data1)

# Display DataFrame
df1.head()

Unnamed: 0,id,season,city,date,time,venue,team1,team2,match_type,isTeam1Home
0,1,2025,Kolkata,2025-05-23,19:30,Eden Gardens,Chennai Super Kings,Sunrisers Hyderabad,T20,False


In [128]:
#Qualifier 2 prediction
predictions3, matches4 = predict_matches(df1, matches3, best_xgb, label_encoder_teams, scaler, train_feature_columns)

Processing match 1/1: Chennai Super Kings vs Sunrisers Hyderabad


In [129]:
predictions3

['Chennai Super Kings']

In [133]:
import pandas as pd

# Creating a list of dictionaries for custom values
data2 = [
    {
        "id": 1, "season": 2025, "city": "Kolkata", "date": "2025-05-25", "time": "19:30",
        "venue": "Eden Gardens", "team1": "Rajasthan Royals", "team2": "Chennai Super Kings",
        "match_type": "T20", "isTeam1Home": False
    },
]

# Creating DataFrame
df2 = pd.DataFrame(data2)

# Display DataFrame
df2.head()

Unnamed: 0,id,season,city,date,time,venue,team1,team2,match_type,isTeam1Home
0,1,2025,Kolkata,2025-05-25,19:30,Eden Gardens,Rajasthan Royals,Chennai Super Kings,T20,False


In [134]:
#Finals Prediction
predictions4, matches5 = predict_matches(df2, matches4, best_xgb, label_encoder_teams, scaler, train_feature_columns)

Processing match 1/1: Rajasthan Royals vs Chennai Super Kings


In [135]:
predictions4

['Rajasthan Royals']