In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [4]:
# Load data: one DataFrame per CSV, read in the order matches -> deliveries.
matches_df, deliveries_df = (
    pd.read_csv(csv_name) for csv_name in ('matches.csv', 'deliveries.csv')
)

In [5]:
# Step 1: Feature Engineering
# Toss winner feature: 1 when the toss winner is the side listed as team1, else 0.
toss_won_by_team1 = matches_df['toss_winner'].eq(matches_df['team1'])
matches_df['toss_winner_is_team1'] = toss_won_by_team1.astype(int)

# Head-to-head win ratio
def head_to_head_win_ratio(matches, team1, team2):
    """Return team1's share of the decided matches played between team1 and team2.

    Parameters
    ----------
    matches : DataFrame with 'team1', 'team2' and 'winner' columns.
    team1, team2 : team identifiers compared against those columns.

    Returns
    -------
    team1_wins / (team1_wins + team2_wins) over every row where the two
    teams face each other (in either column order), or 0 when neither team
    has a recorded win in those rows. Rows whose 'winner' is neither team
    (e.g. missing) are ignored.
    """
    # A fixture may list the pair in either order, so match both layouts.
    listed_forward = (matches['team1'] == team1) & (matches['team2'] == team2)
    listed_reverse = (matches['team1'] == team2) & (matches['team2'] == team1)
    winners = matches.loc[listed_forward | listed_reverse, 'winner']

    wins_for_team1 = int((winners == team1).sum())
    wins_for_team2 = int((winners == team2).sum())
    decided = wins_for_team1 + wins_for_team2

    if decided <= 0:
        return 0
    return wins_for_team1 / decided

# The head-to-head ratio depends only on the (team1, team2) pair, so compute it
# once per distinct pair instead of rescanning the whole table for every row.
# NOTE(review): the ratio is derived from ALL rows of matches_df, including the
# row's own result and rows that later end up in the test split, so this
# feature leaks the target into training. Consider restricting it to matches
# played before the row in question (needs a chronological ordering - confirm
# which column provides it).
_h2h_ratio_by_pair = {}
_team1_win_ratios = []
for _pair in zip(matches_df['team1'], matches_df['team2']):
    if _pair not in _h2h_ratio_by_pair:
        _h2h_ratio_by_pair[_pair] = head_to_head_win_ratio(matches_df, *_pair)
    _team1_win_ratios.append(_h2h_ratio_by_pair[_pair])
matches_df['team1_win_ratio'] = _team1_win_ratios

# Drop rows with missing target variable. The explicit .copy() makes the
# filtered frame independent, so adding the 'target' column below cannot
# trigger SettingWithCopyWarning.
matches_df = matches_df.dropna(subset=['winner']).copy()

# Map target variable to binary: 1 if team1 wins, 0 otherwise
matches_df['target'] = (matches_df['winner'] == matches_df['team1']).astype(int)

In [6]:
# Step 2: Prepare Data for Training
# Model inputs are the two engineered columns; the label is the binary 'target'.
features = ['toss_winner_is_team1', 'team1_win_ratio']
X = matches_df.loc[:, features]
y = matches_df.loc[:, 'target']

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

In [7]:
# Step 3: Train the Random Forest Model
# 100 trees of depth at most 5, with a fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Step 4: Evaluate the Model on the held-out rows
y_pred = rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)

Accuracy: 0.6330275229357798
Confusion Matrix:
 [[26 23]
 [17 43]]


In [8]:
# Step 5: Prediction Function
def predict_match_winner(team1, team2, toss_winner):
    """Predict which of two teams wins, using the notebook's trained rf_model.

    Builds a one-row feature frame with the same two columns the model was
    trained on: whether `toss_winner` equals `team1`, and team1's
    head-to-head win ratio against team2 computed from the notebook-level
    `matches_df`.

    Returns `team1` when the model predicts class 1, otherwise `team2`.
    """
    feature_row = pd.DataFrame({
        'toss_winner_is_team1': [int(toss_winner == team1)],
        'team1_win_ratio': [head_to_head_win_ratio(matches_df, team1, team2)],
    })

    # Class 1 is "team1 wins"; every other label maps to team2.
    predicted_label = rf_model.predict(feature_row)[0]
    if predicted_label == 1:
        return team1
    return team2

In [9]:
# Example Usage: Chennai vs Kolkata, with Kolkata winning the toss.
team1, team2 = 'Chennai Super Kings', 'Kolkata Knight Riders'
toss_winner = 'Kolkata Knight Riders'

predicted_winner = predict_match_winner(
    team1=team1,
    team2=team2,
    toss_winner=toss_winner,
)
print(f"Predicted Winner: {predicted_winner}")

Predicted Winner: Chennai Super Kings


In [10]:
import joblib

# Persist the trained rf_model to disk. joblib.dump is left as the cell's last
# expression so its return value is displayed as the cell output.
joblib.dump(
    value=rf_model,
    filename='cricket_match_predictor_model.pkl',
)



['cricket_match_predictor_model.pkl']