<a href="https://colab.research.google.com/github/Ronit-0005/Internpe_ml/blob/main/IPL_Match_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [40]:
df = pd.read_csv('ipl_colab.csv')


In [41]:
df.head()

Unnamed: 0,mid,date,venue,batting_team,bowling_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


## Data Information


In [42]:
df =df.drop_duplicates()


In [43]:
df.shape


(76014, 15)

Handle missing values by filling with 0 where appropriate



In [44]:
df['runs'] = df['runs'].fillna(0)
df['wickets'] = df['wickets'].fillna(0)

In [45]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76014 entries, 0 to 76013
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mid             76014 non-null  int64  
 1   date            76014 non-null  object 
 2   venue           76014 non-null  object 
 3   batting_team    76014 non-null  object 
 4   bowling_team    76014 non-null  object 
 5   batsman         76014 non-null  object 
 6   bowler          76014 non-null  object 
 7   runs            76014 non-null  int64  
 8   wickets         76014 non-null  int64  
 9   overs           76014 non-null  float64
 10  runs_last_5     76014 non-null  int64  
 11  wickets_last_5  76014 non-null  int64  
 12  striker         76014 non-null  int64  
 13  non-striker     76014 non-null  int64  
 14  total           76014 non-null  int64  
dtypes: float64(1), int64(8), object(6)
memory usage: 8.7+ MB


In [46]:
df.describe()


Unnamed: 0,mid,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
count,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0
mean,308.62774,74.889349,2.415844,9.783068,33.216434,1.120307,24.962283,8.869287,160.901452
std,178.156878,48.823327,2.015207,5.772587,14.914174,1.053343,20.079752,10.795742,29.246231
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0
25%,154.0,34.0,1.0,4.6,24.0,0.0,10.0,1.0,142.0
50%,308.0,70.0,2.0,9.6,34.0,1.0,20.0,5.0,162.0
75%,463.0,111.0,4.0,14.6,43.0,2.0,35.0,13.0,181.0
max,617.0,263.0,10.0,19.6,113.0,7.0,175.0,109.0,263.0


# --- Data Cleaning and Preprocessing ---


In [47]:
# Remove irrelevant columns that are too specific for a generalized model
# or are redundant.

In [48]:
df.drop(columns=['mid', 'batsman', 'bowler', 'striker', 'non-striker'], inplace=True)


In [49]:
df.drop(columns=['date'], inplace=True)


In [50]:
print("\nData after dropping irrelevant columns:")
print(df.head())


Data after dropping irrelevant columns:
                   venue           batting_team                 bowling_team  \
0  M Chinnaswamy Stadium  Kolkata Knight Riders  Royal Challengers Bangalore   
1  M Chinnaswamy Stadium  Kolkata Knight Riders  Royal Challengers Bangalore   
2  M Chinnaswamy Stadium  Kolkata Knight Riders  Royal Challengers Bangalore   
3  M Chinnaswamy Stadium  Kolkata Knight Riders  Royal Challengers Bangalore   
4  M Chinnaswamy Stadium  Kolkata Knight Riders  Royal Challengers Bangalore   

   runs  wickets  overs  runs_last_5  wickets_last_5  total  
0     1        0    0.1            1               0    222  
1     1        0    0.2            1               0    222  
2     2        0    0.2            2               0    222  
3     2        0    0.3            2               0    222  
4     2        0    0.4            2               0    222  


In [51]:
consistent_teams = {
    'Deccan Chargers': 'Sunrisers Hyderabad',
    'Pune Warriors': 'Rising Pune Supergiant',
    'Gujarat Lions': 'Gujarat Titans'
}

In [52]:
df['batting_team'] = df['batting_team'].replace(consistent_teams)
df['bowling_team'] = df['bowling_team'].replace(consistent_teams)

Remove teams that are no longer playing to simplify the model.



In [53]:
current_teams = [
    'Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
    'Mumbai Indians', 'Sunrisers Hyderabad', 'Kings XI Punjab',
    'Royal Challengers Bangalore', 'Delhi Capitals', 'Gujarat Titans',
    'Lucknow Super Giants'
]

In [54]:
# 'Kings XI Punjab' is now 'Punjab Kings'. We'll keep the old name as it appears in the data.
# 'Delhi Daredevils' is now 'Delhi Capitals'.

In [55]:
df = df[df['batting_team'].isin(current_teams)]
df = df[df['bowling_team'].isin(current_teams)]

In [56]:
print(f"\nShape of data after filtering for current teams: {df.shape}")



Shape of data after filtering for current teams: (49526, 9)


In [57]:
# Remove the first 5 overs of each match as there is not enough data
# for 'runs_last_5' and 'wickets_last_5' to be meaningful.
df = df[df['overs'] >= 5.0]

In [58]:
print(f"Shape of data after removing first 5 overs: {df.shape}")
print("\nCleaned Data Head:")
print(df.head())

Shape of data after removing first 5 overs: (36932, 9)

Cleaned Data Head:
                    venue           batting_team                 bowling_team  \
32  M Chinnaswamy Stadium  Kolkata Knight Riders  Royal Challengers Bangalore   
33  M Chinnaswamy Stadium  Kolkata Knight Riders  Royal Challengers Bangalore   
34  M Chinnaswamy Stadium  Kolkata Knight Riders  Royal Challengers Bangalore   
35  M Chinnaswamy Stadium  Kolkata Knight Riders  Royal Challengers Bangalore   
36  M Chinnaswamy Stadium  Kolkata Knight Riders  Royal Challengers Bangalore   

    runs  wickets  overs  runs_last_5  wickets_last_5  total  
32    61        0    5.1           59               0    222  
33    61        1    5.2           59               1    222  
34    61        1    5.3           59               1    222  
35    61        1    5.4           59               1    222  
36    61        1    5.5           58               1    222  


# --- 2. Feature Engineering and Final Preparation ---


In [59]:
X = df.drop(columns=['total'])
y = df['total']

In [60]:
# Identify categorical and numerical features
categorical_features = ['venue', 'batting_team', 'bowling_team']
numerical_features = [col for col in X.columns if col not in categorical_features]


In [61]:
print("\nFeatures (X) and Target (y) have been defined.")
print("Categorical Features:", categorical_features)
print("Numerical Features:", numerical_features)


Features (X) and Target (y) have been defined.
Categorical Features: ['venue', 'batting_team', 'bowling_team']
Numerical Features: ['runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5']


# --- 3. Model Building Pipeline ---


##### Create a preprocessing pipeline for the features.
##### - OneHotEncoder handles categorical variables.
##### - 'passthrough' leaves the numerical variables untouched (as they are already scaled).

In [62]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
    ],
    remainder='passthrough' # Keep other columns (numerical ones)
)

In [63]:
# Create the full model pipeline
# This pipeline will first preprocess the data and then apply the regression model.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [64]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [65]:
print(f"\nData split into training and testing sets.")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")


Data split into training and testing sets.
Training set size: 29545 samples
Testing set size: 7387 samples


In [66]:
# Train the model
print("\nTraining the RandomForestRegressor model...")
model_pipeline.fit(X_train, y_train)
print("Model training complete.")


Training the RandomForestRegressor model...
Model training complete.


# --- 4. Model Evaluation ---


In [67]:
y_pred = model_pipeline.predict(X_test)

In [68]:
# Calculate and print evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [69]:
print("\n--- Model Evaluation ---")
print(f"R-squared (R2) Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f} runs")
print("------------------------")
print("An R2 score of >0.75 and MAE of <15 runs is generally considered a good result for this problem.")



--- Model Evaluation ---
R-squared (R2) Score: 0.9568
Mean Absolute Error (MAE): 3.10 runs
------------------------
An R2 score of >0.75 and MAE of <15 runs is generally considered a good result for this problem.


### --- 5. Trying a Different Model (Ridge Regression) ---


In [70]:

# Create a pipeline with Ridge regression
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

# Train the Ridge model
print("\nTraining the Ridge Regression model...")
ridge_pipeline.fit(X_train, y_train)
print("Model training complete.")

# Evaluate the Ridge model
y_pred_ridge = ridge_pipeline.predict(X_test)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("\n--- Ridge Model Evaluation ---")
print(f"R-squared (R2) Score: {r2_ridge:.4f}")
print(f"Mean Absolute Error (MAE): {mae_ridge:.2f} runs")
print("------------------------------")
print("\nComparing models, RandomForestRegressor appears to be more accurate for this task.")



Training the Ridge Regression model...
Model training complete.

--- Ridge Model Evaluation ---
R-squared (R2) Score: 0.6532
Mean Absolute Error (MAE): 12.39 runs
------------------------------

Comparing models, RandomForestRegressor appears to be more accurate for this task.


# --- 6. Saving the Model ---

In [72]:
import pickle
with open('ipl_score_predictor_rf.pkl', 'wb') as f:
    pickle.dump(model_pipeline, f)

In [73]:
print("\nRandomForest model saved as 'ipl_score_predictor_rf.pkl'")



RandomForest model saved as 'ipl_score_predictor_rf.pkl'


In [74]:
def predict_score(venue, batting_team, bowling_team, runs, wickets, overs, runs_last_5, wickets_last_5):
    """
    Predicts the final score using the trained model.
    """
    # Create a DataFrame from the input
    input_data = pd.DataFrame({
        'venue': [venue],
        'batting_team': [batting_team],
        'bowling_team': [bowling_team],
        'runs': [runs],
        'wickets': [wickets],
        'overs': [overs],
        'runs_last_5': [runs_last_5],
        'wickets_last_5': [wickets_last_5]
    })
    predicted_score = model_pipeline.predict(input_data)[0]

    return int(predicted_score)

In [75]:
# Example Prediction
print("\n--- Example Prediction ---")
example_venue = 'M Chinnaswamy Stadium'
example_batting_team = 'Royal Challengers Bangalore'
example_bowling_team = 'Mumbai Indians'
example_runs = 85
example_wickets = 2
example_overs = 10.2
example_runs_last_5 = 45
example_wickets_last_5 = 1

predicted_total = predict_score(
    example_venue, example_batting_team, example_bowling_team,
    example_runs, example_wickets, example_overs,
    example_runs_last_5, example_wickets_last_5
)

print(f"Predicted Final Score for the example scenario: {predicted_total}")
print("--------------------------")


--- Example Prediction ---
Predicted Final Score for the example scenario: 170
--------------------------
