In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib


In [40]:
# Load the cleaned IPL dataset from a CSV file in the current working directory.
df = pd.read_csv('ipl_cleaned.csv')
# NOTE(review): this preview is not the cell's last expression, so its result
# is discarded and never displayed.
df.head()

# Last expression of the cell: displays the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'batting_team', 'bowling_team', 'venue', 'inning', 'over',
       'ball', 'balls_bowled', 'runs_total', 'wicket_kind', 'current_runs',
       'current_wickets', 'final_score', 'batting_team_encoded',
       'bowling_team_encoded', 'venue_encoded', 'balls_remaining',
       'current_run_rate', 'team_avg_score', 'bowling_avg_conceded',
       'venue_avg_score'],
      dtype='object')

In [41]:
# Model inputs: numeric columns only. The raw text columns
# (batting_team, bowling_team, venue, wicket_kind) and the target are left out.
# The order of this list fixes the column order of X, and therefore the
# positional index shown in the feature-importance table.
feature_columns = [
    'inning',
    'balls_bowled',
    'runs_total',
    'current_runs',
    'current_wickets',
    'batting_team_encoded',
    'bowling_team_encoded',
    'venue_encoded',
    'balls_remaining',
    'current_run_rate',
    'team_avg_score',
    'bowling_avg_conceded',
    'venue_avg_score',
]

# Feature matrix and regression target (final_score).
X, y = df[feature_columns], df['final_score']

In [42]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): this is a row-level random split. If several rows belong to the
# same innings and share one final_score, the same innings can land in both
# train and test and inflate the test metrics — confirm; no match identifier
# column is visible here to group on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameters of the final XGBoost regressor. They are hard-coded here;
# presumably the result of an earlier tuning run that is not shown in this section.
xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_lambda': 1,
    'reg_alpha': 0,
    'random_state': 42,
}
final_model = XGBRegressor(**xgb_params)

In [43]:
# Train the model
# Fits the configured regressor on the training split only; the print is just a
# progress marker. fit() is the cell's last expression, so whatever it returns
# is what the notebook would display for this cell.
print("Training Final XGBoost Model...")
final_model.fit(X_train, y_train)

Training Final XGBoost Model...


In [44]:
# Score the held-out rows with the fitted model.
y_pred = final_model.predict(X_test)

# Regression metrics on the test split; RMSE is derived from MSE so both are
# reported in a consistent way.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# NOTE(review): R² = 1 - MSE / Var(y_test), so the recorded run (MSE ≈ 1635,
# R² ≈ 0.97) implies a target variance of roughly 5e4 (std above 200) —
# confirm that final_score holds what is expected.

# Report every metric with two decimals.
print("\nFinal XGBoost Results:")
for label, value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse), ("R²", r2)):
    print(f"{label}: {value:.2f}")



Final XGBoost Results:
MAE: 24.03
MSE: 1635.01
RMSE: 40.44
R²: 0.97


In [45]:
# Persist the fitted regressor to the working directory so it can be reloaded
# later without retraining.
model_path = 'xgboost_ipl_model.pkl'
joblib.dump(final_model, model_path)
print(f"Model saved as '{model_path}'")


Model saved as 'xgboost_ipl_model.pkl'


In [47]:
from sklearn.preprocessing import LabelEncoder

# Rebuild label encoders from the raw text columns so that string inputs can be
# mapped to integer codes later. The *_encoded columns already exist in the CSV,
# so these encoders are fitted after the fact; whether they reproduce the stored
# codes is checked separately. fit() returns the encoder itself, which allows
# construction and fitting in one expression.
le_batting = LabelEncoder().fit(df['batting_team'])
le_bowling = LabelEncoder().fit(df['bowling_team'])
le_venue = LabelEncoder().fit(df['venue'])

# Re-encode the raw columns with the new encoders; these arrays are the
# reference values for the consistency check against the pre-encoded columns.
batting_encoded_check = le_batting.transform(df['batting_team'])
bowling_encoded_check = le_bowling.transform(df['bowling_team'])
venue_encoded_check = le_venue.transform(df['venue'])

In [48]:
# Confirm that the freshly fitted encoders reproduce the pre-encoded columns
# the model was trained on. Values are compared with np.array_equal rather than
# Series.equals: Series.equals also requires identical dtypes and index labels,
# so a column holding the correct codes as int32/float (or a frame with a
# non-default index) would raise a false warning.
encoder_checks = {
    'batting_team_encoded': batting_encoded_check,
    'bowling_team_encoded': bowling_encoded_check,
    'venue_encoded': venue_encoded_check,
}
mismatched_columns = [
    column
    for column, expected_codes in encoder_checks.items()
    if not np.array_equal(df[column].to_numpy(), expected_codes)
]
if mismatched_columns:
    print("Warning: Encoder mappings may not match pre-encoded columns!")
    # Name the offending columns so the mismatch can be investigated directly.
    print(f"Mismatched columns: {mismatched_columns}")

# Persist each fitted encoder next to the model, one file per categorical column,
# in the same order as before: batting team, bowling team, venue.
for encoder, filename in (
    (le_batting, 'batting_team_encoder.pkl'),
    (le_bowling, 'bowling_team_encoder.pkl'),
    (le_venue, 'venue_encoder.pkl'),
):
    joblib.dump(encoder, filename)
print("Encoders saved as 'batting_team_encoder.pkl', 'bowling_team_encoder.pkl', 'venue_encoder.pkl'")

# Pair every input column with the importance score reported by the fitted
# model, then rank from most to least important. The original positional index
# is kept, so each row still shows the feature's position in X.
importance_unsorted = pd.DataFrame({
    'Feature': X.columns,
    'Importance': final_model.feature_importances_,
})
importance = importance_unsorted.sort_values(by='Importance', ascending=False)
print("\nFeature Importance:\n", importance)

Encoders saved as 'batting_team_encoder.pkl', 'bowling_team_encoder.pkl', 'venue_encoder.pkl'

Feature Importance:
                  Feature  Importance
9       current_run_rate    0.550545
12       venue_avg_score    0.153967
3           current_runs    0.113939
8        balls_remaining    0.062155
1           balls_bowled    0.036082
10        team_avg_score    0.018750
11  bowling_avg_conceded    0.016053
0                 inning    0.015706
7          venue_encoded    0.007344
4        current_wickets    0.006866
5   batting_team_encoded    0.006646
6   bowling_team_encoded    0.006489
2             runs_total    0.005457
