In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import pickle

# Load and preprocess the dataset
data = pd.read_csv('iplmatches.csv')

# Fill missing values
data['winner'].fillna('Draw', inplace=True)
data['city'].fillna('Dubai', inplace=True)

# Replace team names with shortened versions
data.replace(
    [
        'Mumbai Indians', 'Kolkata Knight Riders', 'Royal Challengers Bangalore', 'Deccan Chargers', 'Chennai Super Kings',
        'Rajasthan Royals', 'Delhi Daredevils', 'Delhi Capitals', 'Gujarat Lions', 'Kings XI Punjab',
        'Sunrisers Hyderabad', 'Rising Pune Supergiants', 'Rising Pune Supergiant', 'Kochi Tuskers Kerala', 'Pune Warriors'
    ],
    ['MI', 'KKR', 'RCB', 'DC', 'CSK', 'RR', 'DD', 'DD', 'GL', 'KXIP', 'SRH', 'RPS', 'RPS', 'KTK', 'PW'], inplace=True
)

# Encoding categorical variables manually
encode = {
    'team1': {'MI': 1, 'KKR': 2, 'RCB': 3, 'DC': 4, 'CSK': 5, 'RR': 6, 'DD': 7, 'GL': 8, 'KXIP': 9, 'SRH': 10, 'RPS': 11, 'KTK': 12, 'PW': 13},
    'team2': {'MI': 1, 'KKR': 2, 'RCB': 3, 'DC': 4, 'CSK': 5, 'RR': 6, 'DD': 7, 'GL': 8, 'KXIP': 9, 'SRH': 10, 'RPS': 11, 'KTK': 12, 'PW': 13},
    'toss_winner': {'MI': 1, 'KKR': 2, 'RCB': 3, 'DC': 4, 'CSK': 5, 'RR': 6, 'DD': 7, 'GL': 8, 'KXIP': 9, 'SRH': 10, 'RPS': 11, 'KTK': 12, 'PW': 13},
    'winner': {'MI': 1, 'KKR': 2, 'RCB': 3, 'DC': 4, 'CSK': 5, 'RR': 6, 'DD': 7, 'GL': 8, 'KXIP': 9, 'SRH': 10, 'RPS': 11, 'KTK': 12, 'PW': 13, 'Draw': 14}
}
data.replace(encode, inplace=True)

# Label encode categorical columns (city, toss_decision, venue)
var_mod = ['city', 'toss_decision', 'venue']
le = LabelEncoder()
for i in var_mod:
    data[i] = le.fit_transform(data[i])

# Selecting relevant features and the target variable
data = data[['team1', 'team2', 'city', 'toss_decision', 'toss_winner', 'venue', 'winner']]
X = data[['team1', 'team2', 'city', 'toss_decision', 'toss_winner', 'venue']]
y = data['winner']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Apply column transformation (one-hot encoding for categorical features)
transformer = ColumnTransformer(
    [
        ('transformer', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'), ['team1', 'team2', 'city'])
    ],
    remainder='passthrough'  # Leave other columns unchanged
)

# Create the pipeline with preprocessing and Random Forest model
rf_pipeline = Pipeline(steps=[
    ('step1', transformer),  # Apply column transformer
    ('step2', StandardScaler()),  # Standardize features
    ('step3', RandomForestRegressor(n_estimators=100, random_state=1))  # Random Forest model
])

# Fit the pipeline to the training data
rf_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_pipeline.predict(X_test)

# Calculate R-squared and Mean Absolute Error
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Print the results
print(f"R-squared: {r2:.3f}")
print(f"Mean Absolute Error: {mae:.3f}")

# Save the pipeline model
with open('random_forest_pipeline_model.pkl', 'wb') as file:
    pickle.dump(rf_pipeline, file)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['winner'].fillna('Draw', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['city'].fillna('Dubai', inplace=True)
  data.replace(encode, inplace=True)


R-squared: 0.212
Mean Absolute Error: 1.985
