In [17]:
# Pinned so the installed scikit-learn matches the version the training cell reports (1.7.1).
%pip install scikit-learn==1.7.1

Note: you may need to restart the kernel to use updated packages.


In [18]:
# NOTE(review): fsspec is not imported in any visible cell — presumably an optional
# pandas file-access dependency; confirm it is needed and consider pinning a version.
%pip install fsspec

Note: you may need to restart the kernel to use updated packages.


In [19]:
# seaborn is imported by the training cell; the version is left unpinned here —
# consider pinning it the same way scikit-learn is pinned.
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [27]:
# 13-08-2025
# Shaunak_AI_ML_Internship_Task_3
# IPL_Winning_Team_Prediction
# Task brief: train models on Decision Tree, Linear Regression, SVM, Lasso, Random Forest, Neural Network
# (this cell currently trains, evaluates and saves only the Random Forest classifier)

# Importing Necessary Libraries
# ------------------------------
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

print(f"Numpy Version: {np.__version__}")

# ------------------------------
# Import dataset
# ------------------------------
# The CSV location can be overridden with the IPL_DATASET_PATH environment
# variable so the notebook can run on machines other than the author's.
# When the variable is unset, the original local path is used unchanged.
# Later steps read these columns from the file: mid, date, batsman, bowler,
# striker, non-striker, batting_team, bowling_team, venue, runs, wickets,
# overs, runs_last_5, wickets_last_5 and total.
DATASET_PATH = os.environ.get(
    "IPL_DATASET_PATH",
    'C://Users//kunde//Desktop//Virtual internship//InternPe//Assignments//fresh task 3//ipl_dataset.csv',
)
data = pd.read_csv(DATASET_PATH)
print(f"Dataset successfully imported with shape: {data.shape}")

# ------------------------------
# Drop columns that are not used as model inputs
# ------------------------------
# These six columns are excluded up front; every later step works only on
# the columns that remain.
irrelevant = [
    'mid',
    'date',
    'batsman',
    'bowler',
    'striker',
    'non-striker',
]
print(f"Before removing irrelevant columns: {data.shape}")
data = data.drop(columns=irrelevant)
print(f"After removing irrelevant columns: {data.shape}")

# ------------------------------
# Restrict rows to the eight listed teams
# ------------------------------
# A row is kept only when BOTH its batting_team and its bowling_team appear
# in const_teams.
const_teams = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]
print(f"Before removing inconsistent teams: {data.shape}")
team_columns = ['batting_team', 'bowling_team']
both_listed = data[team_columns].isin(const_teams).all(axis=1)
data = data[both_listed]
print(f"After removing inconsistent teams: {data.shape}")

# ------------------------------
# Discard rows from before the 5-over mark
# ------------------------------
# Only rows whose `overs` value is at least 5.0 are retained.
print(f"Before removing early overs: {data.shape}")
past_early_overs = data['overs'].ge(5.0)
data = data[past_early_overs]
print(f"After removing early overs: {data.shape}")

# ------------------------------
# Build the binary classification target
# ------------------------------
# `is_win` is 1 when a row's `total` is strictly above the median `total`
# of the rows that survived the filters above, and 0 otherwise.
# This is a proxy label, not an actual match result; a real-world model
# would need a dataset with true win/loss outcomes.
# `total` is removed afterwards so it cannot be used as a feature.
median_score = data['total'].median()
above_median = data['total'].gt(median_score)
data = data.assign(is_win=above_median.astype(int)).drop(columns='total')

# ------------------------------
# Split the frame into model inputs and target
# ------------------------------
# `features` holds every column except the target; `labels` is the target.
features = data.drop(columns='is_win')
labels = data['is_win']

# ------------------------------
# One-hot encode the categorical columns and assemble the model matrix
# ------------------------------
categorical_features = ['batting_team', 'bowling_team', 'venue']
numerical_features = ['runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5']

# Dense output so the encoded block can be wrapped in a DataFrame.
# With handle_unknown='ignore', categories unseen at fit time are encoded
# as all zeros instead of raising.
# NOTE(review): the encoder is fitted on all rows, before the train/test split.
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_features = one_hot_encoder.fit_transform(features[categorical_features])

# Name the encoded columns and reuse the original row index so both halves
# line up row-for-row when concatenated.
ohe_feature_names = one_hot_encoder.get_feature_names_out(categorical_features)
ohe_df = pd.DataFrame(
    data=encoded_features,
    index=features.index,
    columns=ohe_feature_names,
)

# Numeric columns are carried over as-is (no scaling).
numerical_df = features.loc[:, numerical_features]

# Final feature set: one-hot columns first, followed by the numeric columns.
df = pd.concat((ohe_df, numerical_df), axis='columns')

# ------------------------------
# Split features & labels (80% train / 20% test)
# ------------------------------
# random_state is fixed (same seed as the Random Forest below) so the split,
# the reported metrics and the saved model are reproducible across reruns.
# Without it, every execution draws a different shuffle.
# NOTE(review): rows are shuffled individually, so rows that originate from
# the same match could land on both sides of the split. If that applies to
# this dataset, consider a group-wise split before trusting the test metrics.
train_features, test_features, train_labels, test_labels = train_test_split(
    df, labels, test_size=0.20, shuffle=True, random_state=42
)
print(f"Training set: {train_features.shape}")
print(f"Testing set: {test_features.shape}")

# ------------------------------
# Persist the column layout used for training
# ------------------------------
# The training frame's column Index is written with joblib so the same
# column set and order can be reloaded later.
columns_file = "training_columns_shaunak.joblib"
joblib.dump(train_features.columns, columns_file)
print(f"✅ Training columns saved as {columns_file}")

# ------------------------------
# Registry of model name -> test accuracy (in percent)
# ------------------------------
models = {}

# ------------------------------
# Random Forest classifier (main model)
# ------------------------------
# Seeded so the fitted forest is the same for a given training set.
forest = RandomForestClassifier(random_state=42).fit(train_features, train_labels)

# Score the held-out rows with the standard binary-classification metrics.
y_pred = forest.predict(test_features)
accuracy = accuracy_score(test_labels, y_pred)
models["Random Forest"] = accuracy * 100
print("---- Random Forest Evaluation ----")
print(f"Accuracy: {accuracy:.4f}")
for metric_name, scorer in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
):
    print(f"{metric_name}: {scorer(test_labels, y_pred):.4f}")

# ------------------------------
# Persist the fitted Random Forest classifier
# ------------------------------
model_file = "forest_classifier_shaunak.joblib"
joblib.dump(forest, model_file)
print(f"✅ Random Forest classifier model saved as {model_file}")

# Report the scikit-learn version in use when the model was saved
print(f"scikit-learn version: {sklearn.__version__}")

Numpy Version: 2.3.2
Dataset successfully imported with shape: (76014, 15)
Before removing irrelevant columns: (76014, 15)
After removing irrelevant columns: (76014, 9)
Before removing inconsistent teams: (76014, 9)
After removing inconsistent teams: (53811, 9)
Before removing early overs: (53811, 9)
After removing early overs: (40108, 9)
Training set: (32086, 52)
Testing set: (8022, 52)
✅ Training columns saved as training_columns_shaunak.joblib
---- Random Forest Evaluation ----
Accuracy: 0.9895
Precision: 0.9884
Recall: 0.9894
F1-Score: 0.9889
✅ Random Forest classifier model saved as forest_classifier_shaunak.joblib
scikit-learn version: 1.7.1
