In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("Cleaned_Data.csv") 

In [3]:
df.replace(['', '-', ' '], np.nan, inplace=True)
df.fillna(0, inplace=True)

In [4]:
numeric_cols = ['Mat', 'Runs', 'HS', 'Batting Ave', '100', 'Wkts', 'Bowling Ave', '5', 'Ct']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
df.fillna(0, inplace=True)

In [5]:
df['Runs_per_Match'] = df['Runs'] / (df['Mat'] + 1e-5)
df['Wkts_per_Match'] = df['Wkts'] / (df['Mat'] + 1e-5)
df['Batting_Impact'] = df['Batting Ave'] * df['100']
df['Bowling_Impact'] = df['Wkts'] / (df['Bowling Ave'] + 1e-5)
df['Allrounder_Score'] = df['Batting_Impact'] + df['Bowling_Impact']
df['HS_ratio'] = df['HS'] / (df['Runs'] + 1e-5)
df['Ct_per_Match'] = df['Ct'] / (df['Mat'] + 1e-5)

In [6]:
features = [
    'Mat', 'Runs', 'HS', 'Batting Ave', '100', 'Wkts', 'Bowling Ave', '5', 'Ct',
    'Runs_per_Match', 'Wkts_per_Match', 'Batting_Impact', 'Bowling_Impact',
    'Allrounder_Score', 'HS_ratio', 'Ct_per_Match'
]

X = df[features].copy()

In [7]:
le_category = LabelEncoder()
df['Category_encoded'] = le_category.fit_transform(df['Category'])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, df['Category_encoded'], test_size=0.2, random_state=42, stratify=df['Category_encoded']
)

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [13]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss',
                             objective='multi:softmax', num_class=len(np.unique(y_train)))
}

In [14]:
print("=== 🔍 Model Benchmark: Category Prediction ===")
results = []
for name, model in models.items():
    try:
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)
        acc = accuracy_score(y_test, preds)
        results.append((name, acc))
        print(f"{name:25s} ➤ Accuracy: {acc:.4f}")
    except Exception as e:
        print(f"{name:25s} ➤ ❌ Error: {e}")

=== 🔍 Model Benchmark: Category Prediction ===
Logistic Regression       ➤ Accuracy: 0.6429
K-Nearest Neighbors       ➤ Accuracy: 0.7857
SVM                       ➤ Accuracy: 0.6429
Decision Tree             ➤ Accuracy: 0.8571
Random Forest             ➤ Accuracy: 0.9286
Naive Bayes               ➤ Accuracy: 0.7143
Gradient Boosting         ➤ Accuracy: 0.7143
XGBoost                   ➤ Accuracy: 0.8571


In [15]:
best_model = max(results, key=lambda x: x[1])
print(f"\n✅ Best Model: {best_model[0]} with Accuracy = {best_model[1]:.4f}")


✅ Best Model: Random Forest with Accuracy = 0.9286


In [16]:
import pickle

In [17]:
best_model_name = best_model[0]
best_model_object = models[best_model_name]

# Save to a pickle file
with open(f"{best_model_name.replace(' ', '_').lower()}_model.pkl", "wb") as file:
    pickle.dump(best_model_object, file)

print(f"\n✅ Saved '{best_model_name}' model to file: {best_model_name.replace(' ', '_').lower()}_model.pkl")


✅ Saved 'Random Forest' model to file: random_forest_model.pkl
