In [7]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
import joblib

# --- 1. Load Dataset ---
try:
    df_matches = pd.read_csv(r'C:\Users\sanke\Downloads\matches.csv')
except FileNotFoundError:
    print("Error: 'matches.csv' not found.")
    df_matches = pd.DataFrame()

if not df_matches.empty:
    # --- 2. Clean Data ---
    df_matches.dropna(subset=['winner'], inplace=True)
    # Ensure season is numeric
    df_matches['season'] = pd.to_numeric(df_matches['season'], errors='coerce')
    df_matches.dropna(subset=['season'], inplace=True)
    df_matches['season'] = df_matches['season'].astype(int)

    
    df_matches = df_matches.sort_values(['season', 'date'])  # chronological

    # Relevant columns
    df_model = df_matches[['team1','team2','venue','toss_winner','toss_decision','season','winner']].copy()

    # --- 3. Feature Engineering ---

    # Toss win indicator
    df_model['toss_win_equals_winner'] = 0  # default, we won't bias training

    # Head-to-head difference
    def h2h_diff(team1, team2, df):
        past = df[((df['team1']==team1) & (df['team2']==team2)) | ((df['team1']==team2) & (df['team2']==team1))]
        team1_wins = sum(past['winner']==team1)
        team2_wins = sum(past['winner']==team2)
        return team1_wins - team2_wins

    df_model['h2h_diff'] = df_model.apply(lambda row: h2h_diff(row['team1'], row['team2'], df_matches), axis=1)

    # Recent form: last 3 matches win rate
    def recent_winrate(team, season, df, last_n=3):
        past = df[(df['season']<season) & ((df['team1']==team)|(df['team2']==team))]
        past = past.tail(last_n)
        if past.empty:
            return 0.5
        wins = sum(past['winner']==team)
        return wins/len(past)

    df_model['team1_recent_winrate'] = df_model.apply(lambda row: recent_winrate(row['team1'], row['season'], df_matches), axis=1)
    df_model['team2_recent_winrate'] = df_model.apply(lambda row: recent_winrate(row['team2'], row['season'], df_matches), axis=1)

    # --- 4. Train CatBoost Model ---
    X = df_model.drop('winner', axis=1)
    y = df_model['winner']

    categorical_cols = ['team1','team2','venue','toss_winner','toss_decision','season']

    model = CatBoostClassifier(
        iterations=500,
        depth=8,
        learning_rate=0.05,
        loss_function='MultiClass',
        cat_features=categorical_cols,
        random_seed=42,
        verbose=100
    )

    model.fit(X, y)

    # --- 5. Evaluate ---
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"\n✅ Average CV Accuracy: {scores.mean()*100:.2f}%")

    # --- 6. Save Model ---
    joblib.dump(model, "ipl_winner_model.pkl")
    print("✅ Model saved as 'ipl_winner_model.pkl'")

    # --- 7. Prediction Function ---
    def predict_match_winner(team1, team2, venue, toss_winner, toss_decision, season):
        # Auto-calculate features
        team1_recent = recent_winrate(team1, season, df_matches)
        team2_recent = recent_winrate(team2, season, df_matches)
        h2h = h2h_diff(team1, team2, df_matches)
        toss_win_eq = 1 if toss_winner in [team1, team2] else 0

        input_data = pd.DataFrame([{
            'team1': team1,
            'team2': team2,
            'venue': venue,
            'toss_winner': toss_winner,
            'toss_decision': toss_decision,
            'season': season,
            'toss_win_equals_winner': toss_win_eq,
            'h2h_diff': h2h,
            'team1_recent_winrate': team1_recent,
            'team2_recent_winrate': team2_recent
        }])

        return model.predict(input_data)[0]

    # --- Example ---
  


0:	learn: 2.8827860	total: 668ms	remaining: 5m 33s
100:	learn: 1.1594889	total: 1m 48s	remaining: 7m 8s
200:	learn: 0.6473128	total: 3m 37s	remaining: 5m 23s
300:	learn: 0.4113580	total: 4m 26s	remaining: 2m 55s
400:	learn: 0.2907240	total: 5m 16s	remaining: 1m 18s
499:	learn: 0.2200052	total: 6m 7s	remaining: 0us
0:	learn: 2.8913448	total: 446ms	remaining: 3m 42s
100:	learn: 1.2223567	total: 43.3s	remaining: 2m 50s
200:	learn: 0.6687697	total: 1m 27s	remaining: 2m 9s
300:	learn: 0.4128884	total: 2m 13s	remaining: 1m 28s
400:	learn: 0.2836263	total: 3m	remaining: 44.5s
499:	learn: 0.2102012	total: 3m 48s	remaining: 0us
0:	learn: 2.8953130	total: 572ms	remaining: 4m 45s
100:	learn: 1.2215385	total: 44.6s	remaining: 2m 56s
200:	learn: 0.6708819	total: 1m 29s	remaining: 2m 12s
300:	learn: 0.4125995	total: 2m 14s	remaining: 1m 28s
400:	learn: 0.2831099	total: 3m	remaining: 44.6s
499:	learn: 0.2106490	total: 3m 47s	remaining: 0us
0:	learn: 2.9059468	total: 558ms	remaining: 4m 38s
100:	learn

In [8]:
import joblib

# Save the model
joblib.dump(model, "ipl_winner_model.pkl")
print("✅ Model saved as 'ipl_winner_model.pkl'")


✅ Model saved as 'ipl_winner_model.pkl'
