# Predict Winner Before Match Begins

This notebook trains a Random Forest model using only features available before the match begins.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the match table and keep only the rows that have a recorded winner,
# since rows without a label cannot be used for supervised training.
df = pd.read_csv('/mnt/data/gold_df.csv').dropna(subset=['winner'])

### Feature Selection

In [None]:
# Input columns for the model and the column to predict.
features = ['team1', 'team2', 'venue', 'city', 'season']
target = 'winner'

# Keep only the modelling columns and drop rows with any missing value.
# .copy() makes df_model an independent frame, so assigning the encoded
# columns below cannot trigger pandas' SettingWithCopyWarning.
df_model = df[features + [target]].dropna().copy()

# Label Encoding
# Every column is replaced by integer codes; the fitted encoders are kept in
# label_encoders (keyed by column name) so codes can be decoded later.
label_encoders = {}

# team1 and team2 hold the same kind of value, so they share one encoder
# fitted on both columns. With one encoder per column the same team could get
# a different code on each side, and a team seen on only one side could not
# be encoded on the other when transforming a new fixture.
team_cols = ['team1', 'team2']
team_encoder = LabelEncoder()
team_encoder.fit(pd.concat([df_model[col] for col in team_cols], ignore_index=True))
for col in team_cols:
    df_model[col] = team_encoder.transform(df_model[col])
    label_encoders[col] = team_encoder

# Remaining columns (including the target) each get their own encoder. The
# target keeps a dedicated encoder so its classes are exactly the observed
# winners.
for col in features + [target]:
    if col in label_encoders:
        continue
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col])
    label_encoders[col] = le

X = df_model[features]
y = df_model[target]
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Train Random Forest Model

In [None]:
# Fit a 100-tree random forest on the training split (random_state fixes the
# seed) and predict the held-out rows.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))

# Label the report rows with the original winner values instead of the
# integer codes produced by the target's LabelEncoder. The label list covers
# every class present in either y_test or y_pred, so the report has the same
# rows as before.
report_labels = sorted(set(y_test) | set(y_pred))
report_names = [
    str(name)
    for name in label_encoders[target].inverse_transform(report_labels)
]
# zero_division=0 keeps the value that is reported for classes that are never
# predicted (0.0) but silences the undefined-metric warning.
print(
    '\nClassification Report:\n',
    classification_report(
        y_test,
        y_pred,
        labels=report_labels,
        target_names=report_names,
        zero_division=0,
    ),
)

### Feature Importance Visualization

In [None]:
# Pair each importance score from the fitted forest with its column name.
importances = rf.feature_importances_
feat_imp = pd.Series(importances, index=X.columns)

# Horizontal bar chart: one bar per feature, bar length = importance score.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=feat_imp.values, y=feat_imp.index, ax=ax)
ax.set_title('Feature Importances')
fig.tight_layout()
plt.show()