In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Read the two CSV splits from disk: the first becomes the training frame,
# the second the held-out frame used for evaluation further down.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../split_data/train_test_data.csv',
        '../split_data/validation_data.csv',
    )
)

# Preprocessing
def preprocess_data(data, label_encoders=None):
    """Drop identifier columns, integer-encode categoricals and impute NaNs.

    Args:
        data: Input DataFrame. The caller's frame is left untouched because
            the column drop below returns a new frame that all later
            assignments operate on.
        label_encoders: Optional mapping of column name -> already fitted
            encoder (any object exposing ``classes_``, such as the dict
            returned by an earlier call on the training split). When given,
            those columns are encoded with the supplied classes instead of
            fitting a fresh encoder, so the same label gets the same integer
            in every split. Labels absent from ``classes_`` are encoded
            as -1. Columns without a supplied encoder are fitted as before.

    Returns:
        A ``(data, label_encoders)`` tuple: the processed DataFrame and a
        dict mapping each encoded column to the encoder used for it.
    """
    # Drop columns that are not useful for prediction
    drop_cols = ['id', 'date']  # Assuming 'id' and 'date' are not predictive
    data = data.drop(columns=drop_cols, errors='ignore')

    # Handle categorical variables
    categorical_cols = list(data.select_dtypes(include=['object']).columns)
    # Work on a copy so the caller's dict is never mutated.
    encoders = {} if label_encoders is None else dict(label_encoders)
    # A column that was categorical when the encoder was fitted must be
    # encoded here too, even if this split happened to load it with a
    # different dtype.
    for col in encoders:
        if col in data.columns and col not in categorical_cols:
            categorical_cols.append(col)

    for col in categorical_cols:
        # Cast to str so mixed-type columns encode uniformly; a missing
        # value therefore becomes its own string label.
        values = data[col].astype(str)
        if col in encoders:
            # Reuse the fitted classes; the position in classes_ is the code.
            code_by_label = {
                str(label): code
                for code, label in enumerate(encoders[col].classes_)
            }
            # Unseen labels map to NaN first, then to the sentinel -1.
            data[col] = values.map(code_by_label).fillna(-1).astype(int)
        else:
            le = LabelEncoder()
            data[col] = le.fit_transform(values)
            encoders[col] = le  # Save encoders if needed later

    # Handle missing values: numeric NaNs get the median of their own column,
    # computed from the frame passed to this call.
    data = data.fillna(data.median(numeric_only=True))

    return data, encoders

# Preprocess the training split; the encoders fitted here define the integer
# code of every category the model will learn from.
train_data, train_encoders = preprocess_data(train_data)

# Encode the evaluation split with the *training* encoders before the generic
# preprocessing runs. Fitting fresh encoders on this split would number its
# categories independently, so the same label could receive a different
# integer than the one the model was trained on.
test_data = test_data.copy()
for col, encoder in train_encoders.items():
    if col not in test_data.columns:
        continue
    # The position of a label in classes_ is the code it was given in training.
    code_by_label = {
        str(label): code for code, label in enumerate(encoder.classes_)
    }
    # Labels never seen during training get the sentinel code -1.
    test_data[col] = (
        test_data[col].astype(str).map(code_by_label).fillna(-1).astype(int)
    )
# The columns encoded above are numeric now, so this call only drops the
# identifier columns, encodes any remaining categoricals and imputes NaNs.
test_data, _ = preprocess_data(test_data)

# Split features and target
X_train = train_data.drop(columns=['home_team_win'])
y_train = train_data['home_team_win']

X_test = test_data.drop(columns=['home_team_win'])
y_test = test_data['home_team_win']

# Train the model (fixed seed so repeated runs give the same forest)
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.56
Classification Report:
               precision    recall  f1-score   support

       False       0.52      0.46      0.49       504
        True       0.59      0.65      0.61       603

    accuracy                           0.56      1107
   macro avg       0.55      0.55      0.55      1107
weighted avg       0.56      0.56      0.56      1107

