In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load data
train_data = pd.read_csv('../split_data/train_test_data.csv')
test_data = pd.read_csv('../split_data/validation_data.csv')

# Preprocessing
def preprocess_data(data):
    # Drop columns that are not useful for prediction
    drop_cols = ['id', 'date']  # Assuming 'id' and 'date' are not predictive
    data = data.drop(columns=drop_cols, errors='ignore')
    
    # Handle categorical variables
    categorical_cols = data.select_dtypes(include=['object']).columns
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col].astype(str))
        label_encoders[col] = le  # Save encoders if needed later
    
    # Handle missing values
    data = data.fillna(data.median(numeric_only=True))
    
    return data, label_encoders

# Preprocess train and test data
train_data, train_encoders = preprocess_data(train_data)
test_data, _ = preprocess_data(test_data)

# Split features and target
X_train = train_data.drop(columns=['home_team_win'])
y_train = train_data['home_team_win']

X_test = test_data.drop(columns=['home_team_win'])
y_test = test_data['home_team_win']

# Train the model
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.6f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.559169
Classification Report:
               precision    recall  f1-score   support

       False       0.52      0.46      0.49       504
        True       0.59      0.65      0.61       603

    accuracy                           0.56      1107
   macro avg       0.55      0.55      0.55      1107
weighted avg       0.56      0.56      0.56      1107



In [15]:
import pandas as pd
import numpy as np

# Load the test data
test_data_path = "/Users/chuajerome/Desktop/ML_Project/data-for-stage-1/same_season_test_data.csv"
test_data_new = pd.read_csv(test_data_path)

# Preprocess the test data
def preprocess_new_data(data, encoders):
    drop_cols = ['id', 'date']  # Drop non-predictive columns
    data = data.drop(columns=drop_cols, errors='ignore')

    # Apply label encoders to categorical columns
    for col, encoder in encoders.items():
        if col in data.columns:
            # Handle unseen labels by replacing them with 'Unknown'
            data[col] = data[col].astype(str).map(
                lambda x: encoder.transform([x])[0] if x in encoder.classes_ else len(encoder.classes_)
            )
    
    # Handle missing values
    data = data.fillna(data.median(numeric_only=True))

    return data

# Add "Unknown" as a fallback category for each encoder
train_encoders_with_unknown = {}
for col, encoder in train_encoders.items():
    train_encoders_with_unknown[col] = encoder
    train_encoders_with_unknown[col].classes_ = np.append(encoder.classes_, 'Unknown')

test_data_new_preprocessed = preprocess_new_data(test_data_new, train_encoders_with_unknown)

# Predict using the trained model
test_data_new['home_team_win'] = model.predict(test_data_new_preprocessed)

# Create the output CSV
output_csv = test_data_new[['id', 'home_team_win']]
output_csv['home_team_win'] = output_csv['home_team_win'].astype(bool)  # Convert to boolean

# Save the output
output_path = "/Users/chuajerome/Desktop/ML_Project/data-for-stage-1/predicted_home_team_win.csv"
output_csv.to_csv(output_path, index=False)

print(f"Output saved to {output_path}")


Output saved to /Users/chuajerome/Desktop/ML_Project/data-for-stage-1/predicted_home_team_win.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_csv['home_team_win'] = output_csv['home_team_win'].astype(bool)  # Convert to boolean
