In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the input CSV (relative to the working directory)
file_path = 'Project_dataset.csv'

# Columns to read. 'FTHG' and 'FTAG' are deliberately left out to avoid
# perfect accuracy.
# NOTE(review): the grouping below is inferred from the column names only
# (they look like football-data.co.uk style headers) — confirm against the
# dataset's data dictionary.
match_stat_columns = [
    'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC',
    'HY', 'AY', 'HR', 'AR',
]
odds_columns = ['B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA']
outcome_columns = ['HTR', 'FTR']  # 'FTR' is used as the prediction target
selected_columns = match_stat_columns + odds_columns + outcome_columns

# Read only the selected columns from disk
df = pd.read_csv(file_path, usecols=selected_columns)

# --- Preprocessing and train/test split -------------------------------------
# Order matters here: the data is split BEFORE any imputer is fitted so that
# the statistics used to fill missing values come from the training rows only
# (fitting on the full dataset would leak test-set information into training).

# Rows without a recorded 'FTR' cannot be used for supervised learning; drop
# them instead of inventing a label through most-frequent imputation.
df = df.dropna(subset=['FTR']).reset_index(drop=True)

# Separate numeric and categorical columns (detected before 'FTR' is encoded)
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns
# Feature-only subsets: the target is never imputed
numeric_features = [col for col in numeric_columns if col != 'FTR']
categorical_features = [col for col in categorical_columns if col != 'FTR']

# Encode the target. `label_encoder` stays fitted on 'FTR' so that
# `label_encoder.classes_` can map predictions back to the original labels.
label_encoder = LabelEncoder()
df['FTR'] = label_encoder.fit_transform(df['FTR'])

# Separate features and target variable.
# Note: `X` still holds the raw (un-imputed, un-encoded) features; only
# `X_train` / `X_test` below are model-ready.
X = df.drop(columns=['FTR'])
y = df['FTR']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Work on explicit copies so the column assignments below do not write into
# views of `X` (and do not trigger pandas chained-assignment warnings).
X_train = X_train.copy()
X_test = X_test.copy()

# Impute missing numeric values with the TRAINING mean
imputer_numeric = SimpleImputer(strategy='mean')
if numeric_features:
    X_train[numeric_features] = imputer_numeric.fit_transform(X_train[numeric_features])
    X_test[numeric_features] = imputer_numeric.transform(X_test[numeric_features])

# Impute missing categorical values with the TRAINING most frequent value
imputer_categorical = SimpleImputer(strategy='most_frequent')
if categorical_features:
    X_train[categorical_features] = imputer_categorical.fit_transform(X_train[categorical_features])
    X_test[categorical_features] = imputer_categorical.transform(X_test[categorical_features])

# Encode the categorical feature 'HTR' with its own encoder so the mapping is
# not overwritten by the target encoder. The label -> integer mapping is built
# from both splits: it is only a renaming of categories (no statistics are
# learned), and it guarantees `transform` never meets an unseen label.
htr_encoder = LabelEncoder()
htr_encoder.fit(pd.concat([X_train['HTR'], X_test['HTR']]))
X_train['HTR'] = htr_encoder.transform(X_train['HTR'])
X_test['HTR'] = htr_encoder.transform(X_test['HTR'])

# Initialize the classification models (fixed seeds for reproducible runs)
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Train and evaluate models, store results for plotting.
# Each entry of `results` is a tuple: (model name, accuracy in percent,
# confusion matrix).
results = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred) * 100  # Convert to percentage
    # Pin the row/column order to the classes the model was trained on, so the
    # matrix always has one row/column per class even if a class happens to be
    # absent from both y_test and y_pred.
    confusion = confusion_matrix(y_test, y_pred, labels=model.classes_)
    results.append((model_name, accuracy, confusion))
    print(f"{model_name} Accuracy: {accuracy:.2f}%")  # Print accuracy score

# Bar chart comparing the accuracy (in percent) of every evaluated model.
# `results` holds (name, accuracy, confusion matrix) tuples; only the first
# two fields are needed here.
model_names = [name for name, _, _ in results]
accuracies = [score for _, score, _ in results]

plt.figure(figsize=(10, 6))
sns.barplot(x=model_names, y=accuracies)
plt.xlabel('Models')
plt.ylabel('Accuracy (%)')
plt.title('Model Accuracy Comparison')
plt.show()

# Plot the confusion matrices, one figure per model.
# `label_encoder` was last fitted on 'FTR', so `classes_` holds the original
# target labels in the same sorted order as the encoded integers.
class_names = list(label_encoder.classes_)
for model_name, _, confusion in results:
    # Only use the class names when the matrix has one row per known class;
    # otherwise fall back to seaborn's automatic integer ticks rather than
    # mislabel the axes.
    tick_labels = class_names if confusion.shape[0] == len(class_names) else 'auto'
    plt.figure(figsize=(6, 4))
    sns.heatmap(
        confusion, annot=True, fmt='d', cmap='Blues',
        xticklabels=tick_labels, yticklabels=tick_labels,
    )
    plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
