In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
file_name = "Updated_somali_football_data.csv"
df = pd.read_csv(file_name)

# Display first few rows to understand the dataset
print(df.head())

# Define Features (X) and Target Variable (y)
X = df[["Home_Score", "Away_Score", "Home_Goal_Diff", "Away_Goal_Diff", 
        "Home_Last_5_Wins", "Away_Last_5_Wins", "H2H_Home_Wins", "H2H_Away_Wins", "H2H_Draws"]]  # Selected Features
y = df["Result"]  # Target Variable (Win, Draw, Loss)

# Split into Training (80%) and Testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of training and testing sets
print(f"Training Set Size: {X_train.shape}")  # Example Output: (708, 9)
print(f"Testing Set Size: {X_test.shape}")  # Example Output: (177, 9)


   Season        Date              Home_Team      Away_Team  Home_Score  \
0    2012  2012-06-04              Elman FC    Badbaado FC            3   
1    2012  2012-06-05             Sahafi FC     Jeenyo  FC            1   
2    2012  2012-06-07            Heegan S.C     Dekedda SC            1   
3    2012  2012-06-08   Mogadishu City Club    Horseed S.C            1   
4    2012  2012-06-11            Jeenyo  FC       Elman FC            1   

   Away_Score  Home_Goal_Diff  Away_Goal_Diff Result  Home_Last_5_Wins  \
0           1               2              -2    Win                 0   
1           1               0               0   Draw                 0   
2           1               0               0   Draw                 0   
3           1               0               0   Draw                 0   
4           1               0               0   Draw                 0   

   Away_Last_5_Wins  H2H_Home_Wins  H2H_Home_Losses  H2H_Away_Wins  \
0                 0              0

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb  # Install with: pip install xgboost

# Load dataset (Make sure the splitting is already done)
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (708, 9)
Testing set size: (177, 9)


In [7]:
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train, y_train)  # Train model
y_pred_log = log_reg.predict(X_test)  # Make predictions
print("Logistic Regression - Predictions:", y_pred_log[:10])  # Show first 10 predictions


Logistic Regression - Predictions: ['Loss' 'Loss' 'Draw' 'Win' 'Loss' 'Loss' 'Loss' 'Win' 'Win' 'Win']


In [8]:
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)  # Train model
y_pred_tree = tree.predict(X_test)  # Make predictions
print("Decision Tree - Predictions:", y_pred_tree[:10])


Decision Tree - Predictions: ['Loss' 'Loss' 'Draw' 'Win' 'Loss' 'Loss' 'Loss' 'Win' 'Win' 'Win']


In [9]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)  # Train model
y_pred_rf = rf.predict(X_test)  # Make predictions
print("Random Forest - Predictions:", y_pred_rf[:10])


Random Forest - Predictions: ['Loss' 'Loss' 'Draw' 'Win' 'Loss' 'Loss' 'Loss' 'Win' 'Win' 'Win']


In [10]:
knn = KNeighborsClassifier(n_neighbors=5)  # k=5
knn.fit(X_train, y_train)  # Train model
y_pred_knn = knn.predict(X_test)  # Make predictions
print("KNN - Predictions:", y_pred_knn[:10])


KNN - Predictions: ['Loss' 'Loss' 'Draw' 'Win' 'Loss' 'Loss' 'Loss' 'Win' 'Win' 'Win']


In [11]:
svm = SVC(kernel='linear')  # Linear kernel
svm.fit(X_train, y_train)  # Train model
y_pred_svm = svm.predict(X_test)  # Make predictions
print("SVM - Predictions:", y_pred_svm[:10])


SVM - Predictions: ['Loss' 'Loss' 'Draw' 'Win' 'Loss' 'Loss' 'Loss' 'Win' 'Win' 'Win']


In [16]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Convert target variable to numeric values
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)  # Convert labels to numbers
y_test_encoded = label_encoder.transform(y_test)  # Convert test labels

# Train XGBoost model (Removed 'use_label_encoder=False')
xgb_model = xgb.XGBClassifier(eval_metric='mlogloss')
xgb_model.fit(X_train, y_train_encoded)  # Train model

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Convert predictions back to original labels if needed
y_pred_xgb_decoded = label_encoder.inverse_transform(y_pred_xgb)

# Print predictions
print("XGBoost - Predictions:", y_pred_xgb_decoded[:10])


XGBoost - Predictions: ['Loss' 'Loss' 'Draw' 'Win' 'Loss' 'Loss' 'Loss' 'Win' 'Win' 'Win']


In [4]:

import pandas as pd
import numpy as np

# Load dataset
file_name = "Updated_somali_football_data.csv"
df = pd.read_csv(file_name)

# Normalize team names by stripping spaces
df["Home_Team"] = df["Home_Team"].str.strip()
df["Away_Team"] = df["Away_Team"].str.strip()

# Ask user for two teams
team_1 = input("Enter Home Team: ").strip()  # Strip spaces from user input
team_2 = input("Enter Away Team: ").strip()

# Check if teams exist in dataset
if team_1 in df["Home_Team"].values and team_2 in df["Away_Team"].values:
    # Get the match data for these teams
    match_data = df[(df["Home_Team"] == team_1) & (df["Away_Team"] == team_2)]

    if not match_data.empty:
        match_data = match_data.iloc[0]  # Select the first matching row

        # Extract features (Replace with actual feature columns)
        match_features = np.array([[match_data["Home_Score"], match_data["Away_Score"], 
                                    match_data["Home_Last_5_Wins"], match_data["Away_Last_5_Wins"],
                                    match_data["H2H_Home_Wins"], match_data["H2H_Away_Wins"],
                                    match_data["H2H_Home_Losses"], match_data["H2H_Away_Losses"], 
                                    match_data["H2H_Draws"]]])

        print(f"\n🔹 Predicting Match: {team_1} vs {team_2}")

        # Make predictions using each model
        print("Logistic Regression:", log_reg.predict(match_features)[0])
        print("Decision Tree:", tree.predict(match_features)[0])
        print("Random Forest:", rf.predict(match_features)[0])
        print("KNN:", knn.predict(match_features)[0])
        print("SVM:", svm.predict(match_features)[0])
        print("XGBoost:", xgb_model.predict(match_features)[0])

    else:
        print(f"❌ No match history found for {team_1} vs {team_2}.")
else:
    print("❌ Error: One or both teams are not in the dataset. Try again.")




Enter Home Team:  vdf
Enter Away Team:  yulhj


❌ Error: One or both teams are not in the dataset. Try again.


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb

# Load dataset
file_name = "Updated_somali_football_data.csv"
df = pd.read_csv(file_name)

# Feature Selection (replace with actual features from your dataset)
features = ["Home_Score", "Away_Score", "Home_Last_5_Wins", "Away_Last_5_Wins",
            "H2H_Home_Wins", "H2H_Away_Wins", "H2H_Home_Losses", "H2H_Away_Losses", "H2H_Draws"]
target = "Result"  # Assuming 'Result' is the target variable

X = df[features]
y = df[target]

# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data for better performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train all six models
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

svm = SVC()
svm.fit(X_train, y_train)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

print("✅ Models trained successfully!")


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got ['Draw' 'Loss' 'Win']

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb

# 📌 Step 1: Load Dataset
df = pd.read_csv("Updated_somali_football_data.csv")

# 📌 Step 2: Data Cleaning (Ensure Team Names Are Consistent)
df["Home_Team"] = df["Home_Team"].str.strip().str.lower()
df["Away_Team"] = df["Away_Team"].str.strip().str.lower()

# 📌 Step 3: Define Features & Target Variable
features = ["Home_Team", "Away_Team", "Home_Score", "Away_Score", "Home_Goal_Diff", 
            "Away_Goal_Diff", "Home_Last_5_Wins", "Away_Last_5_Wins"]
target = "Result"

# Encode Team Names & Result Labels
encoder_team = LabelEncoder()
df["Home_Team"] = encoder_team.fit_transform(df["Home_Team"])
df["Away_Team"] = encoder_team.transform(df["Away_Team"])

encoder_result = LabelEncoder()
df["Result"] = encoder_result.fit_transform(df["Result"])  # Converts 'Win', 'Draw', 'Loss' to numbers

# 📌 Step 4: Split Data (80% Train, 20% Test)
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\n✅ Data split successfully!\nTraining Set Size: {X_train.shape}\nTesting Set Size: {X_test.shape}")

# 📌 Step 5: Train Models
log_reg = LogisticRegression()
tree = DecisionTreeClassifier()
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
svm = SVC()
xgb_model = xgb.XGBClassifier(eval_metric="mlogloss")  # ✅ Removed use_label_encoder=False to fix warning

models = {"Logistic Regression": log_reg, "Decision Tree": tree, "Random Forest": rf, 
          "KNN": knn, "SVM": svm, "XGBoost": xgb_model}

for name, model in models.items():
    model.fit(X_train, y_train)

print("\n✅ Models trained successfully!")

# 📌 Step 6: Predict Match Result (User Input)
def predict_match(home_team, away_team):
    home_team = home_team.strip().lower()
    away_team = away_team.strip().lower()

    if home_team not in encoder_team.classes_ or away_team not in encoder_team.classes_:
        print("\n❌ Error: One or both teams are not in the dataset. Try again.")
        return

    home_encoded = encoder_team.transform([home_team])[0]
    away_encoded = encoder_team.transform([away_team])[0]

    # ✅ Convert match_features into a DataFrame to fix Scikit-Learn warnings
    match_features = pd.DataFrame([[home_encoded, away_encoded, 0, 0, 0, 0, 0, 0]], columns=features)

    print(f"\n🔹 Predicting Match: {home_team.title()} vs {away_team.title()}")
    for name, model in models.items():
        prediction = model.predict(match_features)[0]
        predicted_label = encoder_result.inverse_transform([prediction])[0]
        print(f"{name}: {predicted_label}")

# 📌 Step 7: User Input for Prediction
team_1 = input("Enter Home Team: ").strip().lower()
team_2 = input("Enter Away Team: ").strip().lower()

predict_match(team_1, team_2)



✅ Data split successfully!
Training Set Size: (708, 8)
Testing Set Size: (177, 8)

✅ Models trained successfully!


Enter Home Team:  Badbaado FC
Enter Away Team:  	Elman FC



🔹 Predicting Match: Badbaado Fc vs Elman Fc
Logistic Regression: Draw
Decision Tree: Draw
Random Forest: Draw
KNN: Draw
SVM: Draw
XGBoost: Draw
