# Introduction

# Initial Setup

1. Import Necessary Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

2. Load the Datasets


In [None]:
# Read each source table from CSV into its own DataFrame.
# NOTE(review): the players file is read from an absolute '/content/' path while
# every other file is resolved relative to the working directory — confirm both
# locations exist wherever this notebook is meant to run.
all_leagues = pd.read_csv('clubs_n.csv')
players_df = pd.read_csv('/content/playerss.csv')
doctors_df = pd.read_csv('doctorsRa.csv')
coachs_df = pd.read_csv('cleaned_coaches (1).csv')
analyst_df = pd.read_csv('DataAnalyst_Modified.csv')

# 3. Data Preprocessing

In [None]:
# Standardize column names
def standardize_column_names(df):
    """Apply the column-name aliases below to ``df.columns`` and return ``df``.

    The frame is modified in place (its ``columns`` attribute is reassigned)
    and the same object is returned.

    Both aliases currently map a name to itself, so with literal matching the
    column names come out unchanged.
    NOTE(review): later cells look up ``'pass_accuracy (%)'`` (with a space)
    and ``'possession(%)'`` — confirm which spellings the aliases were meant
    to convert from.
    """
    column_aliases = {
        'possession(%)': 'possession(%)',
        'pass_accuracy(%)': 'pass_accuracy(%)',
    }
    for old_name, new_name in column_aliases.items():
        # regex=False: the names contain '(' and ')', which would be read as a
        # regex group under the pre-2.0 pandas default. Matching literally
        # gives the same result on every pandas version.
        df.columns = df.columns.str.replace(old_name, new_name, regex=False)
    return df

all_leagues = standardize_column_names(all_leagues)

# Handle missing values
def handle_missing_values(df):
    """Fill missing values in ``df`` in place and return the same frame.

    * ``float64`` / ``int64`` columns are filled with their column median.
    * ``object`` columns are filled with their most frequent value (the first
      row of ``DataFrame.mode()``).

    Columns of any other dtype are left untouched. A frame that has no
    numeric or no object columns is handled without error, and object columns
    that are entirely missing stay missing because they have no mode.
    """
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    cat_cols = df.select_dtypes(include=['object']).columns

    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    if len(cat_cols) > 0:
        modes = df[cat_cols].mode()
        # mode() has zero rows when no selected column has any observed value;
        # indexing row 0 would then raise IndexError, so skip the fill.
        if not modes.empty:
            df[cat_cols] = df[cat_cols].fillna(modes.iloc[0])
    return df

all_leagues = handle_missing_values(all_leagues)

# Remove duplicates
all_leagues = all_leagues.drop_duplicates()

# 4. Data Understanding


In [None]:
# Drop five columns from the clubs table: 'total_market_value', 'coach_name',
# 'club_standardized_y', 'url' and 'filename'.
all_leagues = all_leagues.drop(columns=['total_market_value','coach_name','club_standardized_y','url','filename'])
# Impute missing values in clubs dataset: fill NaNs in these two columns with the column mean.
# NOTE(review): handle_missing_values() was already applied to all_leagues in the
# preprocessing cell and median-fills float64/int64 columns, so these two fills
# are probably no-ops — confirm before relying on mean imputation here.
all_leagues['average_age'] = all_leagues['average_age'].fillna(all_leagues['average_age'].mean())
all_leagues['foreigners_percentage'] = all_leagues['foreigners_percentage'].fillna(all_leagues['foreigners_percentage'].mean())

# Verify the result: count NaNs per column and print only the columns that still contain any.
missing_club_data_after = all_leagues.isna().sum()
print("🔹 Missing Data in Clubs Dataset After Imputation:")
print(missing_club_data_after[missing_club_data_after > 0])

🔹 Missing Data in Clubs Dataset After Imputation:
Series([], dtype: int64)


In [None]:
# Check for missing values: total number of NaN cells across all columns of players_df
print(players_df.isnull().sum().sum())

0


In [None]:
# Step 1: Identify the percentage of missing values per column
missing_percentage = players_df.isnull().mean() * 100

# Step 2: Drop columns with more than 50% missing values
# (re-assign instead of inplace=True so the cell has no hidden in-place mutation)
cols_to_drop = missing_percentage[missing_percentage > 50].index
players_df = players_df.drop(columns=cols_to_drop)

# Step 3: Separate numerical and categorical columns
numerical_cols = players_df.select_dtypes(include='number').columns
categorical_cols = players_df.select_dtypes(include='object').columns

# Step 4: Impute numerical columns with median
# The filled column is assigned back: `players_df[col].fillna(..., inplace=True)`
# is chained assignment on a temporary Series, which emits a FutureWarning in
# pandas 2.x and does not update players_df under Copy-on-Write.
for col in numerical_cols:
    if players_df[col].isnull().any():
        players_df[col] = players_df[col].fillna(players_df[col].median())

# Step 5: Impute categorical columns with mode (first mode when several tie)
for col in categorical_cols:
    if players_df[col].isnull().any():
        players_df[col] = players_df[col].fillna(players_df[col].mode()[0])

# Step 6: Final count of missing values
final_missing = players_df.isnull().sum().sum()
final_missing

np.int64(0)

In [None]:
# Check for missing values: total number of NaN cells across all columns of coachs_df
print(coachs_df.isnull().sum().sum())

36


In [None]:
# Check for missing values: total number of NaN cells in players_df
# (same check as the earlier players_df cell, re-run after imputation)
print(players_df.isnull().sum().sum())

0


In [None]:
# Check for missing values: total number of NaN cells across all columns of doctors_df
print(doctors_df.isnull().sum().sum())

0


In [None]:
# Identify numerical columns (only int64 and float64 dtypes are selected)
numeric_cols = all_leagues.select_dtypes(include=['int64', 'float64']).columns
from sklearn.preprocessing import MinMaxScaler

# Apply Min-Max Scaling: fit on the selected columns and overwrite them with the scaled values.
# NOTE(review): if 'id' is int64/float64 it is scaled here too, and it is later
# renamed to 'ranking' and used as a Coach feature — confirm that is intended.
# `numeric_cols` and `scaler` are rebound by the players_df scaling cell that follows.
scaler = MinMaxScaler()
all_leagues[numeric_cols] = scaler.fit_transform(all_leagues[numeric_cols])

print("Min-Max Normalization Applied.")

Min-Max Normalization Applied.


In [None]:
# Identify numerical columns of players_df (only int64 and float64 dtypes are selected).
# This rebinds `numeric_cols`, which previously held the all_leagues column list.
numeric_cols = players_df.select_dtypes(include=['int64', 'float64']).columns
from sklearn.preprocessing import MinMaxScaler

# Apply Min-Max Scaling: a fresh scaler is fitted on players_df and replaces the
# `scaler` object that was fitted on all_leagues.
scaler = MinMaxScaler()
players_df[numeric_cols] = scaler.fit_transform(players_df[numeric_cols])

In [None]:
from sklearn.ensemble import RandomForestRegressor  # NOTE(review): not referenced in the cells shown here — confirm it is still needed

# Rename the 'id' column to 'ranking'; 'ranking' is one of the Coach features used in the modeling cells.
# NOTE(review): this treats the row id as the league ranking — confirm against the source data.
all_leagues = all_leagues.rename(columns={'id': 'ranking'})

# Modeling


 📊 Club Performance Analyzer with Stakeholder Recommendations
 This model scores each club across key performance areas
 and provides advice on which stakeholder groups need the most improvement.


In [None]:
# 🎯 1. Define feature mapping for each position
# Keys are position / stakeholder labels; values are the all_leagues columns averaged into that label's score.
# NOTE(review): RB/RW are mapped to the "*_left" columns and LB/LW to the "*_right" columns — confirm the sides.
position_features = {
    "GK": ["goals_conceded", "clean_sheets"],
    "CB": ["conceded_attacks_middle"],
    "RB": ["conceded_attacks_left"],
    "LB": ["conceded_attacks_right"],
    "DMF": ["conceded_attacks_middle", "possession(%)"],
    "CM": ["pass_accuracy (%)", "possession(%)"],
    "AMF": ["shots_per_match", "goals_scored"],
    "RW": ["total_attacks_left"],
    "LW": ["total_attacks_right"],
    "ST": ["goals_scored"],
    "Coach": ["wins", "draws", "losses", "goal_difference", "ranking"],
    "Medical Staff": ["total_injuries"]
    }

# 📦 2. Copy the normalized dataset
# Working on a copy keeps the score columns out of all_leagues.
club_df = all_leagues.copy()


# 🧠 4. Score each position (average of normalized features × 100)
# NOTE(review): every feature is averaged with the same sign, so a larger value of
# e.g. goals_conceded, losses or total_injuries raises the score — confirm whether
# "lower is better" columns should be inverted first.
for position, features in position_features.items():
    club_df[f"{position}_Score"] = club_df[features].mean(axis=1) * 100

# 🤖 5. Recommendation logic – get 2 lowest scoring positions
def recommend_positions(row):
    """Return the advice sentence naming the two lowest-scoring positions of ``row``.

    ``row`` must provide ``row['club']`` and one ``row['<position>_Score']``
    entry for every key of the module-level ``position_features`` mapping.
    """
    # Stable sort of the position labels by score: ties keep the mapping's order.
    ranked = sorted(position_features, key=lambda pos: row[f"{pos}_Score"])
    lowest_two = ranked[:2]
    prefix = f"⚠️ {row['club']} should improve "
    return prefix + " and ".join(lowest_two)

# Build one recommendation sentence per club (recommend_positions is applied row by row).
club_df["Recommendations"] = club_df.apply(recommend_positions, axis=1)

# 📊 6. Display results
# Keep the club name, every "<position>_Score" column and the sentence, ordered alphabetically by club;
# the trailing expression shows the first five rows.
score_cols = [f"{pos}_Score" for pos in position_features]
result_df = club_df[["club"] + score_cols + ["Recommendations"]].sort_values(by="club")
result_df.head()

Unnamed: 0,club,GK_Score,CB_Score,RB_Score,LB_Score,DMF_Score,CM_Score,AMF_Score,RW_Score,LW_Score,ST_Score,Coach_Score,Medical Staff_Score,Recommendations
99,AC Milan,40.852459,85.714286,84.615385,84.615385,61.607143,50.677711,59.230382,70.0,72.727273,57.746479,45.897089,52.380952,⚠️ AC Milan should improve GK and Coach
101,AS Roma,42.754098,71.428571,69.230769,69.230769,54.464286,47.063253,42.823608,60.0,63.636364,38.028169,45.436235,38.095238,⚠️ AS Roma should improve ST and Medical Staff
21,AZ Alkmaar,36.295082,78.571429,76.923077,76.923077,58.035714,53.990964,65.618712,85.0,86.363636,63.380282,43.168151,52.380952,⚠️ AZ Alkmaar should improve GK and Coach
73,Ajaccio,50.262295,14.285714,15.384615,15.384615,10.642857,9.825301,5.952381,10.0,9.090909,0.0,42.889609,14.285714,⚠️ Ajaccio should improve ST and AMF
20,Ajax,36.754098,92.857143,92.307692,92.307692,65.178571,64.533133,93.770959,100.0,100.0,88.732394,45.691527,57.142857,⚠️ Ajax should improve GK and Coach


Create Model


In [None]:
import pandas as pd
import joblib

class PositionRecommender:
    """Score clubs per position / stakeholder and report the lowest-scoring ones.

    A position's score is the row-wise mean of its feature columns multiplied
    by 100. Feature columns are used exactly as supplied; this class does not
    rescale them.

    NOTE(review): every feature is averaged with the same sign, so a larger
    value always raises the score — including columns such as
    ``goals_conceded``, ``losses`` and ``total_injuries``. Confirm whether
    those should be inverted before scoring.
    """

    # Default mapping: position / stakeholder label -> feature columns to average.
    DEFAULT_POSITION_FEATURES = {
        "GK": ["goals_conceded", "clean_sheets"],
        "CB": ["conceded_attacks_middle"],
        "RB": ["conceded_attacks_left"],
        "LB": ["conceded_attacks_right"],
        "DMF": ["conceded_attacks_middle", "possession(%)"],
        "CM": ["pass_accuracy (%)", "possession(%)"],
        "AMF": ["shots_per_match", "goals_scored"],
        "RW": ["total_attacks_left"],
        "LW": ["total_attacks_right"],
        "ST": ["goals_scored"],
        "Coach": ["wins", "draws", "losses", "goal_difference", "ranking"],
        "Medical Staff": ["total_injuries"],
    }

    def __init__(self, position_features=None):
        """Create a recommender.

        Args:
            position_features: optional mapping of label -> list of feature
                column names. When omitted, ``DEFAULT_POSITION_FEATURES`` is used.
        """
        source = (
            self.DEFAULT_POSITION_FEATURES
            if position_features is None
            else position_features
        )
        # Copy so instances never share (or mutate) the class-level default.
        self.position_features = {pos: list(cols) for pos, cols in source.items()}

    def score_positions(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with one ``"<position>_Score"`` column per position.

        Raises:
            KeyError: if any feature column named in the mapping is absent from ``df``.
        """
        required = {col for cols in self.position_features.values() for col in cols}
        missing = sorted(required.difference(df.columns))
        if missing:
            raise KeyError(f"Missing feature columns: {missing}")

        df = df.copy()
        for position, features in self.position_features.items():
            df[f"{position}_Score"] = df[features].mean(axis=1) * 100
        return df

    def weakest_positions(self, df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
        """Return the ``n`` lowest-scoring positions for every row of ``df``.

        Args:
            df: table with a ``"club"`` column and all feature columns.
            n: how many positions to report per club (default 5).

        Returns:
            DataFrame indexed like ``df`` with columns ``club``,
            ``Weakest_1`` … ``Weakest_n`` (lowest score first). Ties keep the
            order of the feature mapping. An empty ``df`` yields an empty
            frame that still carries these columns.

        Raises:
            ValueError: if ``n`` is not between 1 and the number of positions.
            KeyError: if ``"club"`` or a feature column is missing.
        """
        positions = list(self.position_features)
        if not 1 <= n <= len(positions):
            raise ValueError(
                f"n must be between 1 and {len(positions)}, got {n}"
            )

        scored = self.score_positions(df)
        score_matrix = scored[[f"{pos}_Score" for pos in positions]].to_numpy()
        out_columns = ["club"] + [f"Weakest_{rank}" for rank in range(1, n + 1)]

        records = []
        for club, scores in zip(scored["club"], score_matrix):
            # NaN scores are ranked after every real score so the ordering is
            # deterministic; among real scores this is a plain ascending sort.
            ranked = sorted(
                zip(positions, scores),
                key=lambda item: (pd.isna(item[1]), item[1]),
            )
            records.append([club] + [pos for pos, _ in ranked[:n]])

        return pd.DataFrame(records, columns=out_columns, index=scored.index)

# Example data (replace this with your actual data)
# all_leagues = pd.read_csv("your_data.csv")  # Load the data as needed

# Create and use the model: score all_leagues and collect each club's five weakest positions
model = PositionRecommender()
weakest_df = model.weakest_positions(all_leagues)

# Save the recommendations to CSV (index=False: the row index is not written)
weakest_df.to_csv("club_weakest_positions.csv", index=False)
print("📁 Weakest positions saved to 'club_weakest_positions.csv'")

# Save the model
# joblib.dump pickles the instance; the class is stored by reference, so
# PositionRecommender must be defined in whatever session loads the file.
joblib.dump(model, "position_recommender.pkl")
print("✅ Model saved to 'position_recommender.pkl'")


📁 Weakest positions saved to 'club_weakest_positions.csv'
✅ Model saved to 'position_recommender.pkl'


In [None]:
# Load and use the model
# joblib.load unpickles the file: only load files you created yourself, and make sure
# the PositionRecommender class cell has been run in this session first.
import joblib
recommender = joblib.load("position_recommender.pkl")

# Predict on all_leagues: recompute the weakest-position table with the reloaded object
result = recommender.weakest_positions(all_leagues)
result.head()


Unnamed: 0,club,Weakest_1,Weakest_2,Weakest_3,Weakest_4,Weakest_5
0,Bayern Munich,GK,Medical Staff,Coach,DMF,CM
1,Borussia Dortmund,Coach,GK,Medical Staff,CM,DMF
2,RB Leipzig,Coach,GK,Medical Staff,CM,ST
3,Union Berlin,CM,Medical Staff,ST,Coach,GK
4,SC Freiburg,ST,Coach,GK,Medical Staff,AMF
