Importing the Required Packages

In [1]:
import numpy as np
import pandas as pd

Loading the dataset into the notebook

In [2]:
# Read the football-team price CSV into a DataFrame
dataset = pd.read_csv(filepath_or_buffer="Data/Football_teams_price_data.csv")

# Preview the first five rows (n=5 is the default for head())
dataset.head(n=5)

Unnamed: 0,AveragePlayerAge,TotalGoalsLastSeason,MatchesWonLastSeason,MatchesDrawnLastSeason,MatchesLostLastSeason,TotalGoalsConcededLastSeason,TotalRevenueLastSeason,StadiumCapacity,AverageAttendance,TransferSpendingLastSeason,...,Country,Manager,TeamFormation,PlayingStyle,HomeCity,StadiumType,MainSponsor,KitManufacturer,OwnershipType,Price
0,24.367182,28,16,36,20,51,337.552001,55766,5229.74272,177.592126,...,England,Manager B,4-2-3-1,Counter-Attack,City C,Multipurpose,Sponsor B,Manufacturer B,Private,4375181628
1,34.162143,89,12,35,31,84,264.850402,15307,46172.738882,125.441952,...,Spain,Manager C,3-4-3,Defensive,City C,Multipurpose,Sponsor A,Manufacturer B,Consortium,859750388
2,30.443897,43,3,9,28,72,363.423205,77050,78334.254989,43.776137,...,Spain,Manager C,3-5-2,Attacking,City D,Classic,Sponsor A,Manufacturer C,Consortium,2168913729
3,28.177194,84,19,19,34,22,296.719959,61440,60080.423819,183.936395,...,Italy,Manager C,4-3-3,Counter-Attack,City C,Classic,Sponsor D,Manufacturer B,Consortium,1426481894
4,20.652317,36,36,10,9,80,480.928532,78563,10940.950513,14.881518,...,France,Manager D,4-2-3-1,Counter-Attack,City D,Historical,Sponsor C,Manufacturer B,Consortium,4038110631


# Data Cleaning and Preprocessing

1. Identification and Handling of Missing values

In [3]:
# Count the nulls of every column in a single pass, then report them column by column
for col, n_missing in dataset.isnull().sum().items():
    print(f"Number of missing values in the column {col} is {n_missing}")

Number of missing values in the column AveragePlayerAge is 0
Number of missing values in the column TotalGoalsLastSeason is 0
Number of missing values in the column MatchesWonLastSeason is 0
Number of missing values in the column MatchesDrawnLastSeason is 0
Number of missing values in the column MatchesLostLastSeason is 0
Number of missing values in the column TotalGoalsConcededLastSeason is 0
Number of missing values in the column TotalRevenueLastSeason is 0
Number of missing values in the column StadiumCapacity is 0
Number of missing values in the column AverageAttendance is 0
Number of missing values in the column TransferSpendingLastSeason is 0
Number of missing values in the column TransferIncomeLastSeason is 0
Number of missing values in the column NumberOfTrophies is 0
Number of missing values in the column MarketValueOfSquad is 0
Number of missing values in the column AveragePlayerMarketValue is 0
Number of missing values in the column YouthAcademyRating is 0
Number of missing 

There are no missing values, so no further handling is needed.

2. Error Correction and Inconsistencies

In [4]:
# Show the current column labels
column_names = dataset.columns.values
print(f"Name of the columns: {column_names}")

Name of the columns: ['AveragePlayerAge' 'TotalGoalsLastSeason' 'MatchesWonLastSeason'
 'MatchesDrawnLastSeason' 'MatchesLostLastSeason'
 'TotalGoalsConcededLastSeason' 'TotalRevenueLastSeason' 'StadiumCapacity'
 'AverageAttendance' 'TransferSpendingLastSeason'
 'TransferIncomeLastSeason' 'NumberOfTrophies' 'MarketValueOfSquad'
 'AveragePlayerMarketValue' 'YouthAcademyRating' 'League' 'Country'
 'Manager' 'TeamFormation' 'PlayingStyle' 'HomeCity' 'StadiumType'
 'MainSponsor' 'KitManufacturer' 'OwnershipType' 'Price']


The column names are hard to read, so we will rename them to make them more readable.

In [5]:
# Convert the CamelCase column labels to Snake_Case for readability.
# Single-word columns (League, Country, Manager, Price) are not listed,
# so they keep their original names.
# rename() returns a new DataFrame; rebinding `dataset` avoids mutating the
# frame in place while leaving the resulting columns exactly the same.
dataset = dataset.rename(columns={
    "AveragePlayerAge": "Average_Player_Age",
    "TotalGoalsLastSeason": "Total_Goals_Last_Season",
    "MatchesWonLastSeason": "Matches_Won_Last_Season",
    "MatchesDrawnLastSeason": "Matches_Drawn_Last_Season",
    "MatchesLostLastSeason": "Matches_Lost_Last_Season",
    "TotalGoalsConcededLastSeason": "Total_Goals_Conceded_Last_Season",
    "TotalRevenueLastSeason": "Total_Revenue_Last_Season",
    "StadiumCapacity": "Stadium_Capacity",
    "AverageAttendance": "Average_Attendance",
    "TransferSpendingLastSeason": "Transfer_Spending_Last_Season",
    "TransferIncomeLastSeason": "Transfer_Income_Last_Season",
    "NumberOfTrophies": "Number_Of_Trophies",
    "MarketValueOfSquad": "Market_Value_Of_Squad",
    "AveragePlayerMarketValue": "Average_Player_Market_Value",
    "YouthAcademyRating": "Youth_Academy_Rating",
    "TeamFormation": "Team_Formation",
    "PlayingStyle": "Playing_Style",
    "HomeCity": "Home_City",
    "StadiumType": "Stadium_Type",
    "MainSponsor": "Main_Sponsor",
    "KitManufacturer": "Kit_Manufacturer",
    "OwnershipType": "Ownership_Type",
})

In [6]:
# Show the column labels again to confirm the renaming took effect
column_names = dataset.columns.values
print(f"Name of the columns: {column_names}")

Name of the columns: ['Average_Player_Age' 'Total_Goals_Last_Season' 'Matches_Won_Last_Season'
 'Matches_Drawn_Last_Season' 'Matches_Lost_Last_Season'
 'Total_Goals_Conceded_Last_Season' 'Total_Revenue_Last_Season'
 'Stadium_Capacity' 'Average_Attendance' 'Transfer_Spending_Last_Season'
 'Transfer_Income_Last_Season' 'Number_Of_Trophies'
 'Market_Value_Of_Squad' 'Average_Player_Market_Value'
 'Youth_Academy_Rating' 'League' 'Country' 'Manager' 'Team_Formation'
 'Playing_Style' 'Home_City' 'Stadium_Type' 'Main_Sponsor'
 'Kit_Manufacturer' 'Ownership_Type' 'Price']


In [7]:
# Summary statistics for the numeric columns; the quartiles are spelled out
# explicitly (they are describe()'s defaults)
dataset.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Average_Player_Age,Total_Goals_Last_Season,Matches_Won_Last_Season,Matches_Drawn_Last_Season,Matches_Lost_Last_Season,Total_Goals_Conceded_Last_Season,Total_Revenue_Last_Season,Stadium_Capacity,Average_Attendance,Transfer_Spending_Last_Season,Transfer_Income_Last_Season,Number_Of_Trophies,Market_Value_Of_Squad,Average_Player_Market_Value,Youth_Academy_Rating
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,26.480678,59.34316,18.42546,18.60446,18.4781,59.53868,274.506391,54854.56788,47425.31115,99.845519,100.406919,24.58464,505.104996,25.249237,50.000278
std,4.902909,23.095732,10.955441,10.980261,10.996911,23.049783,130.150495,25986.349661,24578.631962,57.666798,57.609852,14.426361,286.4754,14.29822,28.827688
min,18.000094,20.0,0.0,0.0,0.0,20.0,50.004035,10013.0,5003.395786,0.006229,0.004302,0.0,10.005993,0.500343,0.000318
25%,22.231468,39.0,9.0,9.0,9.0,40.0,161.259631,32312.25,26181.373046,50.196498,50.355226,12.0,257.682086,12.808254,25.10209
50%,26.481162,59.0,18.0,19.0,18.0,60.0,274.402274,54814.0,47449.443426,99.927641,101.042932,25.0,503.947159,25.293575,50.08063
75%,30.721124,79.0,28.0,28.0,28.0,79.0,387.154302,77337.25,68720.697593,149.525266,150.070587,37.0,752.608455,37.612826,74.788126
max,34.999527,99.0,37.0,37.0,37.0,99.0,499.999691,99997.0,89998.493713,199.997388,199.999663,49.0,999.968039,49.99969,99.997146


The mean and median of every numerical column are almost identical, which suggests:
- Symmetry in data distribution: the data is likely distributed symmetrically around its central value.
- Lack of skewness: the distributions have little to no skew.
- Robustness to outliers: there are probably no extreme outliers pulling the mean away from the median; the min and max values in the table above are also bounded, which supports this.