## Minor project 
## Kick Start Analytics – Unveiling Patterns and Potentials in Football Player’s Data


In [1]:
# importing libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Reading the dataset 
df = pd.read_csv("fifa_cleaned (1).csv")

In [3]:
df.head()

Unnamed: 0,id,name,full_name,birth_date,age,height_cm,weight_kgs,positions,nationality,overall_rating,...,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB
0,158023,L. Messi,Lionel Andrés Messi Cuccittini,1987-06-24,31,170.18,72.1,"CF,RW,ST",Argentina,94,...,64+2,61+2,61+2,61+2,64+2,59+2,48+2,48+2,48+2,59+2
1,190460,C. Eriksen,Christian Dannemann Eriksen,1992-02-14,27,154.94,76.2,"CAM,RM,CM",Denmark,88,...,71+3,71+3,71+3,71+3,71+3,66+3,57+3,57+3,57+3,66+3
2,195864,P. Pogba,Paul Pogba,1993-03-15,25,190.5,83.9,"CM,CAM",France,88,...,76+3,77+3,77+3,77+3,76+3,74+3,72+3,72+3,72+3,74+3
3,198219,L. Insigne,Lorenzo Insigne,1991-06-04,27,162.56,59.0,"LW,ST",Italy,88,...,63+3,58+3,58+3,58+3,63+3,58+3,44+3,44+3,44+3,58+3
4,201024,K. Koulibaly,Kalidou Koulibaly,1991-06-20,27,187.96,88.9,CB,Senegal,88,...,73+3,77+3,77+3,77+3,73+3,76+3,85+3,85+3,85+3,76+3


# DATA CLEANING

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
df.isnull().sum()

id                                   0
name                                 0
full_name                            0
birth_date                           0
age                                  0
height_cm                            0
weight_kgs                           0
positions                            0
nationality                          0
overall_rating                       0
potential                            0
value_euro                         255
wage_euro                          246
preferred_foot                       0
international_reputation(1-5)        0
weak_foot(1-5)                       0
skill_moves(1-5)                     0
work_rate                            0
body_type                            0
release_clause_euro               1837
club_team                           14
club_rating                         14
club_position                       14
club_jersey_number                  14
club_join_date                    1936
contract_end_year        

In [6]:
df.shape

(17954, 92)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17954 entries, 0 to 17953
Data columns (total 92 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             17954 non-null  int64  
 1   name                           17954 non-null  object 
 2   full_name                      17954 non-null  object 
 3   birth_date                     17954 non-null  object 
 4   age                            17954 non-null  int64  
 5   height_cm                      17954 non-null  float64
 6   weight_kgs                     17954 non-null  float64
 7   positions                      17954 non-null  object 
 8   nationality                    17954 non-null  object 
 9   overall_rating                 17954 non-null  int64  
 10  potential                      17954 non-null  int64  
 11  value_euro                     17699 non-null  float64
 12  wage_euro                      17708 non-null 

In [8]:
# columns to drop
columns_to_drop = ['tags']  # Initialize with 'tags' column
threshold = 10000

# Iterate through columns and add those with more than threshold null values to the list
for column in df.columns:
    if df[column].isnull().sum() > threshold:
        columns_to_drop.append(column)

# Drop the columns from the DataFrame
df_threshold=df
df = df.drop(columns=columns_to_drop)


In [9]:
df = df.drop('traits',axis=1)

In [10]:
columns_to_check = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 
                    'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 
                    'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB']

# Drop rows columns have null values
df = df.dropna(subset=columns_to_check)


In [11]:
import pandas as pd

def clean_dataset(df):
    
    # Convert birth_date to datetime
    df['birth_date'] = pd.to_datetime(df['birth_date'])
    
    # Convert club_join_date to datetime
    df['club_join_date'] = pd.to_datetime(df['club_join_date'])
    
    # Convert 'work_rate' to two separate columns 'work_rate_attack' and 'work_rate_defense'
    df[['work_rate_attack', 'work_rate_defense']] = df['work_rate'].str.split('/', expand=True)
    df.drop(columns=['work_rate'], inplace=True)
    
    # Drop rows with missing values in specific columns
    df.dropna(subset=['release_clause_euro', 'club_rating'], inplace=True)
    
    # Convert 'club_join_date' to year
    df['club_join_year'] = df['club_join_date'].dt.year
    df.drop(columns=['club_join_date'], inplace=True)
    
    # Reset index
    df.reset_index(drop=True, inplace=True)
    
    return df


# Clean the dataset
df = clean_dataset(df)

# Display the cleaned dataset
print(df.head())


       id          name                       full_name birth_date  age  \
0  158023      L. Messi  Lionel Andrés Messi Cuccittini 1987-06-24   31   
1  190460    C. Eriksen    Christian  Dannemann Eriksen 1992-02-14   27   
2  195864      P. Pogba                      Paul Pogba 1993-03-15   25   
3  198219    L. Insigne                 Lorenzo Insigne 1991-06-04   27   
4  201024  K. Koulibaly               Kalidou Koulibaly 1991-06-20   27   

   height_cm  weight_kgs  positions nationality  overall_rating  potential  \
0     170.18        72.1   CF,RW,ST   Argentina              94         94   
1     154.94        76.2  CAM,RM,CM     Denmark              88         89   
2     190.50        83.9     CM,CAM      France              88         91   
3     162.56        59.0      LW,ST       Italy              88         88   
4     187.96        88.9         CB     Senegal              88         91   

    value_euro  wage_euro preferred_foot  international_reputation(1-5)  \
0  11

In [12]:
df = df.drop('club_join_year',axis=1)
df = df.drop('contract_end_year',axis=1)

In [13]:
# work_rate_defense into separate columns
def split_work_rate_defense(df):
    work_rate_defense = df['work_rate_defense'].str.get_dummies(sep='/')
    df = pd.concat([df, work_rate_defense], axis=1)
    df.drop(columns=['work_rate_defense'], inplace=True)
    return df
df = split_work_rate_defense(df)

In [14]:
import pandas as pd
# Split work_rate_attack into separate columns
def split_work_rate_attack(df):
    work_rate_attack = df['work_rate_attack'].str.get_dummies(sep='/')
    
    # Add the new columns to the DataFrame
    df = pd.concat([df, work_rate_attack], axis=1)
    
    
    # Drop the original work_rate_attack column
    df.drop(columns=['work_rate_attack'], inplace=True)
    return df

# Call the function to split the work_rate_attack column
df = split_work_rate_attack(df)

In [15]:
# Dummy variables

In [16]:
# club_position
def create_dummy_club_position(df):
    # club_position
    club_position_dummies = pd.get_dummies(df['club_position'], prefix='club_position')
    df = pd.concat([df, club_position_dummies], axis=1)
    df.drop(columns=['club_position'], inplace=True)
    return df
df = create_dummy_club_position(df)

In [17]:
# preferred_foot
def create_dummy_preferred_foot(df):
    preferred_foot_dummies = pd.get_dummies(df['preferred_foot'], prefix='preferred_foot', drop_first=True)
    df = pd.concat([df, preferred_foot_dummies], axis=1)
    df.drop(columns=['preferred_foot'], inplace=True)
    return df
df = create_dummy_preferred_foot(df)

In [18]:
# body_type
def create_dummy_body_type(df):
    body_type_dummies = pd.get_dummies(df['body_type'], prefix='body_type')
    df = pd.concat([df, body_type_dummies], axis=1)
    df.drop(columns=['body_type'], inplace=True)
    return df
df = create_dummy_body_type(df)

# MODEL 
Dynamic Player Rating Prediction

In [19]:
df.head()

Unnamed: 0,id,name,full_name,birth_date,age,height_cm,weight_kgs,positions,nationality,overall_rating,potential,value_euro,wage_euro,international_reputation(1-5),weak_foot(1-5),skill_moves(1-5),release_clause_euro,club_team,club_rating,club_jersey_number,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,freekick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,GK_diving,GK_handling,GK_kicking,GK_positioning,GK_reflexes,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,LCM,CM,RCM,RM,LWB,LDM,CDM,RDM,RWB,LB,LCB,CB,RCB,RB,High,Low,Medium,High.1,Low.1,Medium.1,club_position_CAM,club_position_CB,club_position_CDM,club_position_CF,club_position_CM,club_position_LAM,club_position_LB,club_position_LCB,club_position_LCM,club_position_LDM,club_position_LF,club_position_LM,club_position_LS,club_position_LW,club_position_LWB,club_position_RAM,club_position_RB,club_position_RCB,club_position_RCM,club_position_RDM,club_position_RES,club_position_RF,club_position_RM,club_position_RS,club_position_RW,club_position_RWB,club_position_ST,club_position_SUB,preferred_foot_Right,body_type_Akinfenwa,body_type_C. Ronaldo,body_type_Lean,body_type_Messi,body_type_Neymar,body_type_Normal,body_type_PLAYER_BODY_TYPE_25,body_type_Shaqiri,body_type_Stocky
0,158023,L. Messi,Lionel Andrés Messi Cuccittini,1987-06-24,31,170.18,72.1,"CF,RW,ST",Argentina,94,94,110500000.0,565000.0,5,4,4,226500000.0,FC Barcelona,86.0,10.0,86,95,70,92,86,97,93,94,89,96,91,86,93,95,95,85,68,72,66,94,48,22,94,94,75,96,33,28,26,6,11,15,14,8,89+2,89+2,89+2,93+2,93+2,93+2,93+2,93+2,93+2,93+2,93+2,91+2,85+2,85+2,85+2,91+2,64+2,61+2,61+2,61+2,64+2,59+2,48+2,48+2,48+2,59+2,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1,190460,C. Eriksen,Christian Dannemann Eriksen,1992-02-14,27,154.94,76.2,"CAM,RM,CM",Denmark,88,89,69500000.0,205000.0,3,5,4,133800000.0,Tottenham Hotspur,83.0,23.0,88,81,52,91,80,84,86,87,89,91,76,73,80,88,81,84,50,92,58,89,46,56,84,91,67,88,59,57,22,9,14,7,7,6,79+3,79+3,79+3,85+3,84+3,84+3,84+3,85+3,86+3,86+3,86+3,86+3,85+3,85+3,85+3,86+3,71+3,71+3,71+3,71+3,71+3,66+3,57+3,57+3,57+3,66+3,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
2,195864,P. Pogba,Paul Pogba,1993-03-15,25,190.5,83.9,"CM,CAM",France,88,91,73000000.0,255000.0,4,4,5,144200000.0,Manchester United,82.0,6.0,80,75,75,86,85,87,85,82,90,90,71,79,76,82,66,90,83,88,87,82,78,64,82,88,82,87,63,67,67,5,6,2,4,3,81+3,81+3,81+3,82+3,83+3,83+3,83+3,82+3,84+3,84+3,84+3,83+3,84+3,84+3,84+3,83+3,76+3,77+3,77+3,77+3,76+3,74+3,72+3,72+3,72+3,74+3,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
3,198219,L. Insigne,Lorenzo Insigne,1991-06-04,27,162.56,59.0,"LW,ST",Italy,88,88,62000000.0,165000.0,3,4,4,105400000.0,Napoli,82.0,24.0,86,77,56,85,74,90,87,77,78,93,94,86,94,83,93,75,53,75,44,84,34,26,83,87,61,83,51,24,22,8,4,14,9,10,78+3,78+3,78+3,86+3,85+3,85+3,85+3,86+3,86+3,86+3,86+3,86+3,78+3,78+3,78+3,86+3,63+3,58+3,58+3,58+3,63+3,58+3,44+3,44+3,44+3,58+3,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
4,201024,K. Koulibaly,Kalidou Koulibaly,1991-06-20,27,187.96,88.9,CB,Senegal,88,91,60000000.0,135000.0,3,3,2,106500000.0,Napoli,82.0,26.0,30,22,83,68,14,69,28,28,60,63,70,75,50,82,40,55,81,75,94,15,87,88,24,49,33,80,91,88,87,7,11,7,13,5,53+3,53+3,53+3,53+3,54+3,54+3,54+3,53+3,55+3,55+3,55+3,57+3,61+3,61+3,61+3,57+3,73+3,77+3,77+3,77+3,73+3,76+3,85+3,85+3,85+3,76+3,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib  # Import joblib for saving the model


# Define features and target
features = ['age', 'potential', 'height_cm', 'weight_kgs', 
            'crossing', 'finishing', 'dribbling',  
            'club_rating']
target = 'overall_rating'  

# Drop rows with missing values in features and target
df.dropna(subset=features + [target], inplace=True)

# Split data into features (X) and target (y)
X = df[features]
y = df[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the LinearRegression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 6.142878614679018


In [21]:
# For Accurcy R-squared
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print("R-squared (Coefficient of Determination):", r2)


R-squared (Coefficient of Determination): 0.8712114122126104


In [22]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

r2 = r2_score(y_test, y_pred)
print("R-squared (Coefficient of Determination):", r2)

Mean Squared Error (MSE): 6.142878614679018
Root Mean Squared Error (RMSE): 2.478483127777758
Mean Absolute Error (MAE): 1.9635986885021348
R-squared (Coefficient of Determination): 0.8712114122126104


In [23]:
# Save the trained model to a .sav file using joblib
model_filename = 'linear_regression_model.sav'
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")

Model saved to linear_regression_model.sav


In [24]:
# Function to predict future overall rating for a given player
def predict_player_rating(player_name):
    # Find the player data based on the player name
    player_data = df[df['name'] == player_name]
    
    if player_data.empty:
        print(f"No data found for player: {player_name}")
        return None
    
    # Extract the features for the player
    player_features = player_data[features].values
    
    # Predict the overall rating using the trained model
    predicted_rating = model.predict(player_features)
    
    # Return the predicted overall rating
    return predicted_rating[0]

# Example usage
player_name = input("Enter the player name: ")
predicted_rating = predict_player_rating(player_name)
if predicted_rating is not None:
    print(f"The predicted future performance rating for {player_name} is: {predicted_rating}")

Enter the player name: L. Messi
The predicted future performance rating for L. Messi is: 93.28394702090128


In [25]:
for column in X:
    print(column)

age
potential
height_cm
weight_kgs
crossing
finishing
dribbling
club_rating
