In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [None]:

# Load the player table from the CSV file (path is relative to the working directory).
source_csv = 'CLEAN_FIFA21_official_data.csv'
df = pd.read_csv(source_csv)


In [None]:

# Display basic information about the dataset.
# df.info() prints its report itself; df.head() only renders when it is the
# cell's last expression, so it has to come second.
df.info()  # Data types and non-null counts
df.head()  # First few rows


In [None]:

# Check for missing values and duplicates.
# Both results are printed explicitly: a notebook cell only renders its last
# expression, so a bare df.isnull().sum() on an earlier line would be discarded.
missing_per_column = df.isnull().sum()
duplicate_row_count = df.duplicated().sum()
print(missing_per_column)
print(f'Duplicate rows: {duplicate_row_count}')


In [None]:

# Convert 'Height(cm.)' to float. Values that cannot be parsed become NaN
# instead of aborting the cell, and the number of such values is reported so
# the coercion is never silent.
height_raw = df['Height(cm.)']
height_cm = pd.to_numeric(height_raw, errors='coerce').astype(float)
height_unparsed = int(height_cm.isna().sum() - height_raw.isna().sum())
if height_unparsed:
    print(f'Height(cm.): {height_unparsed} value(s) could not be parsed and were set to NaN')
df['Height(cm.)'] = height_cm


In [None]:

# Convert 'Weight(lbs.)' to float. Values that cannot be parsed become NaN
# instead of aborting the cell, and the number of such values is reported so
# the coercion is never silent.
weight_raw = df['Weight(lbs.)']
weight_lbs = pd.to_numeric(weight_raw, errors='coerce').astype(float)
weight_unparsed = int(weight_lbs.isna().sum() - weight_raw.isna().sum())
if weight_unparsed:
    print(f'Weight(lbs.): {weight_unparsed} value(s) could not be parsed and were set to NaN')
df['Weight(lbs.)'] = weight_lbs


In [None]:

# Break the 'Joined' string on '-' and store the three integer pieces as
# 'Year', 'Month' and 'Day' (assigned positionally, in that order).
joined_parts = df['Joined'].str.split('-', expand=True).astype(int)
df[['Year', 'Month', 'Day']] = joined_parts

# Clean currency columns (remove symbols and convert to numeric)
def clean_currency(column, frame=None):
    """Return one money column as a float Series.

    Strips the currency symbols '€' and '£', thousands separators and
    whitespace, and expands a trailing 'M' (millions) or 'K' (thousands)
    suffix, e.g. '€1.5M' -> 1500000.0.

    Parameters
    ----------
    column : str
        Name of the column to convert.
    frame : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``,
        which keeps existing ``clean_currency('Value(£)')`` calls working.

    Returns
    -------
    pandas.Series
        Float values; entries that were missing in the input stay NaN.

    Raises
    ------
    ValueError
        If a non-missing entry is still not a valid number after cleaning.
    """
    source = df if frame is None else frame
    raw = source[column]

    # Already numeric (e.g. the cell was re-run): nothing to strip.
    if pd.api.types.is_numeric_dtype(raw):
        return raw.astype(float)

    # Remember real gaps before astype(str) turns them into 'nan'/'None' text.
    missing = raw.isna()
    text = raw.astype(str).str.strip()
    text = text.str.replace(r'[€£,\s]', '', regex=True)
    # Only a trailing suffix is a magnitude marker; anchoring avoids rewriting
    # an 'M' or 'K' that appears elsewhere in the value.
    text = text.str.replace(r'M$', 'e6', regex=True).str.replace(r'K$', 'e3', regex=True)
    return text.mask(missing).astype(float)


In [None]:

# Run clean_currency over each monetary column and write the result back in place.
for money_column in ('Value(£)', 'Wage(£)', 'Release Clause(£)'):
    df[money_column] = clean_currency(money_column)


In [None]:

# Persist the cleaned frame as a CSV file, leaving the row index out.
cleaned_csv = 'cleaned_fifa21_data.csv'
df.to_csv(cleaned_csv, index=False)


In [None]:

# Pairwise correlations between the numeric columns.
# include='number' selects every numeric dtype, so integer columns that are not
# exactly int64 (e.g. int32) are no longer silently left out.
numeric_columns = df.select_dtypes(include='number')
correlation_matrix = numeric_columns.corr()


In [None]:

# Draw the correlation matrix as an annotated heatmap on an explicit 13x9 Axes.
fig, ax = plt.subplots(figsize=(13, 9))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Player Attributes')
ax.grid(True)
plt.show()


In [None]:

# Get the top player for each position (based on 'Overall' rating).
# idxmax returns the row label of the highest 'Overall' within each position
# (the first row wins ties). Selecting those rows with .loc keeps
# 'Best Position' as an ordinary column, which groupby(...).apply() on the
# whole frame does not guarantee across pandas versions.
# Assumes df has a unique index (the default after read_csv).
top_row_labels = df.groupby('Best Position')['Overall'].idxmax()
best_players = df.loc[top_row_labels]
print(best_players[['Name', 'Best Position', 'Overall']])


In [None]:

# Age against Overall rating, one semi-transparent point per row of df.
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='Age', y='Overall', alpha=0.6, ax=ax)
ax.set_title('Age vs Overall Rating')
ax.set_xlabel('Age')
ax.set_ylabel('Overall Rating')
plt.show()


In [None]:

# Age against Potential rating, one semi-transparent sky-blue point per row of df.
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='Age', y='Potential', alpha=0.6, color='skyblue', ax=ax)
ax.set_title('Age vs Potential Rating')
ax.set_xlabel('Age')
ax.set_ylabel('Potential Rating')
plt.show()


In [None]:

# 'Potential Gap' = Potential minus Overall, computed row by row.
df['Potential Gap'] = df['Potential'].sub(df['Overall'])


In [None]:

# The 20 rows with the largest 'Potential Gap', shown with a few identifying columns.
report_columns = ['Name', 'Overall', 'Potential', 'Value(£)', 'Potential Gap', 'Nationality']
ranked_by_gap = df.sort_values(by='Potential Gap', ascending=False)
underrated_players = ranked_by_gap.head(20)
print(underrated_players[report_columns])


In [None]:

# Overall rating against market value, with grid lines switched on.
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='Overall', y='Value(£)', alpha=0.6, ax=ax)
ax.set_title('Overall Rating vs Market Value (£)')
ax.set_xlabel('Overall Rating')
ax.set_ylabel('Market Value (£)')
ax.grid(True)
plt.show()


In [None]:

# Bar chart of the ten most frequent values in 'Nationality'.
top_Nationalities = df['Nationality'].value_counts().head(10)
ax = top_Nationalities.plot(kind='bar', figsize=(12, 7))
ax.set_title('Top 10 Nationalities by Player Count')
ax.set_xlabel('Nationalities')
ax.set_ylabel('Number of Players')
plt.show()


In [None]:

# Bar chart of the last ten entries of the 'Nationality' value counts
# (value_counts sorts descending, so these are the least frequent).
Least_Nationalities = df['Nationality'].value_counts().tail(10)
ax = Least_Nationalities.plot(kind='bar', figsize=(12, 7))
ax.set_title('Bottom 10 Nationalities by Player Count')
ax.set_xlabel('Nationalities')
ax.set_ylabel('Number of Players')
plt.show()


In [None]:

# Mean 'Overall' per club, highest first; keep the top 11 and then drop the
# very first entry, leaving ranks 2-11.
# NOTE(review): the reason for skipping the top entry is not visible here - confirm.
mean_overall_by_club = df.groupby('Club')['Overall'].mean()
club_ratings = mean_overall_by_club.sort_values(ascending=False).head(11)
club_ratings_without_first = club_ratings.iloc[1:]
print(club_ratings_without_first)


In [None]:

# Wage against Overall rating, with grid lines switched on.
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='Wage(£)', y='Overall', alpha=0.6, ax=ax)
ax.set_title('Wage vs Overall Rating')
ax.set_xlabel('Wage (£)')
ax.set_ylabel('Overall Rating')
ax.grid(True)
plt.show()


In [None]:

# Modelling inputs: three predictor columns (X) and the market value target (y).
feature_columns = ['Overall', 'Potential', 'Age']
X = df[feature_columns]
y = df['Value(£)']


In [None]:

# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle.
raw_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = raw_split


In [None]:

# Fit an ordinary LinearRegression on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)


In [None]:

# Score the fitted model on the held-out rows using mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse:.2f}')


In [None]:

# Players younger than 21, ranked by 'Potential Gap'; print the ten largest gaps.
under_21_mask = df['Age'] < 21
young_players = df[under_21_mask]
young_ranked = young_players.sort_values(by='Potential Gap', ascending=False)
young_high_potential = young_ranked.head(10)
print(young_high_potential[['Name', 'Age', 'Overall', 'Potential', 'Potential Gap']])


In [None]:

# Apply log1p to the target, then split again with the same test_size and
# random_state as the earlier split.
y_log = np.log1p(y)
log_split = train_test_split(X, y_log, test_size=0.2, random_state=42)
X_train, X_test, y_train_log, y_test_log = log_split


In [None]:

# Train the model with log-transformed target.
# Note: this refits the existing `model` object in place, so from here on
# `model` holds the log-target fit rather than the raw-value fit.
model.fit(X_train, y_train_log)
y_pred_log = model.predict(X_test)
mse_log = mean_squared_error(y_test_log, y_pred_log)
print(f'New Mean Squared Error (Log Transformed): {mse_log:.2f}')

# mse_log is measured in log1p units and cannot be compared with the MSE of the
# raw-value model. Undo the transform with expm1 so both errors are expressed
# in the same units as the target.
mse_log_original_scale = mean_squared_error(np.expm1(y_test_log), np.expm1(y_pred_log))
print(f'Mean Squared Error (Log Model, Original Scale): {mse_log_original_scale:.2f}')


In [None]:

# Print each feature name next to the coefficient the fitted model assigned to it.
importances = model.coef_
feature_names = X.columns
for position, feature in enumerate(feature_names):
    print(f'{feature}: {importances[position]:.2f}')


In [None]:

# Held-out log targets (x) against the model's log predictions (y).
fig, ax = plt.subplots()
ax.scatter(y_test_log, y_pred_log, alpha=0.6)
ax.set_title('Predicted vs Actual (Log-Transformed)')
ax.set_xlabel('Actual Log Market Value')
ax.set_ylabel('Predicted Log Market Value')
plt.show()


In [None]:

# Visualize feature coefficients.
# The values are read from the fitted model rather than typed in by hand, so
# the chart always matches the features in X and the most recent fit of `model`.
feature_impact = {
    'Feature': list(X.columns),
    'Coefficient': list(model.coef_)
}
sns.barplot(x='Coefficient', y='Feature', data=feature_impact, palette='coolwarm')
plt.title('Feature Coefficients Impact on Market Value')
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.show()
