In [2]:
# FIFA Dataset - EDA Visualization
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import joblib
import kaleido




In [3]:

# Load data
# low_memory=False makes pandas infer every column's dtype from the whole file
# in a single pass instead of chunk by chunk. The saved output of this cell
# shows a warning raised from this read_csv call (presumably pandas'
# mixed-type DtypeWarning -- confirm on the next Restart & Run All).
df = pd.read_csv("../data/fifa_players.csv", low_memory=False)
print(f"Loaded: {df.shape[0]} players, {df.shape[1]} features")

# Load results
# NOTE(review): joblib.load unpickles the file and can therefore execute
# arbitrary code; only load artifacts written by this project's own training run.
model_artifacts = joblib.load('../models/fifa_model_complete.pkl')
training_results = joblib.load('../models/training_results.pkl')

# Feature importances (plotted in section 10) and per-position scores (section 12).
mean_importance = model_artifacts['feature_importance']
position_scores = training_results['position_scores']

# Validation-split metrics (plotted in section 11).
val_metrics = training_results['val_metrics']
val_hamming = val_metrics['hamming']
val_accuracy = val_metrics['accuracy']
val_f1_macro = val_metrics['f1_macro']
val_f1_micro = val_metrics['f1_micro']

# Test-split metrics (plotted in section 11).
test_metrics = training_results['test_metrics']
test_hamming = test_metrics['hamming']
test_accuracy = test_metrics['accuracy']
test_f1_macro = test_metrics['f1_macro']
test_f1_micro = test_metrics['f1_micro']

  df = pd.read_csv("../data/fifa_players.csv")


Loaded: 19239 players, 80 features


In [4]:
# 1. AGE DISTRIBUTION
# Histogram of the 'age' column drawn in a single blue colour, legend hidden.
age_hist_options = dict(
    x='age',
    nbins=30,
    title='Player Age Distribution',
    labels={'age': 'Age', 'count': 'Number of Players'},
    color_discrete_sequence=['#3498db'],
)
fig1 = px.histogram(df, **age_hist_options)
fig1.update_layout(showlegend=False)
fig1.show()

In [5]:
# 2. OVERALL RATING VS POTENTIAL
# Endpoints of the reference diagonal: the observed range of 'overall'.
overall_lo = df['overall'].min()
overall_hi = df['overall'].max()

fig2 = px.scatter(
    df,
    x='overall',
    y='potential',
    title='Overall Rating vs Potential',
    labels={'overall': 'Current Rating', 'potential': 'Potential'},
    opacity=0.5,
    color_discrete_sequence=['#e74c3c'],
)

# Dashed gray y=x line: points on it have potential equal to current rating.
identity_line = go.Scatter(
    x=[overall_lo, overall_hi],
    y=[overall_lo, overall_hi],
    mode='lines',
    name='y=x',
    line={'color': 'gray', 'dash': 'dash'},
)
fig2.add_trace(identity_line)
fig2.show()

In [6]:
# 3. TOP 10 COUNTRIES BY PLAYER COUNT
# value_counts() orders nationalities by frequency; keep the first ten.
nationality_counts = df['nationality_name'].value_counts()
top_countries = nationality_counts.head(10)

country_names = top_countries.index
country_totals = top_countries.values

# Bar height and bar colour both encode the player count.
fig3 = px.bar(
    x=country_names,
    y=country_totals,
    title='Top 10 Countries by Player Count',
    labels={'x': 'Country', 'y': 'Number of Players'},
    color=country_totals,
    color_continuous_scale='Viridis',
)
fig3.update_layout(showlegend=False, xaxis_tickangle=-45)
fig3.show()

In [7]:
# 4. POSITION DISTRIBUTION

# 'player_positions' can list several positions separated by ', ';
# split them and give each position its own row before counting.
positions_per_player = df['player_positions'].str.split(', ')
all_positions = positions_per_player.explode()
position_counts = all_positions.value_counts().head(15)

position_names = position_counts.index
position_totals = position_counts.values

# Bar height and bar colour both encode the count.
fig4 = px.bar(
    x=position_names,
    y=position_totals,
    title='Player Position Distribution (Top 15)',
    labels={'x': 'Position', 'y': 'Number of Players'},
    color=position_totals,
    color_continuous_scale='Blues',
)
fig4.update_layout(showlegend=False)
fig4.show()


In [8]:
# 5. WAGE VS PLAYER VALUE

# Keep only rows where both wage and value are present and strictly positive
# (both axes below are logarithmic).
with_pay_data = df.dropna(subset=['wage_eur', 'value_eur'])
wage_is_positive = with_pay_data['wage_eur'] > 0
value_is_positive = with_pay_data['value_eur'] > 0
df_clean = with_pay_data[wage_is_positive & value_is_positive]

wage_value_options = dict(
    x='value_eur',
    y='wage_eur',
    title='Wage vs Player Value',
    labels={'value_eur': 'Value (€)', 'wage_eur': 'Wage (€)'},
    log_x=True,
    log_y=True,
    opacity=0.4,
    color_discrete_sequence=['#2ecc71'],
)
fig5 = px.scatter(df_clean, **wage_value_options)
fig5.show()

In [9]:
# 6. MAIN SKILLS COMPARISON

# Column-wise mean of the six headline skill attributes.
skills = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']
skill_means = df[skills].mean()
skill_averages = skill_means.values

# Bar height and bar colour both encode the average.
fig6 = px.bar(
    x=skills,
    y=skill_averages,
    title='Average Values of Main Skills',
    labels={'x': 'Skill', 'y': 'Average Value'},
    color=skill_averages,
    color_continuous_scale='RdYlGn',
)
fig6.update_layout(showlegend=False)
fig6.show()

In [10]:
# 7. PREFERRED FOOT DISTRIBUTION

# One pie slice per distinct value of 'preferred_foot', sized by player count.
foot_counts = df['preferred_foot'].value_counts()
foot_pie_options = dict(
    values=foot_counts.values,
    names=foot_counts.index,
    title='Preferred Foot Distribution',
    color_discrete_sequence=['#9b59b6', '#f39c12'],
)
fig7 = px.pie(**foot_pie_options)
fig7.show()

In [11]:
# 8. AGE VS PLAYER RATING

# Scatter of rating against age with a linear (ordinary least squares) trend
# line; an earlier revision used 'lowess' here. Plotly Express fits trendlines
# through statsmodels, which must be installed for this cell to run.
age_rating_options = dict(
    x='age',
    y='overall',
    title='Age vs Player Rating',
    labels={'age': 'Age', 'overall': 'Rating'},
    trendline='ols',
    opacity=0.3,
    color_discrete_sequence=['#e67e22'],
)
fig8 = px.scatter(df, **age_rating_options)
fig8.show()

In [12]:
# 9. SKILL CORRELATION MATRIX

# Pairwise correlations between the six main skills and the overall rating.
skills_extended = ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'overall']
skill_frame = df[skills_extended]
corr_matrix = skill_frame.corr()

# Heatmap on a diverging red/blue scale.
fig9 = px.imshow(
    corr_matrix,
    title='Skill Correlation Matrix',
    labels={'color': 'Correlation'},
    color_continuous_scale='RdBu_r',
    aspect='auto',
)
# Put the x-axis labels underneath the heatmap.
fig9.update_xaxes(side='bottom')
fig9.show()

In [13]:
# 10. TOP 20 FEATURE IMPORTANCE

# head(20) keeps the first 20 entries as stored; they are the 20 most
# important features only if mean_importance was saved sorted in descending
# order -- TODO confirm against the training notebook.
top_20_features = mean_importance.head(20)

# Horizontal bars: importance on x, feature name on y.
importance_bars = go.Bar(
    x=top_20_features.values,
    y=top_20_features.index,
    orientation='h',
    marker={'color': 'steelblue'},
)
fig10 = go.Figure(importance_bars)

fig10.update_layout(
    title='Top 20 Feature Importance',
    xaxis_title='Average Feature Importance',
    yaxis_title='Feature',
    height=600,
    # Reverse the y-axis so the first entry is drawn at the top.
    yaxis={'autorange': "reversed"},
)
fig10.show()

# Static PNG export; the export size is set independently of the on-screen height.
fig10.write_image("../images/average_feature_importance.png", width=800, height=500)


In [14]:
# 11. MODEL PERFORMANCE: VALIDATION VS TEST
# Grouped bars: one group per metric, one bar per data split.
metrics = ['Hamming Loss', 'Accuracy', 'F1-Macro', 'F1-Micro']
val_values = [val_hamming, val_accuracy, val_f1_macro, val_f1_micro]
test_values = [test_hamming, test_accuracy, test_f1_macro, test_f1_micro]

fig11 = go.Figure()

# (legend name, metric values, bar colour) for each split, in drawing order.
split_series = (
    ('Validation', val_values, 'lightblue'),
    ('Test', test_values, 'lightcoral'),
)
for split_label, split_scores, split_color in split_series:
    fig11.add_trace(go.Bar(
        name=split_label,
        x=metrics,
        y=split_scores,
        marker_color=split_color,
        opacity=0.8,
    ))

fig11.update_layout(
    title='Model Performance: Validation vs Test',
    xaxis_title='Metrics',
    yaxis_title='Score',
    barmode='group',
    # Fix the y-axis to [0, 1] for every metric.
    yaxis={'range': [0, 1]},
    height=500,
)
fig11.show()

In [15]:
# 12. PER-POSITION TEST ACCURACY

# position_scores maps each position to its score; split it into parallel lists.
positions = [*position_scores.keys()]
accuracies = [*position_scores.values()]

accuracy_bars = go.Bar(
    x=positions,
    y=accuracies,
    marker={'color': 'lightgreen'},
    opacity=0.7,
)
fig12 = go.Figure(accuracy_bars)

fig12.update_layout(
    title='Per-Position Test Accuracy',
    xaxis_title='Position',
    yaxis_title='Test Accuracy',
    # Fix the y-axis to [0, 1].
    yaxis={'range': [0, 1]},
    height=500,
)
fig12.show()

# Static PNG export of the same figure.
fig12.write_image("../images/per_position_accuracy.png", width=800, height=500)