# Player Market Price Estimator — EDA, Training & SHAP
This notebook runs the full pipeline interactively: EDA → Feature engineering → Train (RF & XGB) → Evaluate → SHAP explainability.

In [1]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
# Show every column when a wide DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline


In [2]:
# Load the pre-processed player table and report how many rows it holds.
players_csv = 'data/processed/players_processed.csv'
df = pd.read_csv(players_csv)
print('Rows:', df.shape[0])
# Preview the first rows as the cell output.
df.head()


## Quick data summary

In [3]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
summary_stats = df.describe(include='all')
display(summary_stats)


## Market value distribution (original and log)

In [4]:
# Side-by-side histograms: numeric market value (left) and the 'target' column (right).
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(10, 4))
sns.histplot(df['market_value_num'], bins=40, ax=ax_raw)
ax_raw.set_title('market_value (numeric)')
sns.histplot(df['target'], bins=40, ax=ax_log)
ax_log.set_title('log1p(market_value) target')
plt.show()


## Correlations

In [5]:
# Pairwise correlations between the main numeric columns and the market value.
corr_cols = ['age', 'goals_per_90', 'assists_per_90', 'contract_years_left', 'market_value_num']
corr_matrix = df[corr_cols].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()


## Feature engineering (age^2, is_young)

In [6]:
# Derived age features: squared age and a 0/1 flag for players younger than 23.
df['age_sq'] = df['age'].pow(2)
df['is_young'] = df['age'].lt(23).astype(int)
# Preview the new columns next to the raw age.
df[['age', 'age_sq', 'is_young']].head()


## Train/Test split and feature set

In [7]:
# Numeric model inputs. 'is_young' is created in the feature-engineering cell but
# was missing from this list, so the engineered flag never reached the models.
features = ['age','age_sq','is_young','goals_per_90','assists_per_90','contract_years_left','club_rank_pct']
# Categorical model inputs; these are encoded in the preprocessing step.
cat = ['pos_bucket','nationality']
X = df[features + cat]
# 'target' is the column plotted above under the title 'log1p(market_value) target'.
y = df['target']
# 80/20 split with a fixed random_state so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Train:', len(X_train), 'Test:', len(X_test))


## Preprocessing: scale numeric features, one-hot encode low-cardinality categoricals, target-encode high-cardinality categoricals

In [8]:
# Column groups: numeric features are standardised, the high-cardinality
# 'nationality' column is target-encoded, and 'pos_bucket' is one-hot encoded.
num_features = features
high_card = ['nationality']
cat_small = ['pos_bucket']
# sparse_threshold=0 makes the ColumnTransformer always return a dense array.
# It replaces OneHotEncoder(sparse=False): that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so neither spelling
# works on every version, whereas sparse_threshold does.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('te', TargetEncoder(), high_card),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_small)
], remainder='drop', sparse_threshold=0)
# Fit on the training split only, then apply the fitted transform to both splits.
preprocessor.fit(X_train, y_train)
X_train_p = preprocessor.transform(X_train)
X_test_p = preprocessor.transform(X_test)
print('Processed shapes:', X_train_p.shape, X_test_p.shape)


## Train Random Forest (on preprocessed features)

In [9]:
# Random forest on the preprocessed matrix; fixed seed for reproducibility,
# n_jobs=-1 uses all available cores.
rf = RandomForestRegressor(n_estimators=150, max_depth=12, random_state=42, n_jobs=-1)
rf.fit(X_train_p, y_train)
rf_preds = rf.predict(X_test_p)
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))
print('RF RMSE:', rf_rmse)
print('RF R2:', r2_score(y_test, rf_preds))


## Train XGBoost (on preprocessed features)

In [10]:
# Carve a validation set out of the training data for early stopping. Stopping on
# the test set would tune the number of trees on the same data used for the final
# score, making the reported test metrics optimistic.
X_fit_p, X_val_p, y_fit, y_val = train_test_split(
    X_train_p, y_train, test_size=0.2, random_state=42
)
# early_stopping_rounds is given to the constructor: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0.
xgb = XGBRegressor(
    n_estimators=200, max_depth=6, learning_rate=0.05, random_state=42,
    verbosity=0, early_stopping_rounds=20,
)
# XGBoost accepts numpy arrays
xgb.fit(X_fit_p, y_fit, eval_set=[(X_val_p, y_val)], verbose=False)
xgb_preds = xgb.predict(X_test_p)
# sqrt(MSE) instead of squared=False, which scikit-learn removed in 1.6.
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_preds))
print('XGB RMSE:', xgb_rmse)
print('XGB R2:', r2_score(y_test, xgb_preds))


## Save best model and preprocessor (pipeline)

In [11]:
# Choose the model with the lower test RMSE (a tie goes to the random forest).
# sqrt(MSE) is used because mean_squared_error(..., squared=False) was removed
# in scikit-learn 1.6.
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_preds))
best = ('xgb', xgb) if xgb_rmse < rf_rmse else ('rf', rf)
# joblib.dump does not create missing directories, so make sure models/ exists.
Path('models').mkdir(parents=True, exist_ok=True)
# Persist the fitted preprocessor and the winning model as separate files.
joblib.dump(preprocessor, 'models/preprocessor.pkl')
joblib.dump(best[1], f"models/best_model_{best[0]}.pkl")
print('Saved model:', best[0])


## SHAP explanation for the best model (XGBoost or RF)

In [12]:
# Reload the model that was just written to disk.
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced yourself.
model = joblib.load(f"models/best_model_{best[0]}.pkl")
# Both candidates (random forest, XGBoost) are tree ensembles, which TreeExplainer handles.
explainer = shap.TreeExplainer(model)
# Explain only the first 200 test rows to keep the SHAP computation small.
X_shap = X_test_p[:200]
shap_values = explainer.shap_values(X_shap)
# The preprocessed matrix carries no column names, so without explicit names the
# plot labels features by position. Rebuild the names in ColumnTransformer order:
# numeric columns, target-encoded columns, then the one-hot columns.
onehot = preprocessor.named_transformers_['cat']
feature_names = (
    list(num_features)
    + list(high_card)
    + list(onehot.get_feature_names_out(cat_small))
)
assert len(feature_names) == X_shap.shape[1], 'feature names do not match the preprocessed columns'
# summary plot
shap.summary_plot(shap_values, X_shap, feature_names=feature_names)


## Final: Compare a few predictions with actual values on the original scale (inverting log1p)

In [13]:
# Take the test-set predictions of whichever model was selected as best.
if best[0] == 'xgb':
    best_log_preds = xgb_preds
else:
    best_log_preds = rf_preds
# Undo the log1p transform so both columns are on the original value scale.
preds_orig = np.expm1(best_log_preds)
actual_orig = np.expm1(y_test.values)
# First ten predictions next to the corresponding actual values.
comp = pd.DataFrame({'pred': preds_orig, 'actual': actual_orig}).head(10)
comp
