In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# === Step 1: Load Data ===
# Read the three pre-cleaned CSV exports; paths are relative to the working directory.
player_df, club_df, search_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "cleaned_player_profiles.csv",
        "cleaned_club_profiles.csv",
        "cleaned_search_history.csv",
    )
)

# === Step 2: Fix vector formats ===
def fix_and_parse_vector_column(df, column_name):
    """Parse a column of stringified numeric vectors into float32 NumPy arrays.

    Each cell is expected to be a string such as ``"[0.1 0.2 0.3]"``. Brackets
    and commas are treated as separators, so space-separated, comma-separated,
    multi-space and multi-line renderings all parse to the same vector.

    The text is converted with ``float`` only. It is never evaluated as Python
    code, so CSV content cannot execute anything.

    Args:
        df: DataFrame that holds the column.
        column_name: Name of the column whose cells are vector strings.

    Returns:
        pandas.Series indexed like ``df`` whose values are 1-D ``np.float32``
        arrays. Nested brackets are flattened into a single vector.

    Raises:
        ValueError: If a cell contains a token that is not a valid float literal.
        AttributeError: If a cell is not a string (e.g. NaN or an already-parsed array).
    """
    def _parse_vector(raw):
        # Turning every delimiter into whitespace lets str.split() absorb repeated
        # spaces, padding after "[" and embedded newlines in one step.
        tokens = raw.replace("[", " ").replace("]", " ").replace(",", " ").split()
        return np.array([float(token) for token in tokens], dtype=np.float32)

    return df[column_name].apply(_parse_vector)

# Replace each stringified vector column with its parsed form on the owning frame.
for frame, vector_col in (
    (player_df, 'play_style_vector'),
    (player_df, 'stats_vector'),
    (club_df, 'style_of_play'),
    (search_df, 'style_fit_vector'),
):
    frame[vector_col] = fix_and_parse_vector_column(frame, vector_col)

# === Step 3: Feature Encoding ===
# Turn the categorical labels into integer codes (the fitted encoders are not kept).
position_codes = LabelEncoder().fit_transform(player_df['position_main'])
league_codes = LabelEncoder().fit_transform(club_df['league'])
player_df['position_encoded'] = position_codes
club_df['league_encoded'] = league_codes

# === Step 4: Normalize vectors ===
# One scaler per vector family; each is fitted on every row of its own frame.
scaler_stats, scaler_style = StandardScaler(), StandardScaler()
stats_matrix = scaler_stats.fit_transform(list(player_df['stats_vector']))
style_matrix = scaler_style.fit_transform(list(club_df['style_of_play']))

# Store one scaled row vector per DataFrame row.
player_df['stats_scaled'] = [scaled_row for scaled_row in stats_matrix]
club_df['style_scaled'] = [scaled_row for scaled_row in style_matrix]

# === Step 5: Generate positive training samples ===
# A dedicated seeded generator makes the sampled pairs identical on every
# Restart & Run All, without touching the global `random` state.
PAIR_SAMPLING_SEED = 42
MAX_SAMPLES_PER_CLASS = 2000
pair_rng = random.Random(PAIR_SAMPLING_SEED)

# NOTE(review): every searched player is paired with a *randomly chosen* club, so
# label 1 does not represent an observed player-club match. If search_df records
# which club a search relates to, that column should be used here — confirm.
club_id_pool = club_df['club_id'].values  # hoisted: does not change between iterations
# Only player_id is needed, so iterate the column instead of iterrows(), which
# builds a Series per row and does not preserve column dtypes.
positive_pairs = [
    {'player_id': player_id, 'club_id': pair_rng.choice(club_id_pool), 'label': 1}
    for player_id in search_df['player_id']
]

# === Step 6: Generate negative samples ===
player_ids = player_df['player_id'].unique()
club_ids = club_df['club_id'].unique()
existing_pairs = {(p['player_id'], p['club_id']) for p in positive_pairs}

# Rejection sampling only terminates when enough unused (player, club)
# combinations exist, so cap the target at the number that is actually free.
known_players, known_clubs = set(player_ids), set(club_ids)
taken_in_grid = sum(
    1 for p_id, c_id in existing_pairs
    if p_id in known_players and c_id in known_clubs
)
available_negatives = len(player_ids) * len(club_ids) - taken_in_grid
negative_target = min(len(positive_pairs), available_negatives)

negative_pairs = []
while len(negative_pairs) < negative_target:
    p_id = pair_rng.choice(player_ids)
    c_id = pair_rng.choice(club_ids)
    if (p_id, c_id) not in existing_pairs:
        negative_pairs.append({
            'player_id': p_id,
            'club_id': c_id,
            'label': 0
        })
        # Record the pair so the same negative is never drawn twice.
        existing_pairs.add((p_id, c_id))

# === Step 7: Sample to avoid memory issues ===
positive_sample = pair_rng.sample(positive_pairs, k=min(MAX_SAMPLES_PER_CLASS, len(positive_pairs)))
negative_sample = pair_rng.sample(negative_pairs, k=min(MAX_SAMPLES_PER_CLASS, len(negative_pairs)))

# Shuffle positives and negatives together with a fixed seed so row order is reproducible.
combined_data = (
    pd.DataFrame(positive_sample + negative_sample)
    .sample(frac=1, random_state=PAIR_SAMPLING_SEED)
    .reset_index(drop=True)
)

# === Step 8: Merge features ===
# Attach player-side, then club-side, features to every sampled pair. Both joins use
# pandas' default inner join, so a pair whose id has no match in a frame is dropped.
merged = pd.merge(
    pd.merge(
        combined_data,
        player_df[['player_id', 'position_encoded', 'stats_scaled']],
        on='player_id',
    ),
    club_df[['club_id', 'league_encoded', 'style_scaled']],
    on='club_id',
)

# === Step 9: Train/Validation Split ===
# Hold out 20% of the merged rows for validation; the fixed random_state keeps the split stable.
train_data, val_data = train_test_split(merged, random_state=42, test_size=0.2)

# train_data and val_data are the frames consumed by the modelling cell.

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model
import numpy as np

# === Step 10: Prepare X and y ===
def prepare_features(df):
    """Assemble the four model inputs from a merged feature frame.

    Args:
        df: DataFrame with columns 'stats_scaled' and 'style_scaled' (one
            array per row) plus 'position_encoded' and 'league_encoded'.

    Returns:
        list: ``[player stats, position, club style, league]``. The two vector
        columns are stacked into 2-D arrays; the two encoded columns are
        reshaped to ``(n_rows, 1)``.
    """
    def _stack_vectors(column):
        # One row per sample, one column per vector component.
        return np.stack(df[column].values)

    def _as_column(column):
        # Column vector of integer codes.
        return df[column].values.reshape(-1, 1)

    return [
        _stack_vectors('stats_scaled'),
        _as_column('position_encoded'),
        _stack_vectors('style_scaled'),
        _as_column('league_encoded'),
    ]

# Build the model inputs and the binary targets for each split.
X_train, X_val = prepare_features(train_data), prepare_features(val_data)
y_train, y_val = train_data['label'].values, val_data['label'].values

# === Step 11: Dynamically Set Embedding Dimensions ===
# Size the embedding tables from the full encoded columns, not from train_data.
# train_data is only a sample, while validation rows and the recommender (which
# scores every club) can carry codes above the training maximum; those would be
# out-of-range embedding lookups.
num_positions = int(player_df['position_encoded'].max()) + 1
num_leagues = int(club_df['league_encoded'].max()) + 1

# === Step 12: Define the Model ===

# Seed Python, NumPy and the backend so weight initialisation, dropout and batch
# shuffling repeat across runs. Note: this also reseeds the global `random` and
# NumPy generators.
tf.keras.utils.set_random_seed(42)

# Read the input widths from the prepared arrays instead of hard-coding them, so
# the model keeps working if the vector lengths in the CSVs change.
stats_dim = X_train[0].shape[1]
style_dim = X_train[2].shape[1]

# Player stream: stats vector + learned position embedding -> dense representation
player_stats = layers.Input(shape=(stats_dim,), name='player_stats')
player_position = layers.Input(shape=(1,), name='player_position')
pos_embed = layers.Embedding(input_dim=num_positions, output_dim=4)(player_position)
pos_flat = layers.Flatten()(pos_embed)
player_concat = layers.Concatenate()([player_stats, pos_flat])
player_dense = layers.Dense(64, activation='relu')(player_concat)

# Club stream: style vector + learned league embedding -> dense representation
club_style = layers.Input(shape=(style_dim,), name='club_style')
club_league = layers.Input(shape=(1,), name='club_league')
league_embed = layers.Embedding(input_dim=num_leagues, output_dim=4)(club_league)
league_flat = layers.Flatten()(league_embed)
club_concat = layers.Concatenate()([club_style, league_flat])
club_dense = layers.Dense(64, activation='relu')(club_concat)

# Merge both streams.
# Named `merged_streams` so this tensor does not overwrite the `merged` feature
# DataFrame built in the data-preparation cell.
merged_streams = layers.Concatenate()([player_dense, club_dense])
x = layers.Dense(64, activation='relu')(merged_streams)
x = layers.Dropout(0.3)(x)
# Single sigmoid unit: predicted probability that the pair carries label 1.
output = layers.Dense(1, activation='sigmoid')(x)

# Input order must match the list returned by prepare_features().
model = Model(inputs=[player_stats, player_position, club_style, club_league], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# === Step 13: Train the Model ===
# Fit for 10 epochs in mini-batches of 32, scoring the validation split after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

# === Step 14: Evaluate ===
# evaluate() yields the loss followed by the compiled metric (accuracy).
loss, acc = model.evaluate(x=X_val, y=y_val)
print(f"✅ Validation Accuracy: {acc:.2f}")

Epoch 1/10
[1m36995/36995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 3ms/step - accuracy: 0.9512 - loss: 0.1432 - val_accuracy: 0.9557 - val_loss: 0.1157
Epoch 2/10
[1m36995/36995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 3ms/step - accuracy: 0.9561 - loss: 0.1149 - val_accuracy: 0.9561 - val_loss: 0.1120
Epoch 3/10
[1m36995/36995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 3ms/step - accuracy: 0.9567 - loss: 0.1118 - val_accuracy: 0.9581 - val_loss: 0.1067
Epoch 4/10
[1m36995/36995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 3ms/step - accuracy: 0.9573 - loss: 0.1099 - val_accuracy: 0.9569 - val_loss: 0.1073
Epoch 5/10
[1m36995/36995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 3ms/step - accuracy: 0.9576 - loss: 0.1088 - val_accuracy: 0.9582 - val_loss: 0.1049
Epoch 6/10
[1m36995/36995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 3ms/step - accuracy: 0.9580 - loss: 0.1075 - val_accuracy: 0.9582 - val_loss:

In [None]:
def recommend_clubs_for_player_deep(player_id, top_n=5):
    """Rank every club by the trained model's predicted match score for one player.

    Reads the notebook-level ``player_df``, ``club_df`` and ``model``.

    Args:
        player_id: Value looked up in ``player_df['player_id']``; the first
            matching row supplies the player's features.
        top_n: Number of highest-scoring clubs to return.

    Returns:
        DataFrame with columns 'club_name', 'league' and 'match_score', sorted
        by 'match_score' descending and limited to ``top_n`` rows.

    Raises:
        IndexError: If ``player_id`` does not occur in ``player_df``.
    """
    # Get player row
    matches = player_df[player_df['player_id'] == player_id]
    if matches.empty:
        # Same exception type that `.iloc[0]` raises on an empty selection, but
        # with a message that names the missing id.
        raise IndexError(f"player_id {player_id!r} not found in player_df")
    player_row = matches.iloc[0]
    stats_input = player_row['stats_scaled']
    position_input = player_row['position_encoded']

    # Prepare one batch row per club: the player's features are repeated and
    # paired with each club's own features, in prepare_features() input order.
    num_clubs = len(club_df)
    X_pred = [
        np.tile(stats_input, (num_clubs, 1)),  # repeat player's stats
        np.full((num_clubs, 1), position_input),  # repeat position
        np.stack(club_df['style_scaled'].values),  # club styles
        club_df['league_encoded'].values.reshape(-1, 1)  # league
    ]

    # Predict compatibility scores
    scores = model.predict(X_pred).flatten()

    # Attach scores to just the display columns; this avoids copying the whole
    # club frame and never mutates club_df.
    ranked = club_df[['club_name', 'league']].assign(match_score=scores)

    # Return top N
    return ranked.sort_values(by='match_score', ascending=False).head(top_n)

# Recommend clubs for a single player.
# NOTE(review): the earlier comment named Kylian Mbappé (player_id = 1888), but the
# call passes player_id=1. The executed id is kept here — confirm which player is intended.
TARGET_PLAYER_ID = 1
top_clubs = recommend_clubs_for_player_deep(player_id=TARGET_PLAYER_ID, top_n=5)
top_clubs

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


Unnamed: 0,club_name,league,match_score
485,FC Ingolstadt 04,3. Liga,0.827106
314,Lausanne-Sport,CSSL,0.007455
363,CF Montréal,MLS,0.000862
100,AEK Athens,Hellas Liga,0.000826
95,Red Bulls,MLS,0.000714
