In [6]:
# === STEP 0: IMPORTS ===
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# === STEP 1: LOAD DATA ===
# Read the three cleaned CSV exports from the current working directory:
# player profiles, club profiles, and the players' search history.
player_df = pd.read_csv(filepath_or_buffer="cleaned_player_profiles.csv")
club_df = pd.read_csv(filepath_or_buffer="cleaned_club_profiles.csv")
search_df = pd.read_csv(filepath_or_buffer="cleaned_search_history.csv")

# === STEP 2: FIX VECTORS ===
def fix_and_parse_vector_column(df, column_name):
    """Parse a column of stringified vectors into float32 NumPy arrays.

    Each cell is expected to be text such as ``"[1. 0. 0.5]"`` (NumPy's
    space-separated repr) or ``"[1.0, 0.0, 0.5]"``. Brackets and commas are
    treated as separators, so any mix of spaces, newlines and commas between
    the numbers is accepted.

    The text is parsed with ``float`` instead of ``eval``: CSV content is
    external data and must not be executed, and the previous blanket
    space-to-comma rewrite produced invalid syntax for padded
    (``"1.  0."``) or already comma-separated input.

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column whose cells are vector strings.

    Returns:
        A Series aligned with ``df`` whose values are 1-D ``np.float32`` arrays.

    Raises:
        ValueError: If a token in a cell is not a valid float literal.
        AttributeError: If a cell is not a string (e.g. a missing value).
    """
    def _parse_vector(text):
        # Turn every structural character into whitespace, then split on it.
        tokens = text.replace("[", " ").replace("]", " ").replace(",", " ").split()
        return np.array([float(token) for token in tokens], dtype=np.float32)

    return df[column_name].apply(_parse_vector)

# Replace each stringified vector column with parsed NumPy arrays (one per row).
player_df['stats_vector'] = fix_and_parse_vector_column(player_df, 'stats_vector')
player_df['play_style_vector'] = fix_and_parse_vector_column(player_df, 'play_style_vector')
club_df['style_of_play'] = fix_and_parse_vector_column(club_df, 'style_of_play')
search_df['style_fit_vector'] = fix_and_parse_vector_column(search_df, 'style_fit_vector')

# === STEP 3: ENCODE CATEGORICAL FEATURES ===
# Integer-code the position and league labels. The fitted encoders are not
# kept, so the codes cannot be mapped back to labels from here on.
player_df['position_encoded'] = LabelEncoder().fit_transform(player_df['position_main'])
club_df['league_encoded'] = LabelEncoder().fit_transform(club_df['league'])

# === STEP 4: NORMALIZE VECTORS ===
# The scaler is fit on the club style vectors only; the result is stored as one
# array per row.
scaler_style = StandardScaler()
club_df['style_scaled'] = list(scaler_style.fit_transform(club_df['style_of_play'].tolist()))

# === STEP 5: EXTRACT LAST SEARCH STYLE ===
# One row per player: after sorting by timestamp, take the last entry of each
# group (GroupBy.last skips missing values per column by default).
latest_search = search_df.sort_values(by='timestamp').groupby('player_id').last().reset_index()
# Left merge keeps players that have no search history; their style_fit_vector
# is missing after the merge.
player_df = player_df.merge(latest_search[['player_id', 'style_fit_vector']], on='player_id', how='left')
# Fall back to the player's own play_style_vector whenever the merged value is
# not an array (i.e. no search row was found).
player_df['style_fit_vector'] = player_df.apply(
    lambda row: row['style_fit_vector'] if isinstance(row['style_fit_vector'], np.ndarray) else row['play_style_vector'],
    axis=1
)
# NOTE(review): this reuses the scaler fit on club style_of_play vectors, which
# assumes player vectors have the same length and feature meaning — confirm.
player_df['style_fit_scaled'] = list(scaler_style.transform(player_df['style_fit_vector'].tolist()))

# === STEP 6: GENERATE TRAINING PAIRS ===
# One positive (label 1) pair per player that has a latest-search row.
# NOTE(review): the club is drawn uniformly at random from club_df, so a
# positive label is not tied to any observed player–club interaction. Together
# with negatives being drawn from *all* players (below), the label may mostly
# encode "player has search history" — confirm this is intended.
# NOTE(review): no random seed is set in this cell, so the pairs differ per run.
positive_pairs = [
    {'player_id': row['player_id'], 'club_id': random.choice(club_df['club_id'].values), 'label': 1}
    for _, row in latest_search.iterrows()
]

player_ids = player_df['player_id'].unique()
club_ids = club_df['club_id'].unique()
negative_pairs = []
# Tracks every (player, club) pair already used, positive or negative.
existing_pairs = {(p['player_id'], p['club_id']) for p in positive_pairs}

# Rejection-sample as many negatives (label 0) as there are positives, skipping
# pairs that were already used. This loop does not terminate if fewer unused
# (player, club) combinations exist than negatives are required.
while len(negative_pairs) < len(positive_pairs):
    p_id = random.choice(player_ids)
    c_id = random.choice(club_ids)
    if (p_id, c_id) not in existing_pairs:
        negative_pairs.append({'player_id': p_id, 'club_id': c_id, 'label': 0})
        existing_pairs.add((p_id, c_id))

# === STEP 7: USE 100% OF THE DATA ===
# Shuffle all pairs (unseeded), then attach player and club features. Both
# merges use the default inner join, so a pair whose player_id or club_id has
# no match in the feature tables is dropped.
all_data = pd.DataFrame(positive_pairs + negative_pairs).sample(frac=1).reset_index(drop=True)
merged = all_data.merge(
    player_df[['player_id', 'position_encoded', 'style_fit_scaled']], on='player_id'
).merge(
    club_df[['club_id', 'league_encoded', 'style_scaled']], on='club_id'
)

# 80/20 train/validation split with a fixed seed.
train_data, val_data = train_test_split(merged, test_size=0.2, random_state=42)

# === STEP 8: PREPARE INPUTS ===
def prepare_features(df):
    """Build the four model input arrays from a merged pair DataFrame.

    Args:
        df: DataFrame with ``style_fit_scaled`` and ``style_scaled`` columns
            (one array per row) and ``position_encoded`` / ``league_encoded``
            columns (one scalar per row).

    Returns:
        A list ``[player_style, player_position, club_style, club_league]``.
        The style entries stack the per-row arrays into 2-D matrices; the
        encoded entries are reshaped into single-column 2-D arrays.
    """
    player_style_matrix = np.stack(df['style_fit_scaled'].values)
    club_style_matrix = np.stack(df['style_scaled'].values)

    position_column = df['position_encoded'].values.reshape(-1, 1)
    league_column = df['league_encoded'].values.reshape(-1, 1)

    # Order must match the order of the model's inputs.
    return [player_style_matrix, position_column, club_style_matrix, league_column]

# Materialize the model inputs (four arrays, in the order the Model expects)
# and the binary targets for both splits.
X_train = prepare_features(train_data)
X_val = prepare_features(val_data)
y_train = train_data['label'].values
y_val = val_data['label'].values

# === STEP 9: BUILD MODEL ===
# Size the embedding tables from the full player/club tables rather than the
# training split: the validation split and recommendation-time scoring can
# contain position or league codes that never occur in train_data, and any
# index >= input_dim is out of range for an Embedding layer.
num_positions = int(player_df['position_encoded'].max()) + 1
num_leagues = int(club_df['league_encoded'].max()) + 1

# Take the style-vector widths from the prepared arrays instead of
# hard-coding them, so the model follows the data.
player_style_dim = X_train[0].shape[1]
club_style_dim = X_train[2].shape[1]

player_style = layers.Input(shape=(player_style_dim,), name='player_style')
player_position = layers.Input(shape=(1,), name='player_position')
club_style = layers.Input(shape=(club_style_dim,), name='club_style')
club_league = layers.Input(shape=(1,), name='club_league')

# 4-dimensional learned embeddings for the two categorical codes; Flatten
# drops the length-1 sequence axis so they can be concatenated with the
# style vectors.
pos_embed = layers.Embedding(input_dim=num_positions, output_dim=4)(player_position)
league_embed = layers.Embedding(input_dim=num_leagues, output_dim=4)(club_league)
pos_flat = layers.Flatten()(pos_embed)
league_flat = layers.Flatten()(league_embed)

# Separate player and club towers ...
player_features = layers.Concatenate()([player_style, pos_flat])
club_features = layers.Concatenate()([club_style, league_flat])

player_dense = layers.Dense(64, activation='relu')(player_features)
club_dense = layers.Dense(64, activation='relu')(club_features)

# ... joined into a small head that outputs a match probability.
merged_features = layers.Concatenate()([player_dense, club_dense])
x = layers.Dense(64, activation='relu')(merged_features)
x = layers.Dropout(0.3)(x)
output = layers.Dense(1, activation='sigmoid')(x)

model = Model(inputs=[player_style, player_position, club_style, club_league], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# === STEP 10: TRAIN MODEL ===
# Fit for 10 epochs in mini-batches of 32, tracking the validation split.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
)

# === STEP 11: EVALUATE ===
# Predicted probabilities for the validation split, thresholded at 0.5 to get
# the hard class decisions used by the threshold-based metrics.
y_pred_probs = model.predict(X_val).flatten()
y_pred_classes = y_pred_probs > 0.5

print("\n✅ Final Evaluation")
# model.evaluate returns [loss, accuracy] for the compiled metrics.
print("Accuracy:", model.evaluate(X_val, y_val, verbose=0)[1])
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_val, y_pred_classes))
# ROC AUC is computed on the raw probabilities, not the thresholded classes.
print("ROC AUC:", roc_auc_score(y_val, y_pred_probs))

# === STEP 12: CLUB RECOMMENDATION BASED ON SEARCH STYLE ===
def recommend_clubs_for_player(player_id, top_n=5):
    """Rank every club for one player by the model's predicted match score.

    Reads the module-level ``player_df``, ``club_df`` and trained ``model``.

    Args:
        player_id: Value looked up in ``player_df['player_id']``.
        top_n: Number of highest-scoring clubs to return.

    Returns:
        DataFrame with ``club_name``, ``league`` and ``match_score`` columns,
        sorted by descending score and limited to ``top_n`` rows.

    Raises:
        IndexError: If no row in ``player_df`` has this ``player_id``.
    """
    matches = player_df[player_df['player_id'] == player_id]
    if matches.empty:
        # Same exception type the bare ``.iloc[0]`` raised, but naming the id
        # instead of reporting an out-of-bounds positional indexer.
        raise IndexError(f"player_id {player_id!r} not found in player_df")

    player = matches.iloc[0]
    style_input = player['style_fit_scaled']
    position_input = player['position_encoded']

    # Work on a copy so the score column is not written into club_df.
    clubs = club_df.copy()
    num_clubs = len(clubs)

    # Repeat the player's features once per club so every (player, club)
    # pair is scored in a single predict call.
    X_pred = [
        np.tile(style_input, (num_clubs, 1)),
        np.full((num_clubs, 1), position_input),
        np.stack(clubs['style_scaled'].values),
        clubs['league_encoded'].values.reshape(-1, 1)
    ]

    scores = model.predict(X_pred).flatten()
    clubs['match_score'] = scores
    return clubs.sort_values(by='match_score', ascending=False)[['club_name', 'league', 'match_score']].head(top_n)

# Example usage:
# top_recommendations = recommend_clubs_for_player(player_id=1888, top_n=5)
# print(top_recommendations)


Epoch 1/10
[1m775/775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.7481 - loss: 0.5287 - val_accuracy: 0.9655 - val_loss: 0.0743
Epoch 2/10
[1m775/775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9636 - loss: 0.0751 - val_accuracy: 0.9653 - val_loss: 0.0592
Epoch 3/10
[1m775/775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.9655 - loss: 0.0639 - val_accuracy: 0.9668 - val_loss: 0.0586
Epoch 4/10
[1m775/775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9652 - loss: 0.0581 - val_accuracy: 0.9655 - val_loss: 0.0614
Epoch 5/10
[1m775/775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9668 - loss: 0.0551 - val_accuracy: 0.9666 - val_loss: 0.0570
Epoch 6/10
[1m775/775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9680 - loss: 0.0546 - val_accuracy: 0.9650 - val_loss: 0.0587
Epoch 7/10
[1m775/775[0m 

In [10]:
# === STEP 0: IMPORTS ===
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# === STEP 1: LOAD DATA ===
# Read the three cleaned CSV exports from the current working directory:
# player profiles, club profiles, and the players' search history.
player_df = pd.read_csv(filepath_or_buffer="cleaned_player_profiles.csv")
club_df = pd.read_csv(filepath_or_buffer="cleaned_club_profiles.csv")
search_df = pd.read_csv(filepath_or_buffer="cleaned_search_history.csv")

# === STEP 2: FIX VECTORS ===
def fix_and_parse_vector_column(df, column_name):
    """Parse a column of stringified vectors into float32 NumPy arrays.

    Each cell is expected to be text such as ``"[1. 0. 0.5]"`` (NumPy's
    space-separated repr) or ``"[1.0, 0.0, 0.5]"``. Brackets and commas are
    treated as separators, so any mix of spaces, newlines and commas between
    the numbers is accepted.

    The text is parsed with ``float`` instead of ``eval``: CSV content is
    external data and must not be executed, and the previous blanket
    space-to-comma rewrite produced invalid syntax for padded
    (``"1.  0."``) or already comma-separated input.

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column whose cells are vector strings.

    Returns:
        A Series aligned with ``df`` whose values are 1-D ``np.float32`` arrays.

    Raises:
        ValueError: If a token in a cell is not a valid float literal.
        AttributeError: If a cell is not a string (e.g. a missing value).
    """
    def _parse_vector(text):
        # Turn every structural character into whitespace, then split on it.
        tokens = text.replace("[", " ").replace("]", " ").replace(",", " ").split()
        return np.array([float(token) for token in tokens], dtype=np.float32)

    return df[column_name].apply(_parse_vector)

# Replace each stringified vector column with parsed NumPy arrays (one per row).
player_df['stats_vector'] = fix_and_parse_vector_column(player_df, 'stats_vector')
player_df['play_style_vector'] = fix_and_parse_vector_column(player_df, 'play_style_vector')
club_df['style_of_play'] = fix_and_parse_vector_column(club_df, 'style_of_play')
search_df['style_fit_vector'] = fix_and_parse_vector_column(search_df, 'style_fit_vector')

# === STEP 3: ENCODE CATEGORICAL FEATURES ===
# Integer-code the position and league labels. The fitted encoders are not
# kept, so the codes cannot be mapped back to labels from here on.
player_df['position_encoded'] = LabelEncoder().fit_transform(player_df['position_main'])
club_df['league_encoded'] = LabelEncoder().fit_transform(club_df['league'])

# === STEP 4: NORMALIZE VECTORS ===
# The scaler is fit on the club style vectors only; the result is stored as one
# array per row.
scaler_style = StandardScaler()
club_df['style_scaled'] = list(scaler_style.fit_transform(club_df['style_of_play'].tolist()))

# === STEP 5: EXTRACT LAST SEARCH STYLE ===
# One row per player: after sorting by timestamp, take the last entry of each
# group (GroupBy.last skips missing values per column by default).
latest_search = search_df.sort_values(by='timestamp').groupby('player_id').last().reset_index()
# Left merge keeps players that have no search history; their style_fit_vector
# is missing after the merge.
player_df = player_df.merge(latest_search[['player_id', 'style_fit_vector']], on='player_id', how='left')
# Fall back to the player's own play_style_vector whenever the merged value is
# not an array (i.e. no search row was found).
player_df['style_fit_vector'] = player_df.apply(
    lambda row: row['style_fit_vector'] if isinstance(row['style_fit_vector'], np.ndarray) else row['play_style_vector'],
    axis=1
)
# NOTE(review): this reuses the scaler fit on club style_of_play vectors, which
# assumes player vectors have the same length and feature meaning — confirm.
player_df['style_fit_scaled'] = list(scaler_style.transform(player_df['style_fit_vector'].tolist()))

# === STEP 6: GENERATE TRAINING PAIRS ===
# One positive (label 1) pair per player that has a latest-search row.
# NOTE(review): the club is drawn uniformly at random from club_df, so a
# positive label is not tied to any observed player–club interaction. Together
# with negatives being drawn from *all* players (below), the label may mostly
# encode "player has search history" — confirm this is intended.
# NOTE(review): no random seed is set in this cell, so the pairs differ per run.
positive_pairs = [
    {'player_id': row['player_id'], 'club_id': random.choice(club_df['club_id'].values), 'label': 1}
    for _, row in latest_search.iterrows()
]

player_ids = player_df['player_id'].unique()
club_ids = club_df['club_id'].unique()
negative_pairs = []
# Tracks every (player, club) pair already used, positive or negative.
existing_pairs = {(p['player_id'], p['club_id']) for p in positive_pairs}

# Rejection-sample as many negatives (label 0) as there are positives, skipping
# pairs that were already used. This loop does not terminate if fewer unused
# (player, club) combinations exist than negatives are required.
while len(negative_pairs) < len(positive_pairs):
    p_id = random.choice(player_ids)
    c_id = random.choice(club_ids)
    if (p_id, c_id) not in existing_pairs:
        negative_pairs.append({'player_id': p_id, 'club_id': c_id, 'label': 0})
        existing_pairs.add((p_id, c_id))

# === STEP 7: USE 100% OF THE DATA ===
# Shuffle all pairs (unseeded), then attach player and club features. Both
# merges use the default inner join, so a pair whose player_id or club_id has
# no match in the feature tables is dropped.
all_data = pd.DataFrame(positive_pairs + negative_pairs).sample(frac=1).reset_index(drop=True)
merged = all_data.merge(
    player_df[['player_id', 'position_encoded', 'style_fit_scaled']], on='player_id'
).merge(
    club_df[['club_id', 'league_encoded', 'style_scaled']], on='club_id'
)

# 80/20 train/validation split with a fixed seed.
train_data, val_data = train_test_split(merged, test_size=0.2, random_state=42)

# === STEP 8: PREPARE INPUTS ===
def prepare_features(df):
    """Build the four model input arrays from a merged pair DataFrame.

    Args:
        df: DataFrame with ``style_fit_scaled`` and ``style_scaled`` columns
            (one array per row) and ``position_encoded`` / ``league_encoded``
            columns (one scalar per row).

    Returns:
        A list ``[player_style, player_position, club_style, club_league]``.
        The style entries stack the per-row arrays into 2-D matrices; the
        encoded entries are reshaped into single-column 2-D arrays.
    """
    player_style_matrix = np.stack(df['style_fit_scaled'].values)
    club_style_matrix = np.stack(df['style_scaled'].values)

    position_column = df['position_encoded'].values.reshape(-1, 1)
    league_column = df['league_encoded'].values.reshape(-1, 1)

    # Order must match the order of the model's inputs.
    return [player_style_matrix, position_column, club_style_matrix, league_column]

# Materialize the model inputs (four arrays, in the order the Model expects)
# and the binary targets for both splits.
X_train = prepare_features(train_data)
X_val = prepare_features(val_data)
y_train = train_data['label'].values
y_val = val_data['label'].values

# === STEP 9: BUILD MODEL ===
# Embedding tables are sized from the full player/club tables so every code
# that can appear in validation or recommendation-time scoring is in range.
num_positions = int(player_df['position_encoded'].max()) + 1
num_leagues = int(club_df['league_encoded'].max()) + 1

# Take the style-vector widths from the prepared arrays instead of
# hard-coding them, so the model follows the data.
player_style_dim = X_train[0].shape[1]
club_style_dim = X_train[2].shape[1]

player_style = layers.Input(shape=(player_style_dim,), name='player_style')
player_position = layers.Input(shape=(1,), name='player_position')
club_style = layers.Input(shape=(club_style_dim,), name='club_style')
club_league = layers.Input(shape=(1,), name='club_league')

# 4-dimensional learned embeddings for the two categorical codes; Flatten
# drops the length-1 sequence axis so they can be concatenated with the
# style vectors.
pos_embed = layers.Embedding(input_dim=num_positions, output_dim=4)(player_position)
league_embed = layers.Embedding(input_dim=num_leagues, output_dim=4)(club_league)
pos_flat = layers.Flatten()(pos_embed)
league_flat = layers.Flatten()(league_embed)

# Separate player and club towers ...
player_features = layers.Concatenate()([player_style, pos_flat])
club_features = layers.Concatenate()([club_style, league_flat])

player_dense = layers.Dense(64, activation='relu')(player_features)
club_dense = layers.Dense(64, activation='relu')(club_features)

# ... joined into a small head that outputs a match probability.
merged_features = layers.Concatenate()([player_dense, club_dense])
x = layers.Dense(64, activation='relu')(merged_features)
x = layers.Dropout(0.3)(x)
output = layers.Dense(1, activation='sigmoid')(x)

model = Model(inputs=[player_style, player_position, club_style, club_league], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# === STEP 10: TRAIN MODEL ===
# Fit for 10 epochs in mini-batches of 32, tracking the validation split.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
)

# === STEP 11: EVALUATE ===
# Predicted probabilities for the validation split, thresholded at 0.5 to get
# the hard class decisions used by the threshold-based metrics.
y_pred_probs = model.predict(X_val).flatten()
y_pred_classes = y_pred_probs > 0.5

print("\n✅ Final Evaluation")
# model.evaluate returns [loss, accuracy] for the compiled metrics.
print("Accuracy:", model.evaluate(X_val, y_val, verbose=0)[1])
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_val, y_pred_classes))
# ROC AUC is computed on the raw probabilities, not the thresholded classes.
print("ROC AUC:", roc_auc_score(y_val, y_pred_probs))

# === STEP 12: CLUB RECOMMENDATION BASED ON SEARCH STYLE ===
def recommend_clubs_for_player(player_id, top_n=5):
    """Rank every club for one player by the model's predicted match score.

    Reads the module-level ``player_df``, ``club_df`` and trained ``model``.

    Args:
        player_id: Value looked up in ``player_df['player_id']``.
        top_n: Number of highest-scoring clubs to return.

    Returns:
        DataFrame with ``club_name``, ``league`` and ``match_score`` columns,
        sorted by descending score and limited to ``top_n`` rows.

    Raises:
        IndexError: If no row in ``player_df`` has this ``player_id``.
    """
    matches = player_df[player_df['player_id'] == player_id]
    if matches.empty:
        # Same exception type the bare ``.iloc[0]`` raised, but naming the id
        # instead of reporting an out-of-bounds positional indexer.
        raise IndexError(f"player_id {player_id!r} not found in player_df")

    player = matches.iloc[0]
    style_input = player['style_fit_scaled']
    position_input = player['position_encoded']

    # Work on a copy so the score column is not written into club_df.
    clubs = club_df.copy()
    num_clubs = len(clubs)

    # Repeat the player's features once per club so every (player, club)
    # pair is scored in a single predict call.
    X_pred = [
        np.tile(style_input, (num_clubs, 1)),
        np.full((num_clubs, 1), position_input),
        np.stack(clubs['style_scaled'].values),
        clubs['league_encoded'].values.reshape(-1, 1)
    ]

    scores = model.predict(X_pred).flatten()
    clubs['match_score'] = scores
    return clubs.sort_values(by='match_score', ascending=False)[['club_name', 'league', 'match_score']].head(top_n)

# Example usage:
# top_recommendations = recommend_clubs_for_player(player_id=1888, top_n=5)
# print(top_recommendations)


Epoch 1/10
[1m768/768[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.8009 - loss: 0.4131 - val_accuracy: 0.9886 - val_loss: 0.0379
Epoch 2/10
[1m768/768[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9852 - loss: 0.0493 - val_accuracy: 0.9883 - val_loss: 0.0278
Epoch 3/10
[1m768/768[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9872 - loss: 0.0282 - val_accuracy: 0.9881 - val_loss: 0.0249
Epoch 4/10
[1m768/768[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9859 - loss: 0.0295 - val_accuracy: 0.9881 - val_loss: 0.0241
Epoch 5/10
[1m768/768[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9878 - loss: 0.0247 - val_accuracy: 0.9881 - val_loss: 0.0238
Epoch 6/10
[1m768/768[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9878 - loss: 0.0251 - val_accuracy: 0.9888 - val_loss: 0.0245
Epoch 7/10
[1m768/768[0m 

In [11]:
# Display 5 sampled players' style-fit vectors and their top 3 recommended clubs.

# Requires the trained `model` and the preprocessed `player_df` / `club_df`
# from the previous cells.
# The sample is drawn from all of player_df, so it can include players whose
# style_fit_vector is the play_style_vector fallback rather than a real search.
examples = player_df.sample(5, random_state=42)
results = []

# The club-side inputs are identical for every player, so build them once
# instead of re-stacking them on each loop iteration.
num_clubs = len(club_df)
club_style_matrix = np.stack(club_df['style_scaled'].values)
club_league_column = club_df['league_encoded'].values.reshape(-1, 1)

for _, row in examples.iterrows():
    player_id = row['player_id']
    # Fall back to a generic label when the row has no 'name' entry.
    name = row.get('name', f"Player {player_id}")
    style_vector = row['style_fit_vector']
    # Fresh copy per player so match_score is never written into club_df.
    top_clubs = club_df.copy()

    # Prepare prediction inputs: the player's features repeated once per club.
    X_pred = [
        np.tile(row['style_fit_scaled'], (num_clubs, 1)),
        np.full((num_clubs, 1), row['position_encoded']),
        club_style_matrix,
        club_league_column
    ]

    # Predict and attach scores
    scores = model.predict(X_pred, verbose=0).flatten()
    top_clubs['match_score'] = scores
    top_3 = top_clubs.sort_values(by='match_score', ascending=False).head(3)[['club_name', 'league', 'match_score']]

    results.append({
        'player_id': player_id,
        'player_name': name,
        'style_fit_vector': style_vector,
        'recommended_clubs': top_3.reset_index(drop=True)
    })

# Bare expression so the notebook displays the collected results.
results


[{'player_id': 2318,
  'player_name': 'Tasos Douvikas',
  'style_fit_vector': array([1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1.],
        dtype=float32),
  'recommended_clubs':        club_name league  match_score
  0        FC Sion   CSSL     0.999998
  1      GC Zürich   CSSL     0.999998
  2  FC Winterthur   CSSL     0.999998},
 {'player_id': 6884,
  'player_name': 'Altin Zeqiri',
  'style_fit_vector': array([0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1.],
        dtype=float32),
  'recommended_clubs':         club_name            league  match_score
  0  Lausanne-Sport              CSSL          1.0
  1     Real Madrid  LALIGA EA SPORTS          1.0
  2   FCV Dender EH     1A Pro League          1.0},
 {'player_id': 14644,
  'player_name': 'Fynn Otto',
  'style_fit_vector': array([0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        dtype=float32),
  'recommended_clubs':         club_name            league  match_score
  0     Real Madri