# Commandes de base
- python.exe -m pip install --upgrade pip
- python3 -m venv .venv
- .\.venv\Scripts\activate

### Commande 1: Mise à jour de `pip`
```sh
python.exe -m pip install --upgrade pip
```

**Avantages** :
- **Sécurité** : Les mises à jour incluent souvent des correctifs de sécurité.
- **Compatibilité** : Les nouvelles versions sont souvent mieux compatibles avec les nouvelles versions de Python et les nouveaux paquets.
- **Fonctionnalités et Stabilité** : Les mises à jour incluent de nouvelles fonctionnalités et corrections de bugs.

### Commande 2: Création d'un environnement virtuel
```sh
python3 -m venv .venv
```

**Avantages** :
- **Isolation** : Les environnements virtuels permettent d'isoler les dépendances des projets. Cela évite les conflits de versions entre les projets.
- **Facilité de gestion** : Chaque projet peut avoir ses propres dépendances spécifiques, indépendamment des autres projets.

### Commande 3: Activation de l'environnement virtuel
```sh
.\.venv\Scripts\activate
```

**Avantages** :
- **Utilisation des dépendances spécifiques** : Une fois activé, l'environnement virtuel utilise les versions de paquets installées dans cet environnement, plutôt que celles installées globalement.
- **Contexte de développement propre** : Permet de travailler dans un contexte de développement propre et spécifique au projet.

In [None]:
# Notebook navigation helper from the third-party jyquickhelper package.
# NOTE(review): presumably renders a table-of-contents menu in the notebook
# output; confirm against the jyquickhelper documentation.
from jyquickhelper import add_notebook_menu
add_notebook_menu()

# Collecte et Préparation des Données

In [None]:
import pandas as pd

# Read the transfers dataset from the local CSV file and preview its first
# five rows (the default number of rows shown by head()).
df = pd.read_csv("dataset/transfers.csv")
df.head()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Count the missing values in each column of the raw dataset.
df.isnull().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

# Traitement des valeurs nulles

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
import numpy as np

# Work on a copy so the raw DataFrame loaded from the CSV stays untouched.
data = df.copy()

# Fill missing values by propagating the previous row's value forward.
data = data.ffill()

# Normalise the flag columns to real booleans. pandas may already have parsed
# TRUE/FALSE as booleans while reading the CSV; comparing such values against
# the string 'TRUE' alone would silently turn every flag into False, so both
# the boolean True and the literal string 'TRUE' are accepted here.
# Columns are assigned with bracket syntax, which is the reliable way to
# replace a DataFrame column.
# is_loan and is_loan_end are intentionally left as loaded.
data['is_free'] = data['is_free'].apply(lambda flag: flag in (True, 'TRUE'))
data['is_retired'] = data['is_retired'].apply(lambda flag: flag in (True, 'TRUE'))

In [None]:
# Check how many missing values remain in each column after the forward fill.
data.isnull().sum()

# Choix des variables pertinentes

In [None]:
# Feature columns kept for modelling. Their order matters: it defines the
# layout of each half of the pair vectors built from this frame.
columns_of_interest = [
    'league', 'season', 'team_name', 'team_country', 'player_name', 'player_age',
    'counter_team_name', 'counter_team_country', 'transfer_fee_amnt', 'market_val_amnt', 'is_free', 'is_retired'
]

# Keep only those columns. The explicit copy makes the result an independent
# frame, so assigning encoded columns into it later cannot trigger pandas'
# SettingWithCopyWarning or write into a temporary slice.
data = data[columns_of_interest].copy()

# Encodage des variables pertinentes

In [None]:
# Replace every object-typed (string) column with integer codes, keeping one
# fitted encoder per column so the same mapping can be applied to new input.
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    le = LabelEncoder().fit(data[column])
    label_encoders[column] = le
    data[column] = le.transform(data[column])

# Création des paires afin de réduire le dataset

Nous réduisons le dataset car le temps d'exécution était trop élevé.

In [None]:
# Create pairs of data with a reduced dataset
def create_random_pairs(data, num_pairs=10000, random_state=None):
    """Draw random pairs of distinct rows and label them by team equality.

    Args:
        data: DataFrame with a ``team_name`` column; every row is used as one
            feature vector.
        num_pairs: Number of pairs to draw.
        random_state: Optional seed. When given, the draw is reproducible and
            independent of NumPy's global random state; when ``None`` the
            global ``np.random`` state is used, as before.

    Returns:
        A tuple ``(pairs, labels)``. ``pairs`` has shape
        ``(num_pairs, 2, n_columns)`` and ``labels`` has shape
        ``(num_pairs,)`` with 1 when both rows share the same ``team_name``
        and 0 otherwise.

    Raises:
        ValueError: If a pair is requested from fewer than two rows (raised
            by the underlying ``choice`` call).
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    num_samples = len(data)

    # Materialise the rows once instead of going through data.iloc four times
    # per pair.
    rows = data.to_numpy()
    team_names = data['team_name'].to_numpy()

    pairs = []
    labels = []
    for _ in range(num_pairs):
        # replace=False guarantees the two positions differ.
        i, j = rng.choice(num_samples, 2, replace=False)
        pairs.append((rows[i], rows[j]))
        # Label 1 for similar (same team_name), 0 otherwise.
        labels.append(1 if team_names[i] == team_names[j] else 0)

    return np.array(pairs), np.array(labels)

# Down-sample to at most 1000 rows to keep pair generation and training fast.
# Capping n at len(data) avoids the ValueError that sample() raises when more
# rows are requested than the frame contains.
reduced_data = data.sample(n=min(1000, len(data)), random_state=42)

# Build the labelled row pairs from the reduced frame.
pairs, labels = create_random_pairs(reduced_data)

# Séparation du dataset pour entrainement du modèle

In [None]:
# Hold out 20% of the pairs (and their labels) for evaluation; the fixed
# random_state keeps the split reproducible.
pairs_train, pairs_test, labels_train, labels_test = train_test_split(pairs, labels, test_size=0.2, random_state=42)

# Prepare the input for the model
def prepare_input(pairs):
    """Flatten each pair into a single feature vector.

    For every element of ``pairs``, its first two members are concatenated
    end to end, and the resulting vectors are stacked into one array.
    """
    flattened = [np.concatenate((pair[0], pair[1])) for pair in pairs]
    return np.array(flattened)

# Amélioration des balances entre les classes (paires)

In [None]:
# Flatten the training pairs into a feature matrix; the labels are used as is.
X_train = prepare_input(pairs_train)
y_train = labels_train

# Oversample with SMOTE to balance the two classes of the training set.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Entrainement et évaluation du modèle : RandomForest

In [None]:
# Flatten the held-out pairs with the same layout as the training matrix.
X_test = prepare_input(pairs_test)

# Fit a 100-tree random forest on the balanced training pairs.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_balanced, y_train_balanced
)

# Mean accuracy, hard predictions and positive-class probabilities on the
# held-out pairs.
accuracy = model.score(X_test, labels_test)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Report the evaluation metrics.
print("Model Accuracy:", accuracy)
print("Accuracy:", accuracy_score(labels_test, y_pred))
print("Classification Report:\n", classification_report(labels_test, y_pred))
print("ROC AUC Score:", roc_auc_score(labels_test, y_prob))

# Comparons ce qui sera saisi par l'utilisateur avec ce que le modèle a appris

In [None]:
# Function to compare user input with the model and identify likely false fields
def compare_user_input(user_input, data, model, label_encoders, columns=None):
    """Score a user-supplied record against known records and flag suspect fields.

    The encoded user record is paired with every row of ``data`` and the
    model's positive-class probability is computed for each pair. Then, one
    column at a time, the user's value is replaced by each row's value; a
    column is flagged when any such replacement beats the best unmodified
    probability.

    Args:
        user_input: Mapping of column name to raw value. Columns missing from
            the mapping default to ``''`` when an encoder exists for them and
            to ``0`` otherwise; extra keys are ignored.
        data: DataFrame of already-encoded reference rows containing at least
            the feature columns.
        model: Fitted classifier exposing ``predict_proba``.
        label_encoders: Mapping of column name to a fitted encoder exposing
            ``classes_`` and ``transform``. It is not modified.
        columns: Feature columns, in the order used to train ``model``.
            Defaults to the module-level ``columns_of_interest``.

    Returns:
        ``(truth_percentage, likely_false_fields)``: the highest pair
        probability expressed as a percentage, and the flagged column names.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    if columns is None:
        columns = columns_of_interest
    columns = list(columns)

    # Encode the user input in feature order.
    encoded_values = []
    for column in columns:
        encoder = label_encoders.get(column)
        if encoder is None:
            encoded_values.append(user_input.get(column, 0))
            continue
        raw_value = user_input.get(column, '')
        if raw_value in encoder.classes_:
            encoded_values.append(encoder.transform([raw_value])[0])
        else:
            # Unseen value: use the first code past the known classes without
            # appending to the shared encoder, which would leak test values
            # into encoders that may be reused or persisted later.
            encoded_values.append(len(encoder.classes_))
    user_input_array = np.array(encoded_values)

    # Select the feature columns by name so extra columns in `data` cannot
    # shift the layout expected by the model.
    rows = data[columns].to_numpy()
    if len(rows) == 0:
        raise ValueError("data must contain at least one row to compare against")

    # One pair per reference row: [user features | row features].
    base_pairs = np.hstack((np.tile(user_input_array, (len(rows), 1)), rows))

    # Probability of being similar (label 1) for every pair.
    probabilities = model.predict_proba(base_pairs)[:, 1]
    max_probability = np.max(probabilities)
    truth_percentage = max_probability * 100  # Convert to percentage

    # A field is suspect when borrowing its value from some reference row
    # raises the similarity above the best unmodified score. All rows are
    # scored in a single predict_proba call per column.
    likely_false_fields = []
    for position, column in enumerate(columns):
        modified_pairs = base_pairs.copy()
        modified_pairs[:, position] = rows[:, position]
        modified_probabilities = model.predict_proba(modified_pairs)[:, 1]
        if np.any(modified_probabilities > max_probability):
            likely_false_fields.append(column)

    return truth_percentage, likely_false_fields


# Test

In [None]:
# Example user input.
# NOTE(review): several fields (team_name, team_country, counter_team_name,
# counter_team_country) hold values that look like person names; presumably a
# deliberately implausible record to exercise the checker. Confirm intent.
# 'player_nation' is not part of columns_of_interest, so compare_user_input
# never reads it.
user_input = {
    'league': 'GB1',
    'season': 2009,
    'team_name': 'Abdoulaye-ismael KOULIBALY',
    'team_country': 'Lindsay CENESCART MARSEILLE',
    'player_name': 'TCHATHCOU SINKAM Wilfried',
    'player_age': 24.0,
    'player_nation': 'Joan Cindy MIKONGO OUAMBO',
    'counter_team_name': 'Guillaume WALES',
    'counter_team_country': 'Gilchrist DONHISSOU',
    'transfer_fee_amnt': 94000000.0,
    'market_val_amnt': 45000000.0,
    'is_free': False,
    'is_retired': False
}

# Compare the user input against the reduced reference dataset with the
# trained model and print the resulting score and the flagged fields.
truth_percentage, likely_false_fields = compare_user_input(user_input, reduced_data, model, label_encoders)
print("Truth Percentage:", truth_percentage)
print("Likely False Fields:", likely_false_fields)

# Entrainement et Evaluation de la Régression Logistique

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Train a logistic regression model. The features mix very different scales
# (transfer fees next to small integer label codes), which hinders solver
# convergence and penalises this model in the comparison, so the inputs are
# standardised first. The scaler is fitted inside the pipeline on the training
# data only and reapplied automatically at prediction time.
logistic_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=42),
)
logistic_model.fit(X_train_balanced, y_train_balanced)

# Evaluate the model on the held-out pairs.
logistic_accuracy = logistic_model.score(X_test, labels_test)
print("Logistic Model Accuracy:", logistic_accuracy)

# Hard predictions and positive-class probabilities.
logistic_y_pred = logistic_model.predict(X_test)
logistic_y_prob = logistic_model.predict_proba(X_test)[:, 1]

# Report the evaluation metrics.
print("Logistic Accuracy:", accuracy_score(labels_test, logistic_y_pred))
print("Logistic Classification Report:\n", classification_report(labels_test, logistic_y_pred))
print("Logistic ROC AUC Score:", roc_auc_score(labels_test, logistic_y_prob))

# Entrainement et Evaluation de la Régression Linéaire

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Binarizer

# Fit an ordinary least-squares regressor on the 0/1 labels of the balanced
# training pairs.
linear_model = LinearRegression().fit(X_train_balanced, y_train_balanced)

# Continuous outputs for the held-out pairs.
y_pred_continuous = linear_model.predict(X_test)

# Turn the continuous outputs into class labels: values above 0.5 become 1,
# everything else 0.
binarizer = Binarizer(threshold=0.5)
y_pred_binary = binarizer.fit_transform(y_pred_continuous.reshape(-1, 1)).ravel()

# The raw regression outputs serve as ranking scores for ROC AUC. They are not
# probabilities: a linear regression does not bound them to [0, 1].
y_prob = y_pred_continuous

# Report the evaluation metrics.
print("Linear Regression Model Accuracy:", accuracy_score(labels_test, y_pred_binary))
print("Linear Regression Classification Report:\n", classification_report(labels_test, y_pred_binary))
print("Linear Regression ROC AUC Score:", roc_auc_score(labels_test, y_prob))

# Entrainement et Evaluation du Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a 100-stage gradient boosting classifier on the balanced training pairs.
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(
    X_train_balanced, y_train_balanced
)

# Mean accuracy, hard predictions and positive-class probabilities on the
# held-out pairs.
gb_accuracy = gb_model.score(X_test, labels_test)
gb_y_pred = gb_model.predict(X_test)
gb_y_prob = gb_model.predict_proba(X_test)[:, 1]

# Report the evaluation metrics.
print("Gradient Boosting Model Accuracy:", gb_accuracy)
print("Gradient Boosting Accuracy:", accuracy_score(labels_test, gb_y_pred))
print("Gradient Boosting Classification Report:\n", classification_report(labels_test, gb_y_pred))
print("Gradient Boosting ROC AUC Score:", roc_auc_score(labels_test, gb_y_prob))

# Entrainement et Evaluation du SVM (Support Vector Machine)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Train an SVM. The default RBF kernel is distance-based, so features on very
# different scales (transfer fees next to small integer label codes) let the
# largest column dominate; the inputs are therefore standardised first. The
# scaler is fitted inside the pipeline on the training data only and reapplied
# automatically at prediction time.
svm_model = make_pipeline(
    StandardScaler(),
    SVC(probability=True, random_state=42),
)
svm_model.fit(X_train_balanced, y_train_balanced)

# Evaluate the model on the held-out pairs.
svm_accuracy = svm_model.score(X_test, labels_test)
print("SVM Model Accuracy:", svm_accuracy)

# Hard predictions and positive-class probabilities.
svm_y_pred = svm_model.predict(X_test)
svm_y_prob = svm_model.predict_proba(X_test)[:, 1]

# Report the evaluation metrics.
print("SVM Accuracy:", accuracy_score(labels_test, svm_y_pred))
print("SVM Classification Report:\n", classification_report(labels_test, svm_y_pred))
print("SVM ROC AUC Score:", roc_auc_score(labels_test, svm_y_prob))

# Conclusion générale

Au vu de ces différents modèles, nous pouvons conserver uniquement le modèle RandomForest, qui présente de meilleures caractéristiques par rapport aux autres.

# Enregistrement et nouveau test en local

In [None]:
# Persist the selected model together with its label encoders.
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing into it.
os.makedirs('model', exist_ok=True)

# Save the model.
joblib.dump(model, 'model/best_model.pkl')

# Save the encoders.
joblib.dump(label_encoders, 'model/label_encoders.pkl')

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Keep a fresh copy of the raw data available in this cell.
data = df.copy()

# Feature columns, in the order the model was trained on.
columns_of_interest = [
    'league', 'season', 'team_name', 'team_country', 'player_name', 'player_age',
    'counter_team_name', 'counter_team_country', 'transfer_fee_amnt', 'market_val_amnt', 'is_free', 'is_retired'
]

# Load the encoders that were persisted next to the model instead of fitting
# new ones on the raw frame: encoders refit on un-imputed data are not
# guaranteed to reproduce the codes the model was trained with.
# NOTE: joblib files are pickles; only load files from a trusted source.
label_encoders = joblib.load('model/label_encoders.pkl')

# Load the model.
model = joblib.load('model/best_model.pkl')

# Fonction de comparaison des entrées utilisateur
def compare_user_input(user_input, data, model, label_encoders, columns=None):
    """Score a user-supplied record against known records and flag suspect fields.

    The encoded user record is paired with every row of ``data`` and the
    model's positive-class probability is computed for each pair. Then, one
    column at a time, the user's value is replaced by each row's value; a
    column is flagged when any such replacement beats the best unmodified
    probability.

    Args:
        user_input: Mapping of column name to raw value. Columns missing from
            the mapping default to ``''`` when an encoder exists for them and
            to ``0`` otherwise; extra keys are ignored.
        data: DataFrame of already-encoded reference rows containing at least
            the feature columns.
        model: Fitted classifier exposing ``predict_proba``.
        label_encoders: Mapping of column name to a fitted encoder exposing
            ``classes_`` and ``transform``. It is not modified.
        columns: Feature columns, in the order used to train ``model``.
            Defaults to the module-level ``columns_of_interest``.

    Returns:
        ``(truth_percentage, likely_false_fields)``: the highest pair
        probability expressed as a percentage, and the flagged column names.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    if columns is None:
        columns = columns_of_interest
    columns = list(columns)

    # Encode the user input in feature order.
    encoded_values = []
    for column in columns:
        encoder = label_encoders.get(column)
        if encoder is None:
            encoded_values.append(user_input.get(column, 0))
            continue
        raw_value = user_input.get(column, '')
        if raw_value in encoder.classes_:
            encoded_values.append(encoder.transform([raw_value])[0])
        else:
            # Unseen value: use the first code past the known classes without
            # appending to the shared encoder, which would leak test values
            # into encoders that may be reused or persisted later.
            encoded_values.append(len(encoder.classes_))
    user_input_array = np.array(encoded_values)

    # Select the feature columns by name so extra columns in `data` cannot
    # shift the layout expected by the model.
    rows = data[columns].to_numpy()
    if len(rows) == 0:
        raise ValueError("data must contain at least one row to compare against")

    # One pair per reference row: [user features | row features].
    base_pairs = np.hstack((np.tile(user_input_array, (len(rows), 1)), rows))

    # Probability of being similar (label 1) for every pair.
    probabilities = model.predict_proba(base_pairs)[:, 1]
    max_probability = np.max(probabilities)
    truth_percentage = max_probability * 100  # Convert to percentage

    # A field is suspect when borrowing its value from some reference row
    # raises the similarity above the best unmodified score. All rows are
    # scored in a single predict_proba call per column.
    likely_false_fields = []
    for position, column in enumerate(columns):
        modified_pairs = base_pairs.copy()
        modified_pairs[:, position] = rows[:, position]
        modified_probabilities = model.predict_proba(modified_pairs)[:, 1]
        if np.any(modified_probabilities > max_probability):
            likely_false_fields.append(column)

    return truth_percentage, likely_false_fields

# Example user input describing a single transfer record.
# 'player_nation' is not part of columns_of_interest, so compare_user_input
# never reads it.
user_input = {
    'league': 'GB1',
    'season': 2009,
    'team_name': 'Manchester United',
    'team_country': 'England',
    'player_name': 'Cristiano Ronaldo',
    'player_age': 24.0,
    'player_nation': 'Portugal',
    'counter_team_name': 'Real Madrid',
    'counter_team_country': 'Spain',
    'transfer_fee_amnt': 94000000.0,
    'market_val_amnt': 45000000.0,
    'is_free': False,
    'is_retired': False
}

# Compare the user input with the reloaded model and print the results.
# NOTE(review): the reference rows passed here are `reduced_data`, built in an
# earlier cell, not the `data` frame created in this cell; confirm this is
# intended.
truth_percentage, likely_false_fields = compare_user_input(user_input, reduced_data, model, label_encoders)
print("Truth Percentage:", truth_percentage)
print("Likely False Fields:", likely_false_fields)
