In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import time
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, RidgeCV
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,root_mean_squared_error
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
import itertools

from utils import clean_data
from utils import split_trajectories
from utils import replicate_initial_position_by_block
from utils import get_n_trajectories
from utils import plot_y_yhat
from utils import add_three_body_features

In [8]:
# Load the raw CSVs: training trajectories, test queries, and the sample
# submission (a file with random numbers as predictions).
# Reminder: do not write the index into the future result file.
train, test, sample_submission = map(
    pd.read_csv,
    (
        '../data/X_train.csv',
        '../data/X_test.csv',
        '../data/sample_submission.csv',
    ),
)

In [9]:
# Pass the raw training frame through the project's clean_data helper.
train_cleaned = clean_data(train)

In [10]:
# Partition the cleaned data into train / validation / test subsets using the
# project's split_trajectories helper (returns three frames, in that order).
(
    train_after_split,
    validation_after_split,
    test_after_split,
) = split_trajectories(train_cleaned)

In [11]:
# Build the model-input frame for each subset with the project helper
# replicate_initial_position_by_block (applied in train, validation, test order).
train_entry, validation_entry, test_entry = (
    replicate_initial_position_by_block(subset)
    for subset in (train_after_split, validation_after_split, test_after_split)
)

In [13]:
# Add the engineered three-body columns to the train and validation inputs.
# Later cells select columns such as 'triangle_area' from these frames.
X_train_enriched, X_val_enriched = map(
    add_three_body_features,
    (train_entry, validation_entry),
)

In [15]:
# Model inputs: time, the x/y columns of bodies 1 and 2 from the enriched
# frames, and the engineered triangle area. The commented-out entries are other
# candidate columns that are currently disabled.
input_cols_enriched = [
    't',
    'x_1', 'y_1',
    'x_2', 'y_2',
    #'x_3', 'y_3',
    #'r_12', 'r_13', 'r_23',
    #'inv_r_12', 'inv_r_13', 'inv_r_23',
    #'r12_over_r13', 'r12_over_r23', 'r13_over_r23',
    'triangle_area',
    #'angle_1','angle_2','angle_3',
    #'d1_cm','d2_cm','d3_cm',
    #'Lz'
]

X_train_enriched_selected = X_train_enriched[input_cols_enriched]
X_val_enriched_selected   = X_val_enriched[input_cols_enriched]

# Targets: x/y of the three bodies, read from the split frames (not from the
# replicated-input frames used for the features).
target_cols = ['x_1','y_1','x_2','y_2','x_3','y_3']
y_train = train_after_split[target_cols].copy()
y_val   = validation_after_split[target_cols].copy()

# Features and targets come from different frames and are paired purely by row
# position once the scalers turn them into arrays, so fail early with a clear
# message if the helpers changed the number of rows.
# NOTE(review): this checks row counts only; it assumes the helpers preserve
# row order - confirm in utils.
assert len(X_train_enriched_selected) == len(y_train), (
    "train features and targets have different numbers of rows"
)
assert len(X_val_enriched_selected) == len(y_val), (
    "validation features and targets have different numbers of rows"
)

# Standardisation: both scalers are fitted on the training split only and then
# reused for validation, so no validation statistics leak into the model.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train_enriched_selected)
X_val_scaled   = scaler_X.transform(X_val_enriched_selected)
y_train_scaled = scaler_y.fit_transform(y_train)
# (The scaled validation targets are not needed: the error is measured in the
# original units after inverse-transforming the predictions.)

# Values of k to evaluate.
# NOTE(review): in the recorded run the RMSE was still decreasing at k = 15,
# the largest value tried, so the optimum may lie beyond this grid.
k_values = [1,2,3,5,7,10,12,15]

results = []

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k, weights='distance')
    knn.fit(X_train_scaled, y_train_scaled)

    # Predict in scaled space, then map back to the original target units.
    y_val_pred_scaled = knn.predict(X_val_scaled)
    y_val_pred = scaler_y.inverse_transform(y_val_pred_scaled)

    # Square root of the MSE pooled over all six outputs. This is kept instead
    # of root_mean_squared_error because, per the scikit-learn docs, that
    # function averages the per-output RMSEs for multi-output targets, which
    # would give a different number.
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    results.append({'k': k, 'rmse': rmse})
    print(f"k = {k}, RMSE = {rmse:.5f}")

# Best k = the entry with the smallest validation RMSE.
best_result = min(results, key=lambda x: x['rmse'])
print("\n✅ Meilleur k sur validation :", best_result['k'], "avec RMSE =", best_result['rmse'])


k = 1, RMSE = 1.17407
k = 2, RMSE = 1.15334
k = 3, RMSE = 1.15500
k = 5, RMSE = 1.12546
k = 7, RMSE = 1.06433
k = 10, RMSE = 1.00568
k = 12, RMSE = 0.97880
k = 15, RMSE = 0.94887

✅ Meilleur k sur validation : 15 avec RMSE = 0.9488651324055886


In [None]:
# Disabled experiment: this whole cell is a single string literal, so none of
# the code inside it runs. It sketches a grid search over combinations of
# optional feature groups, each evaluated with k in [15, 20].
# NOTE(review): the text inside mentions combinations of up to 3 groups, but
# range(0, min(len(groupes_optionnels), 2)+1) only yields subset sizes 0, 1, 2.
# NOTE(review): if re-enabled, it reassigns target_cols, y_train, y_val,
# k_values, results, best_result and the scalers used by the other cells.
"""Tester combinaison de variable

Les plus intéressantes après test 4,5,(4,5)
groupes = {
    0: ['t', 'x_1', 'y_1', 'x_2', 'y_2'],  # toujours présents
    1: ['r_12', 'r_13', 'r_23'],
    2: ['inv_r_12', 'inv_r_13', 'inv_r_23'],
    3: ['r12_over_r13', 'r12_over_r23', 'r13_over_r23'],
    4: ['triangle_area'],
    5: ['d1_cm', 'd2_cm', 'd3_cm']
}

# Combinaisons possibles des groupes 1 à 5, jusqu'à taille 3
groupes_optionnels = [1, 2, 3, 4, 5]
combinaisons = []
for r in range(0, min(len(groupes_optionnels), 2)+1):  # 0 à 3 groupes choisis
    for subset in itertools.combinations(groupes_optionnels, r):
        combinaisons.append(subset)

print(f"Nombre total de combinaisons testées (max 3 groupes) : {len(combinaisons)}")

# Targets
target_cols = ['x_1','y_1','x_2','y_2','x_3','y_3']
y_train = train_after_split[target_cols].copy()
y_val   = validation_after_split[target_cols].copy()

# Valeurs de k à tester
k_values = [15,20]

results = []

for combo in combinaisons:
    # Construire la liste des features à utiliser
    features = groupes[0].copy()  # toujours présents
    for g in combo:
        features += groupes[g]

    # Sélection des features
    X_train_sel = X_train_enriched[features]
    X_val_sel   = X_val_enriched[features]

    # Standardisation
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    X_train_scaled = scaler_X.fit_transform(X_train_sel)
    X_val_scaled   = scaler_X.transform(X_val_sel)
    y_train_scaled = scaler_y.fit_transform(y_train)
    y_val_scaled   = scaler_y.transform(y_val)

    # Boucle sur les valeurs de k
    for k in k_values:
        knn = KNeighborsRegressor(n_neighbors=k, weights='distance')
        knn.fit(X_train_scaled, y_train_scaled)

        y_val_pred_scaled = knn.predict(X_val_scaled)
        y_val_pred = scaler_y.inverse_transform(y_val_pred_scaled)

        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

        results.append({
            'groupes': combo,
            'features': features,
            'k': k,
            'rmse': rmse
        })
        print(f"Groupes {combo}, k={k}, RMSE={rmse:.5f}")

# Trouver le meilleur résultat
best_result = min(results, key=lambda x: x['rmse'])
print("\n✅ Meilleur modèle :")
print(f"Groupes = {best_result['groupes']}, k = {best_result['k']}, RMSE = {best_result['rmse']:.5f}")
print(f"Features utilisées : {best_result['features']}")

"""

In [None]:
# Final model: refit KNN on the training split and write predictions for the
# competition test file to 'knn_submission.csv'.
X_train_enriched = add_three_body_features(train_entry)

# The test file names the position columns x0_*/y0_*. Map them explicitly onto
# the x_*/y_* names used for training instead of relying on column position.
test_column_map = {
    't': 't',
    'x0_1': 'x_1', 'y0_1': 'y_1',
    'x0_2': 'x_2', 'y0_2': 'y_2',
    'x0_3': 'x_3', 'y0_3': 'y_3',
}
X_test = test[list(test_column_map)].rename(columns=test_column_map)
X_test_enriched  = add_three_body_features(X_test)

# Columns to use (same selection as in the validation cell; the commented-out
# entries are disabled candidates).
input_cols_enriched = [
    't',
    'x_1', 'y_1',
    'x_2', 'y_2',
    #'x_3', 'y_3',
    #'r_12', 'r_13', 'r_23',
    #'inv_r_12', 'inv_r_13', 'inv_r_23',
    #'r12_over_r13', 'r12_over_r23', 'r13_over_r23',
    'triangle_area',
    #'angle_1','angle_2','angle_3',
    #'d1_cm','d2_cm','d3_cm',
    #'Lz'
]

X_train_enriched_selected = X_train_enriched[input_cols_enriched]
X_test_enriched_selected  = X_test_enriched[input_cols_enriched]

# Targets
target_cols = ['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']
y_train = train_after_split[target_cols].copy()

# Features and targets come from different frames and are paired by row
# position only, so check the row counts before fitting.
assert len(X_train_enriched_selected) == len(y_train), (
    "train features and targets have different numbers of rows"
)

# Scalers are fitted on the training split and reused unchanged on the test set.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train_enriched_selected)
X_test_scaled  = scaler_X.transform(X_test_enriched_selected)
y_train_scaled = scaler_y.fit_transform(y_train)

# NOTE(review): k = 50 is not among the values scored in this notebook's
# validation cell (1..15) - confirm it was validated elsewhere.
knn_model = KNeighborsRegressor(n_neighbors=50, weights='distance')
knn_model.fit(X_train_scaled, y_train_scaled)

# Predict in scaled space, then map back to the original target units.
y_test_pred_scaled = knn_model.predict(X_test_scaled)
y_test_pred = scaler_y.inverse_transform(y_test_pred_scaled)

output_df = pd.DataFrame(y_test_pred, columns=target_cols)

# Key the submission on the test file's own 'Id' column when it has one, rather
# than assuming the rows are numbered 0..n-1 in file order; otherwise fall back
# to a running index.
# NOTE(review): presumably the expected submission is keyed on that Id - verify
# against sample_submission.csv.
if 'Id' in test.columns:
    submission_ids = test['Id'].to_numpy()
else:
    submission_ids = np.arange(len(output_df))
output_df.insert(0, 'Id', submission_ids)

# index=False: the pandas index must not appear in the result file.
output_df.to_csv('knn_submission.csv', index=False)
print("✅ Fichier 'knn_submission.csv' généré avec succès !")
