In [39]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.compose import ColumnTransformer
import time

In [51]:
# Load the training set, the test set and the sample submission (in that order).
# sample_submission.csv contains random numbers as placeholder predictions.
# Note for later: do not write the index into the final results file.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/X_train.csv',
        '../data/X_test.csv',
        '../data/sample_submission.csv',
    )
)

#task 1.1

In [52]:
#Make sure you do not have the same initial positions in any two sets

# Position columns of the three bodies; velocities are deliberately not compared.
coords = ["x_1", "y_1", "x_2", "y_2", "x_3", "y_3"]

# Keep the t == 0 rows, then drop rows whose six position values are all zero
# (presumably padding rows rather than real initial conditions -- TODO confirm).
df_t0 = (
    train.loc[train["t"].eq(0)]
    .copy()
    .loc[lambda rows: ~(rows[coords] == 0).all(axis=1)]
)

# keep=False flags every member of a duplicated group, not only the repeats.
duplicates = df_t0.loc[df_t0.duplicated(subset=coords, keep=False)]

print(duplicates)  # an empty frame means no initial position appears twice


Empty DataFrame
Columns: [t, x_1, y_1, v_x_1, v_y_1, x_2, y_2, v_x_2, v_y_2, x_3, y_3, v_x_3, v_y_3, Id]
Index: []


In [53]:
# TODO: consider adding split methods other than "random"
# Split the training data into a training set and a validation set
def train_validation_split(df, train_size=0.8, validation_size=0.2, method="random",
                           traj_size=257, random_state=None):
    """Split a frame of stacked trajectories into train and validation frames.

    The frame is treated as consecutive blocks of ``traj_size`` rows, one block
    per trajectory. Whole trajectories (never individual rows) are assigned to
    one of the two outputs, so a trajectory cannot be shared between them.
    Trailing rows that do not fill a complete block are left out of both outputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Trajectories stacked one after the other, ``traj_size`` rows each.
    train_size, validation_size : float
        Fractions of the trajectories assigned to each output. Each count is
        truncated with ``int()``, so a few trajectories may be left unused.
    method : str
        Splitting strategy. Only ``"random"`` is implemented.
    traj_size : int
        Number of rows per trajectory.
    random_state : int or None
        Seed for a dedicated random generator. When ``None`` the global
        ``np.random`` state is used, so ``np.random.seed`` still controls it.

    Returns
    -------
    (train_split, validation_split) : tuple of pandas.DataFrame
        Rows keep their original index labels; trajectories appear in shuffled
        order, and rows inside a trajectory keep their original order.

    Raises
    ------
    ValueError
        If ``method`` is unknown, ``traj_size`` is not positive, or the two
        fractions require more trajectories than are available.
    """
    if method != "random":
        # Previously an unknown method fell through and silently returned None.
        raise ValueError(f"Unknown split method: {method!r} (only 'random' is implemented)")
    if traj_size <= 0:
        raise ValueError("traj_size must be a positive integer")

    n_traj = len(df) // traj_size  # total number of complete trajectories
    n_train = int(train_size * n_traj)
    n_val = int(validation_size * n_traj)

    if n_train + n_val > n_traj:
        raise ValueError("train_size + validation_size dépasse 1.0")

    traj_indices = np.arange(n_traj)
    if random_state is None:
        np.random.shuffle(traj_indices)
    else:
        np.random.default_rng(random_state).shuffle(traj_indices)

    offsets_in_traj = np.arange(traj_size)

    def _rows_of(trajectories):
        # Row positions of every selected trajectory, gathered with a single
        # positional lookup; this also works when `trajectories` is empty.
        positions = (trajectories[:, None] * traj_size + offsets_in_traj).ravel()
        return df.iloc[positions]

    train_split = _rows_of(traj_indices[:n_train])
    validation_split = _rows_of(traj_indices[n_train:n_train + n_val])

    return train_split, validation_split

In [54]:
# Split ONCE and unpack both halves. train_validation_split reshuffles the
# trajectories on every call, so taking [0] from one call and [1] from another
# mixes two different shuffles and lets the same trajectory land in both the
# training and the validation set (leakage).
np.random.seed(42)  # fixed seed so the split is reproducible after a kernel restart
train_after_split, validation_after_split = train_validation_split(train)

In [59]:
# Sanity check: the row count should be a multiple of 257 (rows per trajectory).
train_after_split.shape

(1028000, 14)

In [61]:
# Sanity check: the row count should be a multiple of 257 (rows per trajectory).
validation_after_split.shape

(257000, 14)

In [63]:
# 260 rows = one full 257-row trajectory plus the first rows of the next one,
# which makes the boundary between two shuffled trajectories visible.
train_after_split.head(260)

Unnamed: 0,t,x_1,y_1,v_x_1,v_y_1,x_2,y_2,v_x_2,v_y_2,x_3,y_3,v_x_3,v_y_3,Id
979941,0.000000,1.000000,0.000000,0.000000,0.000000,-0.435092,0.585855,0.000000,0.000000,-0.564908,-0.585855,0.000000,0.000000,979941
979942,0.039062,0.999450,0.000024,-0.028166,0.001240,-0.434859,0.585189,0.011961,-0.034103,-0.564591,-0.585213,0.016206,0.032863,979942
979943,0.078125,0.997799,0.000097,-0.056408,0.002481,-0.434158,0.583189,0.023946,-0.068340,-0.563641,-0.583286,0.032462,0.065859,979943
979944,0.117188,0.995041,0.000218,-0.084800,0.003724,-0.432987,0.579847,0.035980,-0.102847,-0.562054,-0.580065,0.048821,0.099122,979944
979945,0.156250,0.991171,0.000388,-0.113423,0.004971,-0.431346,0.575149,0.048087,-0.137766,-0.559825,-0.575537,0.065335,0.132794,979945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980196,9.960940,0.027349,-0.038330,1.566768,0.150920,0.320721,-0.460265,-0.502584,-0.622000,-0.348071,0.498595,-1.064184,0.471080,980196
980197,10.000000,0.089106,-0.033636,1.593018,0.082450,0.299225,-0.481590,-0.595043,-0.466236,-0.388331,0.515226,-0.997976,0.383786,980197
63993,0.000000,1.000000,0.000000,0.000000,0.000000,-0.422640,0.856977,0.000000,0.000000,-0.577360,-0.856977,0.000000,0.000000,63993
63994,0.039062,0.999555,0.000030,-0.022789,0.001521,-0.422426,0.856578,0.010948,-0.020450,-0.577129,-0.856607,0.011841,0.018929,63994


In [65]:
# 260 rows = one full 257-row trajectory plus the first rows of the next one,
# which makes the boundary between two shuffled trajectories visible.
validation_after_split.head(260)

Unnamed: 0,t,x_1,y_1,v_x_1,v_y_1,x_2,y_2,v_x_2,v_y_2,x_3,y_3,v_x_3,v_y_3,Id
1051387,0.000000,1.000000,0.000000,0.000000,0.000000,-0.203029,0.805403,0.000000,0.000000,-0.796971,-0.805403,0.000000,0.000000,1051387
1051388,0.039062,0.999518,0.000122,-0.024688,0.006251,-0.202816,0.804958,0.010907,-0.022810,-0.796702,-0.805080,0.013781,0.016559,1051388
1051389,0.078125,0.998071,0.000489,-0.049430,0.012518,-0.202177,0.803621,0.021842,-0.045664,-0.795894,-0.804109,0.027588,0.033146,1051389
1051390,0.117188,0.995655,0.001100,-0.074279,0.018818,-0.201109,0.801389,0.032835,-0.068607,-0.794546,-0.802490,0.041444,0.049790,1051390
1051391,0.156250,0.992265,0.001959,-0.099291,0.025167,-0.199610,0.798259,0.043915,-0.091685,-0.792655,-0.800218,0.055376,0.066518,1051391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051642,9.960940,-0.200358,0.428022,-0.076227,-0.269460,0.517997,0.042603,1.148437,0.661187,-0.317640,-0.470625,-1.072210,-0.391727,1051642
1051643,10.000000,-0.202464,0.416072,-0.032155,-0.341368,0.561218,0.068553,1.065608,0.667261,-0.358754,-0.484625,-1.033453,-0.325893,1051643
76843,0.000000,1.000000,0.000000,0.000000,0.000000,-0.443024,0.362429,0.000000,0.000000,-0.556976,-0.362429,0.000000,0.000000,76843
76844,0.039062,0.999375,0.000016,-0.032021,0.000832,-0.442910,0.360943,0.005824,-0.076162,-0.556465,-0.360959,0.026197,0.075331,76844


In [67]:
# Row positions at which each trajectory starts in `train`.
# Note: `time_interval` is the number of rows per trajectory, used as the step.
time_interval = 257
number_lines = len(train)
index_begin_trajectory = np.arange(start=0, stop=number_lines, step=time_interval)
index_begin_trajectory

array([      0,     257,     514, ..., 1284229, 1284486, 1284743])

In [69]:
# Return the index of the initial point of the trajectory that contains a given row index
def get_index_initial_point(id, idx=index_begin_trajectory):
    """Return the largest trajectory-start index that is <= ``id``.

    Parameters
    ----------
    id : int
        Row index to look up. (The name shadows the built-in ``id``; it is kept
        so existing keyword calls keep working.)
    idx : array-like of int
        Candidate trajectory-start indices. They do not need to be sorted.
        The default is bound once, when this cell is executed.

    Returns
    -------
    int
        The greatest element of ``idx`` that does not exceed ``id``.

    Raises
    ------
    ValueError
        If no element of ``idx`` is <= ``id``.
    """
    starts = np.asarray(idx)
    # Vectorised comparison instead of a Python-level scan of every start index.
    candidates = starts[starts <= id]
    if candidates.size == 0:
        raise ValueError(f"No trajectory start index is <= {id}")
    initial_idx = candidates.max()
    return initial_idx

In [71]:
# Example: row 800 lies in the trajectory that starts at row 771 (= 3 * 257).
get_index_initial_point(800)

771

#task 1.2

In [73]:
def replicate_initial_position_by_block(df, block_size=257):
    """Overwrite every row's state columns with those of its block's first row.

    The frame is read as consecutive blocks of ``block_size`` rows. Inside each
    block the twelve position/velocity columns of all rows are replaced by the
    values found in the block's first row. All other columns (for example ``t``
    and ``Id``) and the index are left unchanged. A final, shorter block is
    handled the same way.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing the twelve state columns. It is not modified.
    block_size : int
        Number of rows per block (257 by default).

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``df`` with the state columns replaced.

    Raises
    ------
    ValueError
        If ``block_size`` is not positive.
    """
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer")

    coords = ["x_1", "y_1", "v_x_1", "v_y_1", "x_2", "y_2", "v_x_2", "v_y_2", "x_3", "y_3", "v_x_3", "v_y_3"]
    n = len(df)
    result = df.copy(deep=True)
    if n == 0:
        return result

    # For every row position, the position of the first row of its block.
    # Indexing the state matrix with it replicates each block's first row in
    # one vectorised step instead of one .iloc assignment per block.
    block_start = (np.arange(n) // block_size) * block_size
    result[coords] = df[coords].to_numpy()[block_start]

    return result

In [75]:
# Build a frame in which every row carries the t = 0 state of its own trajectory.
train_entry=replicate_initial_position_by_block(train_after_split)

In [77]:
# 260 rows cross the first 257-row block boundary, so the switch to the second
# trajectory's initial state should be visible in the last rows.
train_entry.head(260)

Unnamed: 0,t,x_1,y_1,v_x_1,v_y_1,x_2,y_2,v_x_2,v_y_2,x_3,y_3,v_x_3,v_y_3,Id
979941,0.000000,1.0,0.0,0.0,0.0,-0.435092,0.585855,0.0,0.0,-0.564908,-0.585855,0.0,0.0,979941
979942,0.039062,1.0,0.0,0.0,0.0,-0.435092,0.585855,0.0,0.0,-0.564908,-0.585855,0.0,0.0,979942
979943,0.078125,1.0,0.0,0.0,0.0,-0.435092,0.585855,0.0,0.0,-0.564908,-0.585855,0.0,0.0,979943
979944,0.117188,1.0,0.0,0.0,0.0,-0.435092,0.585855,0.0,0.0,-0.564908,-0.585855,0.0,0.0,979944
979945,0.156250,1.0,0.0,0.0,0.0,-0.435092,0.585855,0.0,0.0,-0.564908,-0.585855,0.0,0.0,979945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980196,9.960940,1.0,0.0,0.0,0.0,-0.435092,0.585855,0.0,0.0,-0.564908,-0.585855,0.0,0.0,980196
980197,10.000000,1.0,0.0,0.0,0.0,-0.435092,0.585855,0.0,0.0,-0.564908,-0.585855,0.0,0.0,980197
63993,0.000000,1.0,0.0,0.0,0.0,-0.422640,0.856977,0.0,0.0,-0.577360,-0.856977,0.0,0.0,63993
63994,0.039062,1.0,0.0,0.0,0.0,-0.422640,0.856977,0.0,0.0,-0.577360,-0.856977,0.0,0.0,63994


In [81]:
# Re-display the input frame to check that replicate_initial_position_by_block
# worked on a copy and left train_after_split unchanged.
train_after_split.head(260)

Unnamed: 0,t,x_1,y_1,v_x_1,v_y_1,x_2,y_2,v_x_2,v_y_2,x_3,y_3,v_x_3,v_y_3,Id
979941,0.000000,1.000000,0.000000,0.000000,0.000000,-0.435092,0.585855,0.000000,0.000000,-0.564908,-0.585855,0.000000,0.000000,979941
979942,0.039062,0.999450,0.000024,-0.028166,0.001240,-0.434859,0.585189,0.011961,-0.034103,-0.564591,-0.585213,0.016206,0.032863,979942
979943,0.078125,0.997799,0.000097,-0.056408,0.002481,-0.434158,0.583189,0.023946,-0.068340,-0.563641,-0.583286,0.032462,0.065859,979943
979944,0.117188,0.995041,0.000218,-0.084800,0.003724,-0.432987,0.579847,0.035980,-0.102847,-0.562054,-0.580065,0.048821,0.099122,979944
979945,0.156250,0.991171,0.000388,-0.113423,0.004971,-0.431346,0.575149,0.048087,-0.137766,-0.559825,-0.575537,0.065335,0.132794,979945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980196,9.960940,0.027349,-0.038330,1.566768,0.150920,0.320721,-0.460265,-0.502584,-0.622000,-0.348071,0.498595,-1.064184,0.471080,980196
980197,10.000000,0.089106,-0.033636,1.593018,0.082450,0.299225,-0.481590,-0.595043,-0.466236,-0.388331,0.515226,-0.997976,0.383786,980197
63993,0.000000,1.000000,0.000000,0.000000,0.000000,-0.422640,0.856977,0.000000,0.000000,-0.577360,-0.856977,0.000000,0.000000,63993
63994,0.039062,0.999555,0.000030,-0.022789,0.001521,-0.422426,0.856578,0.010948,-0.020450,-0.577129,-0.856607,0.011841,0.018929,63994
