<a href="https://colab.research.google.com/github/PLONTZNathan/MachineLearning_Assignment1/blob/elena/DATASET_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Library
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.compose import ColumnTransformer
import time

In [2]:
# Colab only: mount Google Drive so the CSV files under MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')

# Locations of the two CSV files on the mounted drive.
file_path = '/content/drive/MyDrive/X_train.csv'
file_path_2 = '/content/drive/MyDrive/X_test.csv'

train = pd.read_csv(file_path)
test = pd.read_csv(file_path_2)

Mounted at /content/drive


**DATASET WITH ALL COLLISION ROWS REMOVED**

In [22]:
def remove_collision(df, traj_len=257, tol=1e-8):
    """Drop the collision rows from every fixed-length trajectory.

    ``df`` is read as consecutive blocks of ``traj_len`` rows. Inside a block,
    the first row whose columns (all except the last one) are all smaller than
    ``tol`` in absolute value marks the collision; that row and every later
    row of the block are discarded. Blocks without such a row are kept whole.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories. The last column is excluded from the zero test.
    traj_len : int, default 257
        Number of rows per trajectory.
    tol : float, default 1e-8
        Absolute threshold below which a value counts as zero.

    Returns
    -------
    pandas.DataFrame
        The retained rows with a fresh ``0..n-1`` index. Rows after the last
        complete block are ignored. An empty frame with the same columns is
        returned when ``df`` holds no complete block.
    """
    num_traj = len(df) // traj_len
    if num_traj == 0:
        # pd.concat([]) would raise; hand back an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)

    cleaned = []
    for i in range(num_traj):
        start = i * traj_len
        traj = df.iloc[start:start + traj_len]

        zero_mask = (np.abs(traj.iloc[:, :-1].to_numpy()) < tol).all(axis=1)

        if zero_mask.any():
            # Slice by position: label-based .loc with positional offsets is
            # only right when df carries a default 0..n-1 index.
            first_zero = int(zero_mask.argmax())
            traj = traj.iloc[:first_zero]

        cleaned.append(traj)

    return pd.concat(cleaned).reset_index(drop=True)



In [23]:
# Training set with every collision row dropped.
train_no_collisions = remove_collision(train, traj_len=257, tol=1e-8)

In [24]:
# Compare the row counts before and after dropping the collision rows.
print(train.shape)
print(train_no_collisions.shape)

(1285000, 14)
(1089790, 14)


**DATASET THAT KEEPS ONLY THE FIRST COLLISION ROW OF EACH TRAJECTORY**

In [33]:
def get_time_vector(df, traj_len=257, tol=1e-8):
    """Return the time column of the first trajectory with no all-zero row.

    ``df`` is scanned in consecutive blocks of ``traj_len`` rows. A row counts
    as all-zero when every column except the first (time) and the last (Id)
    is smaller than ``tol`` in absolute value.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories; column 0 is returned as the time vector.
    traj_len : int, default 257
        Number of rows per trajectory.
    tol : float, default 1e-8
        Absolute threshold below which a value counts as zero.

    Returns
    -------
    numpy.ndarray or None
        Column 0 of the first block without an all-zero row, or ``None`` when
        every complete block contains one.
    """
    n_blocks = len(df) // traj_len
    # Every column except the first (time) and the last (Id).
    state_values = df.iloc[:, 1:-1].to_numpy()

    for lo in range(0, n_blocks * traj_len, traj_len):
        hi = lo + traj_len
        all_zero_rows = (np.abs(state_values[lo:hi]) < tol).all(axis=1)
        if all_zero_rows.any():
            # This trajectory collides; try the next one.
            continue
        return df.iloc[lo:hi, 0].to_numpy()

    # Every trajectory collides.
    return None


In [35]:
# Time column of the first trajectory in which no all-zero row is found.
time_vec= get_time_vector(train, traj_len=257)
print(time_vec)

[ 0.         0.0390625  0.078125   0.117188   0.15625    0.195312
  0.234375   0.273438   0.3125     0.351562   0.390625   0.429688
  0.46875    0.507812   0.546875   0.585938   0.625      0.664062
  0.703125   0.742188   0.78125    0.820312   0.859375   0.898438
  0.9375     0.976562   1.01562    1.05469    1.09375    1.13281
  1.17188    1.21094    1.25       1.28906    1.32812    1.36719
  1.40625    1.44531    1.48438    1.52344    1.5625     1.60156
  1.64062    1.67969    1.71875    1.75781    1.79688    1.83594
  1.875      1.91406    1.95312    1.99219    2.03125    2.07031
  2.10938    2.14844    2.1875     2.22656    2.26562    2.30469
  2.34375    2.38281    2.42188    2.46094    2.5        2.53906
  2.57812    2.61719    2.65625    2.69531    2.73438    2.77344
  2.8125     2.85156    2.89062    2.92969    2.96875    3.00781
  3.04688    3.08594    3.125      3.16406    3.20312    3.24219
  3.28125    3.32031    3.35938    3.39844    3.4375     3.47656
  3.51562    3.55469 

In [None]:
import numpy as np
import pandas as pd

def keep_first_zero_time(df, time_vec, traj_len=257, tol=1e-8):
    """Cut each trajectory after its first all-zero row and restore that row's time.

    ``df`` is processed in consecutive blocks of ``traj_len`` rows. In a block
    that contains a row whose columns (all except the last, Id) are all below
    ``tol`` in absolute value, the first such row gets ``time_vec[k]`` written
    into column 0 (``k`` being its position inside the block) and the rows
    after it are dropped. Other blocks are kept unchanged. ``df`` itself is
    not modified because every block is copied first.

    NOTE: a later cell redefines ``keep_first_zero_time`` with a different
    signature; whichever definition was executed last is the one in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories; column 0 is the time column.
    time_vec : sequence of float
        Time value for each position inside a trajectory.
    traj_len : int, default 257
        Number of rows per trajectory.
    tol : float, default 1e-8
        Absolute threshold below which a value counts as zero.

    Returns
    -------
    pandas.DataFrame
        The retained rows with a fresh ``0..n-1`` index.
    """
    n_blocks = len(df) // traj_len
    pieces = []

    for offset in range(0, n_blocks * traj_len, traj_len):
        # Work on a copy so the caller's frame stays untouched.
        block = df.iloc[offset:offset + traj_len].copy()

        # Collision mask over every column except Id.
        is_zero_row = np.all(np.abs(block.iloc[:, :-1].values) < tol, axis=1)

        if is_zero_row.any():
            hit = int(is_zero_row.argmax())   # position inside the block
            # Replace the time entry with the value taken from time_vec ...
            block.iat[hit, 0] = time_vec[hit]
            # ... and keep only the rows up to and including that one.
            block = block.iloc[:hit + 1]

        pieces.append(block)

    return pd.concat(pieces).reset_index(drop=True)


In [46]:
def keep_first_zero_time(df, traj_len=257, tol=1e-8, time_vec=None):
    """Cut each trajectory after its first all-zero row and restore that row's time.

    ``df`` is processed in consecutive blocks of ``traj_len`` rows. In a block
    that contains a row whose columns (all except the last, Id) are all below
    ``tol`` in absolute value, the rows after the first such row are dropped
    and the time entry (column 0) of that row is replaced by the reference
    time for its position inside the block. Other blocks are kept unchanged.

    The previous version read the replacement value from the trajectory's own
    time column, so the assignment wrote the stored value back onto itself and
    the collision row kept its zeroed time.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories; column 0 is the time column. ``df`` is not
        modified.
    traj_len : int, default 257
        Number of rows per trajectory.
    tol : float, default 1e-8
        Absolute threshold below which a value counts as zero.
    time_vec : sequence of float, optional
        Reference time for each position inside a trajectory. When omitted,
        the time column of the first block without an all-zero row is used.
        This assumes all trajectories share one sampling grid -- TODO confirm
        for the dataset at hand. If no such block exists, the stored time
        values are left as they are.

    Returns
    -------
    pandas.DataFrame
        The retained rows with a fresh ``0..n-1`` index. An empty frame with
        the same columns is returned when ``df`` holds no complete block.

    Raises
    ------
    ValueError
        If ``time_vec`` is given but has fewer than ``traj_len`` entries.
    """
    num_traj = len(df) // traj_len
    if num_traj == 0:
        # pd.concat([]) would raise; hand back an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)

    # Pass 1: position of the first all-zero row in every block (None if absent).
    first_zero = []
    for i in range(num_traj):
        start = i * traj_len
        feats = df.iloc[start:start + traj_len, :-1].to_numpy()
        zero_mask = (np.abs(feats) < tol).all(axis=1)
        first_zero.append(int(zero_mask.argmax()) if zero_mask.any() else None)

    if time_vec is None:
        # Borrow the time grid from the first trajectory that never collides.
        ref = next((i for i, k in enumerate(first_zero) if k is None), None)
        if ref is not None:
            ref_start = ref * traj_len
            time_vec = df.iloc[ref_start:ref_start + traj_len, 0].to_numpy()
    else:
        time_vec = np.asarray(time_vec).ravel()
        if time_vec.shape[0] < traj_len:
            raise ValueError(
                "time_vec must provide at least traj_len entries "
                f"(got {time_vec.shape[0]}, need {traj_len})"
            )

    # Pass 2: truncate the colliding blocks and patch their last time entry.
    cleaned = []
    for i, k in enumerate(first_zero):
        start = i * traj_len
        if k is None:
            cleaned.append(df.iloc[start:start + traj_len])
            continue

        # Copy before writing so the caller's frame is never touched.
        traj = df.iloc[start:start + k + 1].copy()
        if time_vec is not None:
            traj.iat[k, 0] = time_vec[k]
        cleaned.append(traj)

    return pd.concat(cleaned).reset_index(drop=True)


In [47]:
# Training set truncated after the first collision row of each trajectory.
train_first_row_time = keep_first_zero_time(train, traj_len=257, tol=1e-8)

In [48]:
# Row count after keeping only the rows up to the first collision row of each trajectory.
print(train_first_row_time.shape)

(1090737, 14)


In [58]:
def get_colliding_trajectory_with_row(df, traj_len=257, tol=1e-8):
    """Find the first trajectory that contains an all-zero row.

    ``df`` is scanned in consecutive blocks of ``traj_len`` rows. A row counts
    as all-zero when every column except the last (Id) is smaller than ``tol``
    in absolute value.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories.
    traj_len : int, default 257
        Number of rows per trajectory.
    tol : float, default 1e-8
        Absolute threshold below which a value counts as zero.

    Returns
    -------
    tuple
        ``(traj, traj_id, first_zero_rel, first_zero_abs)``: the matching block
        re-indexed from 0, its block number, the position of its first
        all-zero row inside the block, and that row's position in ``df``.
        ``(None, None, None, None)`` when no block has an all-zero row.
    """
    n_blocks = len(df) // traj_len
    # Every column except the last one (Id).
    values = df.iloc[:, :-1].to_numpy()

    for block_no in range(n_blocks):
        lo = block_no * traj_len
        hi = lo + traj_len

        all_zero_rows = (np.abs(values[lo:hi]) < tol).all(axis=1)
        if not all_zero_rows.any():
            continue

        # At least one row is entirely zero: report the first one.
        rel_pos = int(all_zero_rows.argmax())   # position inside the block
        abs_pos = lo + rel_pos                  # position inside df
        block = df.iloc[lo:hi].reset_index(drop=True)
        return block, block_no, rel_pos, abs_pos

    return None, None, None, None



In [60]:
# First colliding trajectory of the training set and where its first all-zero row sits.
(
    traj,
    traj_id,
    first_zero_rel,
    first_zero_abs,
) = get_colliding_trajectory_with_row(train, traj_len=257, tol=1e-8)

In [61]:
# Position in train of the first all-zero row that was found.
print(first_zero_abs)

519


In [65]:
# Raw rows surrounding index 519, the first all-zero row reported above.
print(train.iloc[500:531])



             t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
500   9.492190 -6.348590  3.055122 -0.539981  0.291640  3.203337 -1.388399   
501   9.531250 -6.369671  3.066509 -0.539353  0.291338  3.168440 -1.413876   
502   9.570310 -6.390727  3.077883 -0.538729  0.291038  3.138336 -1.465591   
503   9.609380 -6.411759  3.089246 -0.538109  0.290739  3.139067 -1.548962   
504   9.648440 -6.432767  3.100597 -0.537494  0.290443  3.209798 -1.619296   
505   9.687500 -6.453751  3.111937 -0.536882  0.290148  3.296349 -1.622997   
506   9.726560 -6.474711  3.123265 -0.536275  0.289855  3.356234 -1.595250   
507   9.765620 -6.495647  3.134582 -0.535671  0.289564  3.391998 -1.559624   
508   9.804690 -6.516560  3.145887 -0.535072  0.289274  3.409308 -1.524644   
509   9.843750 -6.537450  3.157181 -0.534477  0.288987  3.412052 -1.494181   
510   9.882810 -6.558316  3.168464 -0.533886  0.288701  3.402849 -1.470691   
511   9.921880 -6.579160  3.179736 -0.533299  0.288417  3.383651

In [66]:
# Same row window in the frame that keeps the first collision row.
# NOTE: train_first_row is only assigned in a later cell of this notebook (the one
# calling remove_collision_keep_first_zero); run that cell first, otherwise this
# line raises NameError on a fresh top-to-bottom run.
print(train_first_row.iloc[500:531])


             t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
500   9.492190 -6.348590  3.055122 -0.539981  0.291640  3.203337 -1.388399   
501   9.531250 -6.369671  3.066509 -0.539353  0.291338  3.168440 -1.413876   
502   9.570310 -6.390727  3.077883 -0.538729  0.291038  3.138336 -1.465591   
503   9.609380 -6.411759  3.089246 -0.538109  0.290739  3.139067 -1.548962   
504   9.648440 -6.432767  3.100597 -0.537494  0.290443  3.209798 -1.619296   
505   9.687500 -6.453751  3.111937 -0.536882  0.290148  3.296349 -1.622997   
506   9.726560 -6.474711  3.123265 -0.536275  0.289855  3.356234 -1.595250   
507   9.765620 -6.495647  3.134582 -0.535671  0.289564  3.391998 -1.559624   
508   9.804690 -6.516560  3.145887 -0.535072  0.289274  3.409308 -1.524644   
509   9.843750 -6.537450  3.157181 -0.534477  0.288987  3.412052 -1.494181   
510   9.882810 -6.558316  3.168464 -0.533886  0.288701  3.402849 -1.470691   
511   9.921880 -6.579160  3.179736 -0.533299  0.288417  3.383651

In [67]:
# Same row window in the frame produced by keep_first_zero_time.
print(train_first_row_time.iloc[500:531])

             t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
500   9.492190 -6.348590  3.055122 -0.539981  0.291640  3.203337 -1.388399   
501   9.531250 -6.369671  3.066509 -0.539353  0.291338  3.168440 -1.413876   
502   9.570310 -6.390727  3.077883 -0.538729  0.291038  3.138336 -1.465591   
503   9.609380 -6.411759  3.089246 -0.538109  0.290739  3.139067 -1.548962   
504   9.648440 -6.432767  3.100597 -0.537494  0.290443  3.209798 -1.619296   
505   9.687500 -6.453751  3.111937 -0.536882  0.290148  3.296349 -1.622997   
506   9.726560 -6.474711  3.123265 -0.536275  0.289855  3.356234 -1.595250   
507   9.765620 -6.495647  3.134582 -0.535671  0.289564  3.391998 -1.559624   
508   9.804690 -6.516560  3.145887 -0.535072  0.289274  3.409308 -1.524644   
509   9.843750 -6.537450  3.157181 -0.534477  0.288987  3.412052 -1.494181   
510   9.882810 -6.558316  3.168464 -0.533886  0.288701  3.402849 -1.470691   
511   9.921880 -6.579160  3.179736 -0.533299  0.288417  3.383651

In [52]:
# Show the colliding trajectory returned by get_colliding_trajectory_with_row.
# NOTE(review): the stored output below starts with "(", which suggests it was
# produced while traj still held a tuple -- presumably from an earlier run;
# re-run this cell to refresh it.
print(traj)

(            t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
0    0.000000  1.000000  0.000000  0.000000  0.000000 -0.450789  0.177112   
1    0.039062  0.999333  0.000008 -0.034141  0.000384 -0.451961  0.171573   
2    0.078125  0.997331  0.000029 -0.068445  0.000723 -0.455692  0.154201   
3    0.117188  0.993982  0.000063 -0.103077  0.000973 -0.462828  0.121989   
4    0.156250  0.989271  0.000104 -0.138188  0.001097 -0.476416  0.064174   
..        ...       ...       ...       ...       ...       ...       ...   
252  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
253  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
254  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
255  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
256  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

        v_x_2     v_y_2       x_3       y_3     v_x_3     v_y_3   Id  
0  

In [57]:
# 258 rows = one full 257-row trajectory plus the first row of the next one.
print(train.head(258))


             t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
0     0.000000  1.000000  0.000000  0.000000  0.000000 -0.266467  0.859196   
1     0.039062  0.999548  0.000092 -0.023159  0.004731 -0.266261  0.858781   
2     0.078125  0.998190  0.000370 -0.046362  0.009474 -0.265641  0.857535   
3     0.117188  0.995925  0.000833 -0.069654  0.014239 -0.264606  0.855456   
4     0.156250  0.992747  0.001483 -0.093080  0.019040 -0.263154  0.852540   
..         ...       ...       ...       ...       ...       ...       ...   
253   9.882810  1.483859  1.154101  0.249804  0.551574  0.978161  1.249428   
254   9.921880  1.490718  1.176133  0.099868  0.575697  0.995286  1.249430   
255   9.960940  1.491492  1.199018 -0.063145  0.595242  1.018416  1.248499   
256  10.000000  1.485476  1.222565 -0.250131  0.609230  1.048258  1.246830   
257   0.000000  1.000000  0.000000  0.000000  0.000000 -0.176502  0.555739   

        v_x_2     v_y_2       x_3       y_3     v_x_3     v_y_3

In [25]:
def remove_collision_keep_first_zero(df, traj_len=257, tol=1e-8):
    """Cut every fixed-length trajectory right after its first all-zero row.

    ``df`` is read as consecutive blocks of ``traj_len`` rows. Inside a block,
    the first row whose columns (all except the last one) are all smaller than
    ``tol`` in absolute value is kept as the final row of that trajectory and
    every later row of the block is discarded. Blocks without such a row are
    kept whole.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories. The last column is excluded from the zero test.
    traj_len : int, default 257
        Number of rows per trajectory.
    tol : float, default 1e-8
        Absolute threshold below which a value counts as zero.

    Returns
    -------
    pandas.DataFrame
        The retained rows with a fresh ``0..n-1`` index. Rows after the last
        complete block are ignored. An empty frame with the same columns is
        returned when ``df`` holds no complete block.
    """
    num_traj = len(df) // traj_len
    if num_traj == 0:
        # pd.concat([]) would raise; hand back an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)

    cleaned = []
    for i in range(num_traj):
        start = i * traj_len
        traj = df.iloc[start:start + traj_len]

        zero_mask = (np.abs(traj.iloc[:, :-1].to_numpy()) < tol).all(axis=1)

        if zero_mask.any():
            # Slice by position: label-based .loc with positional offsets is
            # only right when df carries a default 0..n-1 index.
            first_zero = int(zero_mask.argmax())
            traj = traj.iloc[:first_zero + 1]  # keep up to the first zero row

        cleaned.append(traj)

    return pd.concat(cleaned).reset_index(drop=True)


In [26]:
# Training set truncated right after the first collision row of each trajectory.
train_first_row = remove_collision_keep_first_zero(train, traj_len=257, tol=1e-8)

In [27]:
# Compare the row counts before and after truncating at the first collision row.
print(train.shape)
print(train_first_row.shape)

(1285000, 14)
(1090737, 14)
