<a href="https://colab.research.google.com/github/PLONTZNathan/MachineLearning_Assignment1/blob/elena/DATASET_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Library
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.compose import ColumnTransformer
import time

In [2]:
# Mount Google Drive so the CSV files stored there are reachable from Colab.
from google.colab import drive

drive.mount('/content/drive')

# Locations of the training and test tables on the mounted drive.
file_path = '/content/drive/MyDrive/X_train.csv'
file_path_2 = '/content/drive/MyDrive/X_test.csv'

# Load both tables into DataFrames.
train = pd.read_csv(file_path)
test = pd.read_csv(file_path_2)

Mounted at /content/drive


**DATASET WITHOUT ANY COLLISION ROWS**

In [22]:
def remove_collision(df, traj_len=257, tol=1e-8):
    """Cut every trajectory just before its first collision row.

    ``df`` is read as consecutive blocks of ``traj_len`` rows, one block per
    trajectory. A row is treated as a collision row when every column except
    the last one has an absolute value below ``tol`` (the time column is part
    of that check). A trajectory that contains such a row keeps only the rows
    that come before the first one; a trajectory without any is kept whole.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories. The last column is ignored by the collision
        check. Any row index is accepted: rows are addressed by position.
    traj_len : int, default 257
        Number of rows per trajectory.
    tol : float, default 1e-8
        Absolute threshold below which a value counts as zero.

    Returns
    -------
    pandas.DataFrame
        The truncated trajectories concatenated in their original order, with
        a fresh 0..n-1 index. Empty (same columns) when ``df`` holds no
        complete trajectory.

    Notes
    -----
    Only ``len(df) // traj_len`` complete blocks are processed, so trailing
    rows that do not fill a whole block are dropped.
    """
    cleaned = []
    num_traj = len(df) // traj_len

    for i in range(num_traj):
        start = i * traj_len
        traj = df.iloc[start:start + traj_len]

        # True for rows where every checked column is numerically zero.
        zero_mask = (np.abs(traj.iloc[:, :-1].to_numpy()) < tol).all(axis=1)

        if zero_mask.any():
            # argmax on a boolean mask gives the position of the first True.
            first_zero_rel = int(zero_mask.argmax())
            # Positional slicing keeps this correct for any row index;
            # label-based slicing would only work with a default 0..n-1 index.
            traj = traj.iloc[:first_zero_rel]

        cleaned.append(traj)

    if not cleaned:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(cleaned).reset_index(drop=True)



In [23]:
# Training set with every collision row removed (257 rows per trajectory).
train_no_collisions = remove_collision(train, traj_len=257, tol=1e-8)

In [24]:
# Row/column counts before and after removing the collision rows.
print(train.shape, train_no_collisions.shape, sep="\n")

(1285000, 14)
(1089790, 14)


**DATASET THAT KEEPS ONLY THE FIRST COLLISION ROW OF EACH TRAJECTORY**

In [33]:
def get_time_vector(df, traj_len=257, tol=1e-8):
    """Return the time column of the first trajectory that has no collision.

    ``df`` is scanned in consecutive blocks of ``traj_len`` rows. A block is
    considered collided when at least one of its rows has every feature
    (all columns except the first and the last) below ``tol`` in absolute
    value.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories; the first column is the time and the last one
        is the Id, both excluded from the collision check.
    traj_len : int, default 257
        Number of rows per trajectory.
    tol : float, default 1e-8
        Absolute threshold below which a value counts as zero.

    Returns
    -------
    numpy.ndarray or None
        The first-column values of the first collision-free block, or
        ``None`` when every complete block contains a collision row.
    """
    # Every feature except the time (first) and Id (last) columns.
    features = df.iloc[:, 1:-1].to_numpy()
    n_blocks = len(df) // traj_len

    for lo in range(0, n_blocks * traj_len, traj_len):
        hi = lo + traj_len
        collided = (np.abs(features[lo:hi]) < tol).all(axis=1)

        if collided.any():
            continue  # this trajectory collides, try the next one

        # No collision: hand back only the time column of this block.
        return df.iloc[lo:hi, 0].to_numpy()

    # Every trajectory collides.
    return None


In [35]:
# Reference time grid, taken from the first trajectory without a collision.
time_vec = get_time_vector(df=train, traj_len=257)
print(time_vec)

[ 0.         0.0390625  0.078125   0.117188   0.15625    0.195312
  0.234375   0.273438   0.3125     0.351562   0.390625   0.429688
  0.46875    0.507812   0.546875   0.585938   0.625      0.664062
  0.703125   0.742188   0.78125    0.820312   0.859375   0.898438
  0.9375     0.976562   1.01562    1.05469    1.09375    1.13281
  1.17188    1.21094    1.25       1.28906    1.32812    1.36719
  1.40625    1.44531    1.48438    1.52344    1.5625     1.60156
  1.64062    1.67969    1.71875    1.75781    1.79688    1.83594
  1.875      1.91406    1.95312    1.99219    2.03125    2.07031
  2.10938    2.14844    2.1875     2.22656    2.26562    2.30469
  2.34375    2.38281    2.42188    2.46094    2.5        2.53906
  2.57812    2.61719    2.65625    2.69531    2.73438    2.77344
  2.8125     2.85156    2.89062    2.92969    2.96875    3.00781
  3.04688    3.08594    3.125      3.16406    3.20312    3.24219
  3.28125    3.32031    3.35938    3.39844    3.4375     3.47656
  3.51562    3.55469 

In [69]:
def keep_first_zero_time(df, time_vec, traj_len=257, tol=1e-8):
    """Truncate each trajectory after its first collision row and fix its time.

    ``df`` is read as consecutive blocks of ``traj_len`` rows. A row is a
    collision row when every column except the last one is below ``tol`` in
    absolute value. For a block that contains one, the first collision row is
    kept as the final row of the trajectory and its first-column (time) value
    is overwritten with ``time_vec`` at the same within-trajectory position;
    everything after it is dropped. Blocks without a collision row are kept
    unchanged. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories; the first column is the time, the last column
        is excluded from the collision check.
    time_vec : sequence of float
        Reference times indexed by position inside a trajectory.
    traj_len : int, default 257
        Number of rows per trajectory.
    tol : float, default 1e-8
        Absolute threshold below which a value counts as zero.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the processed trajectories with a fresh 0..n-1 index.
    """
    pieces = []

    for block_idx in range(len(df) // traj_len):
        lo = block_idx * traj_len
        # Work on a copy so the time fix never touches the caller's frame.
        block = df.iloc[lo:lo + traj_len].copy()

        is_zero_row = (np.abs(block.iloc[:, :-1].values) < tol).all(axis=1)

        if not is_zero_row.any():
            pieces.append(block)
            continue

        # Position of the first collision row inside this trajectory.
        hit = int(np.argmax(is_zero_row))
        # Replace the zeroed time with the reference time for that position.
        block.iat[hit, 0] = time_vec[hit]
        pieces.append(block.iloc[:hit + 1])

    return pd.concat(pieces).reset_index(drop=True)


In [86]:
# Training set keeping the first collision row of each trajectory, with its time restored.
train_first_row_time = keep_first_zero_time(
    df=train,
    time_vec=time_vec,
    traj_len=257,
    tol=1e-8,
)

In [87]:
# Rows around the end of a truncated trajectory and the start of the next one.
print(train_first_row_time.iloc[518:522])

            t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
518  0.156250  0.989271  0.000104 -0.138188  0.001097 -0.476416  0.064174   
519  0.195312  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
520  0.000000  1.000000  0.000000  0.000000  0.000000 -0.081487  0.726750   
521  0.039062  0.999457  0.000186 -0.027789  0.009550 -0.081250  0.726264   

        v_x_2     v_y_2       x_3       y_3     v_x_3     v_y_3   Id  
518 -0.515444 -2.106407 -0.512855 -0.064278  0.653633  2.105310  518  
519  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  519  
520  0.000000  0.000000 -0.918513 -0.726750  0.000000  0.000000  771  
521  0.012175 -0.024874 -0.918208 -0.726450  0.015613  0.015324  772  


In [84]:
# Row/column counts before and after truncating at the first collision row.
print(train.shape, train_first_row_time.shape, sep="\n")

(1285000, 14)
(1090737, 14)


In [77]:
#without changing the time
def remove_collision_keep_first_zero(df, traj_len=257, tol=1e-8):
    """Cut every trajectory right after its first collision row.

    ``df`` is read as consecutive blocks of ``traj_len`` rows, one block per
    trajectory. A row is treated as a collision row when every column except
    the last one has an absolute value below ``tol`` (the time column is part
    of that check). A trajectory that contains such a row keeps its rows up
    to and including the first one, left exactly as found (its time value is
    not changed); a trajectory without any is kept whole.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories. The last column is ignored by the collision
        check. Any row index is accepted: rows are addressed by position.
    traj_len : int, default 257
        Number of rows per trajectory.
    tol : float, default 1e-8
        Absolute threshold below which a value counts as zero.

    Returns
    -------
    pandas.DataFrame
        The truncated trajectories concatenated in their original order, with
        a fresh 0..n-1 index. Empty (same columns) when ``df`` holds no
        complete trajectory.

    Notes
    -----
    Only ``len(df) // traj_len`` complete blocks are processed, so trailing
    rows that do not fill a whole block are dropped.
    """
    cleaned = []
    num_traj = len(df) // traj_len

    for i in range(num_traj):
        start = i * traj_len
        traj = df.iloc[start:start + traj_len]

        # True for rows where every checked column is numerically zero.
        zero_mask = (np.abs(traj.iloc[:, :-1].to_numpy()) < tol).all(axis=1)

        if zero_mask.any():
            # argmax on a boolean mask gives the position of the first True.
            first_zero_rel = int(zero_mask.argmax())
            # Keep up to and including the first zero row. Positional slicing
            # keeps this correct for any row index; label-based slicing would
            # only work with a default 0..n-1 index.
            traj = traj.iloc[:first_zero_rel + 1]

        cleaned.append(traj)

    if not cleaned:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(cleaned).reset_index(drop=True)


In [78]:
# Training set keeping the first collision row of each trajectory, time left as is.
train_first_row = remove_collision_keep_first_zero(train, traj_len=257, tol=1e-8)

In [82]:
# Rows around the end of a truncated trajectory and the start of the next one.
print(train_first_row.iloc[518:522])

            t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
518  0.156250  0.989271  0.000104 -0.138188  0.001097 -0.476416  0.064174   
519  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
520  0.000000  1.000000  0.000000  0.000000  0.000000 -0.081487  0.726750   
521  0.039062  0.999457  0.000186 -0.027789  0.009550 -0.081250  0.726264   

        v_x_2     v_y_2       x_3       y_3     v_x_3     v_y_3   Id  
518 -0.515444 -2.106407 -0.512855 -0.064278  0.653633  2.105310  518  
519  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  519  
520  0.000000  0.000000 -0.918513 -0.726750  0.000000  0.000000  771  
521  0.012175 -0.024874 -0.918208 -0.726450  0.015613  0.015324  772  


In [79]:
# Row/column counts before and after truncating at the first collision row.
print(train.shape, train_first_row.shape, sep="\n")

(1285000, 14)
(1090737, 14)
