<a href="https://colab.research.google.com/github/PLONTZNathan/MachineLearning_Assignment1/blob/elena/DATASET_CLEANED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Library
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.compose import ColumnTransformer
import time

In [2]:
# Mount Google Drive so the CSV files stored under MyDrive can be read from Colab
from google.colab import drive
drive.mount('/content/drive')
# Read the training CSV into the `train` DataFrame
file_path = '/content/drive/MyDrive/X_train.csv'
train = pd.read_csv(file_path)
# Read the test CSV into the `test` DataFrame
file_path_2 = '/content/drive/MyDrive/X_test.csv'
test = pd.read_csv(file_path_2)

Mounted at /content/drive


**CHECK ZERO_TRAJECTORIES ON THE DATASET X_TRAIN**

In [18]:
def check_trajectories_start_zero(df, traj_len=257):
    """Find trajectories whose first row consists entirely of zeros.

    The frame is read as consecutive blocks of ``traj_len`` rows, addressed by
    position (index labels are ignored). Rows left over after the last
    complete block are not inspected.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories; every column takes part in the zero test.
    traj_len : int, default 257
        Number of rows per trajectory. Must be positive.

    Returns
    -------
    list of int
        Zero-based trajectory numbers, in ascending order, whose first row
        equals 0 in every column.

    Raises
    ------
    ValueError
        If ``traj_len`` is not positive.
    """
    if traj_len <= 0:
        raise ValueError(f"traj_len must be a positive integer, got {traj_len}")

    num_traj = len(df) // traj_len

    # A single strided positional slice selects the first row of every
    # complete trajectory, avoiding one iloc lookup per trajectory.
    first_rows = df.iloc[: num_traj * traj_len : traj_len]

    # True where every value of the first row is zero
    starts_with_zeros = first_rows.eq(0).all(axis=1).to_numpy(dtype=bool)

    return np.flatnonzero(starts_with_zeros).tolist()

In [19]:
# Leave out the last column (Id) so it cannot influence the all-zero test
train_no_id = train.iloc[:, :-1]
traj_zero = check_trajectories_start_zero(train_no_id, traj_len=257)

# Position of the first row of each flagged trajectory (257 rows per trajectory)
row_indices = [traj_idx * 257 for traj_idx in traj_zero]

print("Trajectories starting with zeros:", traj_zero)
print("Corresponding starting rows:", row_indices)

Trajectories starting with zeros: [1050, 4058]
Corresponding starting rows: [269850, 1042906]


In [20]:
def remove_zero_trajectories(df, zero_traj, traj_len=257):
    """Return a copy of ``df`` without the rows of the listed trajectories.

    Trajectories are consecutive blocks of ``traj_len`` rows addressed by
    position, so the result does not depend on the index labels of ``df``.
    The returned frame has a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Stacked trajectories.
    zero_traj : iterable of int
        Zero-based trajectory numbers to remove. Repeated entries are allowed.
    traj_len : int, default 257
        Number of rows per trajectory.

    Returns
    -------
    pandas.DataFrame
        New frame without the selected rows; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If a trajectory's row range falls outside ``df``.
    """
    n_rows = len(df)
    keep = np.ones(n_rows, dtype=bool)

    for traj_id in zero_traj:
        start = traj_id * traj_len
        end = start + traj_len
        # Slice assignment would silently clip an out-of-range block, so
        # check explicitly and fail loudly instead.
        if start < 0 or end > n_rows:
            raise KeyError(
                f"trajectory {traj_id} spans rows [{start}, {end}) "
                f"but the frame has only {n_rows} rows"
            )
        keep[start:end] = False

    # Boolean mask selects by position; DataFrame.drop would match index labels
    df_cleaned = df.loc[keep].reset_index(drop=True)
    return df_cleaned

In [22]:
# Drop every row of the trajectories listed in traj_zero (257 rows each) and renumber the index
train_cleaned = remove_zero_trajectories(train, traj_zero, 257)

**CHECK ANOMALIES ON THE DATASET TRAIN_CLEANED**

In [23]:
# Verify that no two rows at t == 0 share the same set of body coordinates

coords = ["x_1", "y_1", "x_2", "y_2", "x_3", "y_3"]
df_t0 = train_cleaned.loc[train_cleaned["t"] == 0].copy()
# Keep only rows with at least one non-zero coordinate, so rows whose six
# coordinates are all zero are not reported as duplicates of one another
df_t0 = df_t0.loc[df_t0[coords].ne(0).any(axis=1)]
# keep=False marks every member of a duplicated group, not just the repeats
duplicates = df_t0.loc[df_t0.duplicated(subset=coords, keep=False)]
print(duplicates)  # an empty frame means no initial position occurs twice

Empty DataFrame
Columns: [t, x_1, y_1, v_x_1, v_y_1, x_2, y_2, v_x_2, v_y_2, x_3, y_3, v_x_3, v_y_3, Id]
Index: []


In [24]:
def check_dataset_anomalies(df):
    """Print a short report on NaN values, infinite values and all-NaN columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The findings are written to standard output only.
    """
    print("Dataset anomalies check\n")

    # 1. Missing values: per-column counts, computed once and reused
    nan_per_column = df.isnull().sum()
    nan_total = nan_per_column.sum()
    if nan_total != 0:
        print(f"Found {nan_total} missing values (NaN).")
        print(nan_per_column[nan_per_column > 0])
    else:
        print("No missing values (NaN) found.")

    # 2. Infinite values of either sign
    inf_per_column = df.isin([np.inf, -np.inf]).sum()
    inf_total = inf_per_column.sum()
    if inf_total != 0:
        print(f"Found {inf_total} infinite values.")
        print(inf_per_column[inf_per_column > 0])
    else:
        print("No infinite values (+inf, -inf) found.")

    # 3. Columns in which every entry is NaN
    empty_cols = []
    for column in df.columns:
        if df[column].isnull().all():
            empty_cols.append(column)

    if empty_cols:
        print(f"Found empty columns: {empty_cols}")
    else:
        print("No empty columns.")


In [25]:
# Report NaN values, infinite values and all-NaN columns in the cleaned training set
check_dataset_anomalies(train_cleaned)

Dataset anomalies check

No missing values (NaN) found.
No infinite values (+inf, -inf) found.
No empty columns.


**CHECK THE STATISTICS OF THE DATASET**

In [28]:
def dataset_summary(df):
    """Print the shape and basic per-column statistics of a DataFrame.

    The output consists of the row/column count, a min/mean/max table and the
    standard deviation of each column covered by ``DataFrame.describe()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. With mixed dtypes, ``describe()`` restricts the
        statistics to the numeric columns.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Shape
    print(f"Shape: {df.shape[0]} rows, {df.shape[1]} columns\n")

    # Stats per column (one row per column after the transpose)
    print("Basic statistics per column:")
    desc = df.describe().T
    print(desc[['min', 'mean', 'max']])

    # Standard deviation: describe() has already computed it, so reuse that
    # column. A separate df.std() call is redundant and raises TypeError on
    # frames holding non-numeric columns with recent pandas versions.
    print("\nStandard deviation:")
    print(desc['std'])

In [30]:
# Exclude the last column (Id) so the summary covers only t, positions and velocities
train_cleaned_no_id = train_cleaned.iloc[:, :-1]
dataset_summary(train_cleaned_no_id)

Shape: 1284486 rows, 13 columns

Basic statistics per column:
             min      mean        max
t       0.000000  4.146921  10.000000
x_1   -30.823004 -0.281400  16.340610
y_1   -17.857729 -0.109560  26.276226
v_x_1 -72.993333 -0.184200  23.503455
v_y_1 -23.109278 -0.016507  34.671679
x_2   -30.833398  0.007174  15.506957
y_2   -52.450808 -0.082044  35.679463
v_x_2 -25.895550  0.032490  72.718229
v_y_2 -48.289007 -0.083642  23.118346
x_3   -17.439994  0.274226  61.656402
y_3   -33.159997  0.191604  27.735539
v_x_3 -30.970540  0.151710  25.964992
v_y_3 -25.509384  0.100149  48.288738

Standard deviation:
t        3.209383
x_1      1.696094
y_1      1.167918
v_x_1    0.877679
v_y_1    0.806579
x_2      1.063197
y_2      1.038813
v_x_2    0.936416
v_y_2    0.975371
x_3      1.443690
y_3      1.683626
v_x_3    0.788051
v_y_3    0.871112
Name: std, dtype: float64


**CHECK ZERO_TRAJECTORIES ALSO ON THE DATASET X_TEST**

In [26]:
# Show the (rows, columns) size of the test set
print(test.shape)

(1041621, 8)


In [27]:
# Repeat the zero-start check on the test set
# NOTE(review): assumes the last column of `test` is an identifier column — confirm
test_no_id = test.iloc[:, :-1]
traj_zero_test = check_trajectories_start_zero(test_no_id, traj_len=257)

# Position of the first row of each flagged trajectory (257 rows per trajectory)
row_indices_test = [traj_idx * 257 for traj_idx in traj_zero_test]

print("Trajectories starting with zeros:", traj_zero_test)
print("Corresponding starting rows:", row_indices_test)
test_cleaned = remove_zero_trajectories(test, traj_zero_test, traj_len=257)

Trajectories starting with zeros: []
Corresponding starting rows: []
