In [None]:
from google.colab import drive

# Attach Google Drive at /content/drive so the processed files can be written there.
drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read several header-less CSV files and stack them into one table
def load_and_concat_data(file_paths):
    """
    Read every CSV in ``file_paths`` (no header row) and stack them row-wise.

    :param file_paths: Iterable of CSV file paths
    :return: Tuple ``(X, y)`` of DataFrames; ``X`` holds every column except
             the last two, ``y`` holds the last two columns. Both share a
             fresh 0..n-1 row index.
    """
    frames = []
    for path in file_paths:
        frames.append(pd.read_csv(path, header=None))

    # Stack the per-file frames, then discard the per-file row labels.
    stacked = pd.concat(frames, axis=0)
    stacked = stacked.reset_index(drop=True)

    # Split column-wise: the trailing two columns are the targets.
    features, targets = stacked.iloc[:, :-2], stacked.iloc[:, -2:]
    return features, targets

# Standardize the three feature splits with one shared scaler
def preprocess_data(X_train, X_val, X_test):
    """
    Scale the feature splits with a single StandardScaler.

    The scaler is fitted on the training features only; the validation and
    test features are transformed with those fitted statistics.

    :param X_train: Training features (used for fitting and transformed)
    :param X_val: Validation features (transformed only)
    :param X_test: Test features (transformed only)
    :return: ``(train_scaled, val_scaled, test_scaled, scaler)``
    """
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    # Validation first, then test, both with the already-fitted scaler.
    val_scaled, test_scaled = (scaler.transform(split) for split in (X_val, X_test))
    return train_scaled, val_scaled, test_scaled, scaler

In [None]:

# Function to load and concatenate data from multiple files
def load_and_concat_data(file_paths):
    """
    Load header-less CSV files, clean them, and stack them into NumPy arrays.

    Each file is read without a header row. Every column is coerced to a
    numeric dtype (values that cannot be parsed become NaN), and every NaN is
    then replaced by the mean of its own column within that file. A column
    that is entirely NaN after coercion has no mean and therefore stays NaN.

    :param file_paths: Sequence of CSV file paths
    :return: Tuple ``(X, y)`` of 2-D NumPy arrays stacked row-wise over all
             files; ``X`` holds every column except the last two, ``y`` holds
             the last two columns
    :raises ValueError: If ``file_paths`` yields no files
    """
    X_list, y_list = [], []
    for file in file_paths:
        raw = pd.read_csv(file, header=None)
        # Coerce first, then take the mean of the *coerced* frame. Taking the
        # mean of the raw frame fails (or skips the column) whenever a column
        # was read as text, which is exactly the case the coercion handles.
        numeric = raw.apply(pd.to_numeric, errors='coerce')
        numeric = numeric.fillna(numeric.mean())
        X_list.append(numeric.iloc[:, :-2].values)
        y_list.append(numeric.iloc[:, -2:].values)

    if not X_list:
        # np.vstack would raise ValueError as well; this message is clearer.
        raise ValueError("file_paths is empty: no datasets to load")

    X = np.vstack(X_list)
    y = np.vstack(y_list)
    return X, y

# Function to standardize the train / validation / test features
def preprocess_data(X_train, X_val, X_test):
    """
    Scale the three feature splits with one StandardScaler.

    The scaler is fitted on ``X_train`` only and then applied unchanged to
    ``X_val`` and ``X_test``.

    :param X_train: Training features (fit + transform)
    :param X_val: Validation features (transform only)
    :param X_test: Test features (transform only)
    :return: ``(X_train_scaled, X_val_scaled, X_test_scaled, scaler)``
    """
    scaler = StandardScaler()
    scaled_splits = [scaler.fit_transform(X_train)]
    # Reuse the fitted scaler for the remaining splits, in val -> test order.
    scaled_splits.extend(scaler.transform(split) for split in (X_val, X_test))
    return (*scaled_splits, scaler)

# Input CSV locations on the mounted Google Drive.
# Adjust these paths to match your own Drive folder layout before running.
# train_files: CSVs loaded and concatenated into the training set
# (the validation set is later carved out of these rows).
train_files = [
    "/content/drive/MyDrive/ML/Train/Aug14_Box_g17.csv",
    "/content/drive/MyDrive/ML/Train/July22_23.csv",
    "/content/drive/MyDrive/ML/Train/July28_Special_2.csv"
]
# test_files: CSVs loaded and concatenated into the test set.
test_files = [
    "/content/drive/MyDrive/ML/Test/Aug14_Box_g11.csv",
    "/content/drive/MyDrive/ML/Test/July22_68.csv"
]

# Loading and concatenating the datasets
X_train_full, y_train_full = load_and_concat_data(train_files)
X_test_full, y_test_full = load_and_concat_data(test_files)

# Shuffling the training rows before splitting.
# A seeded generator is used so the train/validation split is identical on
# every "Restart & Run All"; the global NumPy RNG is left untouched.
rng = np.random.default_rng(42)
shuffle_indices = rng.permutation(len(X_train_full))
X_train_full = X_train_full[shuffle_indices]
y_train_full = y_train_full[shuffle_indices]

# Splitting the training data into train and validation sets.
# The rows were already shuffled above, so shuffle=False simply takes the
# first 80% as training and the last 20% as validation. random_state is not
# passed because it has no effect when shuffle=False.
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, shuffle=False)

# Preprocessing the data: the scaler is fitted on X_train only and then
# applied to the validation and test features.
X_train_scaled, X_val_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_val, X_test_full)

# Printing out the data shapes for verification
print(f"Training features: {X_train_scaled.shape}, Training targets: {y_train.shape}")
print(f"Validation features: {X_val_scaled.shape}, Validation targets: {y_val.shape}")
print(f"Test features: {X_test_scaled.shape}, Test targets: {y_test_full.shape}")

Training features: (56289, 1092), Training targets: (56289, 2)
Validation features: (14073, 1092), Validation targets: (14073, 2)
Test features: (39804, 1092), Test targets: (39804, 2)


In [None]:
# Saving the scaled feature matrices and the target arrays as .npy files.
# NOTE(review): save_dir is not assigned in any cell shown here; it must be set
# to the output directory (a str path) before this cell runs - confirm where
# it is defined.
import os

# Create the output directory if it does not exist yet (skipped for an empty
# save_dir, which means "current working directory").
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

# File name -> array. Paths are built with os.path.join so the files land
# inside save_dir whether or not it ends with a path separator.
arrays_to_save = {
    "X_train_scaled.npy": X_train_scaled,
    "X_val_scaled.npy": X_val_scaled,
    "X_test_scaled.npy": X_test_scaled,
    "y_train.npy": y_train,
    "y_val.npy": y_val,
    "y_test.npy": y_test_full,
}
for file_name, array in arrays_to_save.items():
    np.save(os.path.join(save_dir, file_name), array)

In [None]:
# Show what is currently stored in the output directory on Google Drive
import os

files = os.listdir(save_dir)
# Header line followed by the directory listing on its own line.
print("Saved files in Google Drive:", files, sep="\n")

Saved files in Google Drive:
['X_train_scaled.npy', 'X_val_scaled.npy', 'X_test_scaled.npy', 'y_train.npy', 'y_val.npy', 'y_test.npy', 'scaler.pkl']
