### Import Packages


In [195]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import math
import random

### Global attributes

In [196]:
training_dataroot = 'lab1_basic_training.csv' # Path of the training CSV for the basic part
testing_dataroot = 'lab1_basic_testing.csv'   # Path of the testing CSV for the basic part
output_dataroot = 'lab1_basic.csv' # Path of the CSV that the predictions are written to

training_datalist =  [] # Placeholder for the training data; meant to hold a numpy array once loaded
testing_datalist =  [] # Placeholder for the testing data; meant to hold a numpy array once loaded

output_datalist =  [] # Placeholder for the predictions; expected to end up with 100 elements

### Load the Input File

In [197]:
# Read the input CSVs into numpy arrays.
# pandas opens and closes each file itself from the path it is given, so no
# surrounding `open()` is needed (the old file handles were opened but never read).
training_datalist = pd.read_csv(training_dataroot).to_numpy()
testing_datalist = pd.read_csv(testing_dataroot).to_numpy()

### Implement the Regression Model

#### Step 1: Split Data
Split the data in *training_datalist* into a training dataset and a validation dataset.


In [198]:
def SplitData(data, split_ratio):
    """
    Randomly split a dataset into training and validation sets.

    The rows are drawn in a random order, but the input itself is left untouched:
    the function works on a shuffled index instead of shuffling `data` in place, so the
    caller's array keeps its original row order.

    Parameters:
    - data (numpy.ndarray): The dataset to be split. It is expected to be a 2D array where each row represents a data point and each column represents a feature.
    - split_ratio (float): The ratio of the data to be used for training, between 0 and 1. For example, a value of 0.8 means 80% of the data will be used for training and the remaining 20% for validation.

    Returns:
    - training_data (numpy.ndarray): The portion of the dataset used for training (a copy).
    - validation_data (numpy.ndarray): The portion of the dataset used for validation (a copy).

    Raises:
    - ValueError: If split_ratio is not within [0, 1].
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio}")

    data = np.asarray(data)

    # Shuffle row indices rather than the rows, so the caller's array is not reordered
    shuffled_rows = np.random.permutation(len(data))

    # Number of rows that go to the training set (fraction is truncated toward zero)
    split_index = int(len(data) * split_ratio)

    # Fancy indexing returns copies, so later in-place edits cannot leak back into `data`
    training_data = data[shuffled_rows[:split_index]]
    validation_data = data[shuffled_rows[split_index:]]

    return training_data, validation_data



#### Step 2: Preprocess Data
Handle unreasonable data and missing data

> Hint 1: Outliers and missing data can be addressed by either removing them or replacing them using statistical methods (e.g., the mean of all data).

> Hint 2: Missing data are represented as `np.nan`, so functions like `np.isnan()` can be used to detect them.

> Hint 3: Methods such as the Interquartile Range (IQR) can help detect outliers

In [199]:
def PreprocessData(data):
    """
    Clean a dataset by filling in missing values and discarding outlier rows.

    Parameters:
    - data (numpy.ndarray): The dataset to preprocess. It is expected to be a 2D array where each row represents a data point and each column represents a feature. Every entry must be convertible to float.

    Returns:
    - preprocessedData (numpy.ndarray): A float copy of the data in which each np.nan has been
      replaced by its column mean and every row with a value outside the IQR fences is removed.
    """
    # Work on a float copy so np.nan is representable and the input is not modified
    numeric = data.astype(float)

    # Impute: every missing entry takes the NaN-ignoring mean of its own column
    column_means = np.nanmean(numeric, axis=0)
    missing = np.nonzero(np.isnan(numeric))
    numeric[missing] = column_means[missing[1]]

    # Per-column quartiles computed in one call; the fences sit 2 * IQR beyond them
    first_quartile, third_quartile = np.percentile(numeric, [25, 75], axis=0)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 2 * spread
    high_fence = third_quartile + 2 * spread

    # A row survives only if every one of its columns lies inside the fences
    inside_fences = (numeric >= low_fence) & (numeric <= high_fence)
    preprocessedData = numeric[inside_fences.all(axis=1)]

    return preprocessedData

#### Step 3: Implement Regression

In [200]:
def Regression(dataset, degree, num_iteration=None, learning_rate=None, verbose=True):
    """
    Fit a polynomial regression model with batch gradient descent.

    Every column except the last is treated as a feature and the last column as the target.
    The features are standardized with their own mean and standard deviation, raised to the
    powers 1..degree, and prefixed with an intercept column before training.

    Parameters:
    - dataset (numpy.ndarray): A 2D array where each row represents a data point.
    - degree (int): Highest power of each standardized feature included in the model.
    - num_iteration (int, optional): Number of gradient-descent steps. Defaults to 500 for
      degree 1 and 30000 for any other degree.
    - learning_rate (float, optional): Gradient-descent step size. Defaults to 0.01 for
      degree 1 and 0.002 for any other degree.
    - verbose (bool, optional): When True (default), print the design-matrix size and the
      cost every 100 iterations.

    Returns:
    - w (numpy.ndarray): The coefficients, ordered as [intercept, x_1..x_k, x_1^2..x_k^2, ...],
      expressed in terms of the standardized features.
    - mean (numpy.ndarray): Per-feature mean used for standardization.
    - std (numpy.ndarray): Per-feature standard deviation used for standardization; a value of
      zero (constant feature) is reported as 1 so that `(x - mean) / std` stays finite.

    Raises:
    - ValueError: If the dataset contains no rows.
    """

    X = dataset[:, :-1] # All columns except the last one (features)
    y = dataset[:, -1] # Last column (target)

    m = len(y)  # Number of data points
    if m == 0:
        raise ValueError("Regression requires at least one data point")

    # Standardize the training data (mean and std are returned for later use)
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A constant column has zero spread; dividing by it would fill the design matrix with NaN
    std = np.where(std == 0, 1.0, std)
    X = (X - mean) / std

    # Design matrix: intercept column followed by X^1, X^2, ..., X^degree
    X_poly = np.hstack([np.ones((m, 1))] + [X ** d for d in range(1, degree + 1)])

    # Initialize coefficients (weights) to zero
    num_dimensions = X_poly.shape[1]  # Number of columns (intercept and polynomial terms)
    w = np.zeros(num_dimensions)
    if verbose:
        print(f"num_dimensions:{num_dimensions}")
        print(f"X_poly.shape:{X_poly.shape}")

    # Hyperparameters: degree 1 keeps its original fast setting; every other degree falls back
    # to the slower, smaller-step setting that was previously defined only for degree 2
    # (other degrees used to fail with an unbound-variable error).
    # NOTE(review): the fallback is untuned for degree > 2 - pass explicit values if it diverges.
    default_iteration, default_rate = (500, 0.01) if degree == 1 else (30000, 0.002)
    if num_iteration is None:
        num_iteration = default_iteration
    if learning_rate is None:
        learning_rate = default_rate

    # Gradient Descent
    for iteration in range(num_iteration):
        # Residuals of the current weights
        error = np.dot(X_poly, w) - y

        # The cost is only needed for progress reporting, so compute it only when printed
        if verbose and iteration % 100 == 0:
            cost = (1/(2*m)) * np.sum(error ** 2)
            print(f"Iteration {iteration}, Cost: {cost}")

        # Gradient of the halved mean squared error, then one descent step
        gradient = (1/m) * np.dot(X_poly.T, error)
        w = w - learning_rate * gradient

    return w, mean, std  # Return the weights, mean, and std


#### Step 4: Make Prediction

In [201]:

def MakePrediction(w, test_dataset, degree):
    """
    Predicts the output for a given test dataset using a polynomial regression model.

    Parameters:
    - w (numpy.ndarray): The coefficients of the model, ordered as
                               [intercept, x_1..x_k, x_1^2..x_k^2, ...] up to `degree`.
    - test_dataset (numpy.ndarray): The input values to predict for. Either a 2D array with one
                                          row per sample and one column per feature, or a 1D
                                          array of samples of a single feature. Values must be
                                          numeric (an object-dtype array of numbers is accepted).
    - degree (int): Highest power of each feature used by the model; must match the degree
                    the coefficients were fitted with.

    Returns:
    - numpy.ndarray: A 1D float array of predicted values, one per sample in the test dataset.
    """
    # Coerce to float so object-dtype inputs do not produce object-dtype predictions
    features = np.asarray(test_dataset, dtype=float)

    # A 1D input means "many samples of one feature": turn it into a single column
    if features.ndim == 1:
        features = features.reshape(-1, 1)

    # Design matrix: a column of ones for the intercept, then the x^d terms for d = 1..degree
    intercept = np.ones((features.shape[0], 1))
    powers = [features ** d for d in range(1, degree + 1)]
    X_poly_test = np.hstack([intercept] + powers)

    # Each prediction is the dot product of a design-matrix row with the weights
    prediction = np.dot(X_poly_test, np.asarray(w, dtype=float))
    return prediction


#### Step 5: Train Model and Generate Result

Use the above functions to train your model on training dataset, and predict the answer of testing dataset.

Save your predicted values in `output_datalist`



In [None]:
# Basic part: split the loaded training data, clean both splits, and fit the model.
split_ratio = 0.8  # Fraction of rows used for training; the rest is used for validation
degree = 1         # Polynomial degree of the regression model
# (1) Split data into a training part and a validation part
training_data, validation_data = SplitData(training_datalist, split_ratio)

# (2) Preprocess data
# PreprocessData is called once per split, so each call works only from the rows it is given.
training_data = PreprocessData(training_data)
validation_data = PreprocessData(validation_data)

# (3) Train regression model
# `mean` and `std` are kept so that later inputs can be standardized the same way.
w, mean, std = Regression(training_data, degree)

# (4) Predict validation dataset's answer, calculate MAPE comparing to the ground truth
def MAPE(y_true, y_pred):
    """
    Compute the Mean Absolute Percentage Error (MAPE) of a set of predictions.

    Parameters:
    - y_true (numpy.ndarray): Actual values (ground truth); used as the denominator.
    - y_pred (numpy.ndarray): Predicted values.

    Returns:
    - float: MAPE value as a percentage.
    """
    # Signed error of each prediction relative to its true value
    relative_error = (y_true - y_pred) / y_true

    # Average the magnitudes and scale to a percentage
    return 100 * np.mean(np.abs(relative_error))

# Separate features from the target the same way Regression does (every column but the
# last is a feature), so the feature block always lines up with `mean` and `std`.
X_val = validation_data[:, :-1]  # All columns except the last one (features)
X_val = (X_val - mean) / std  # Standardize features with the training statistics
y_val = validation_data[:, -1]   # Last column (target)

# Make predictions on validation data
y_val_pred = MakePrediction(w, X_val, degree)

# Calculate MAPE
mape = MAPE(y_val, y_val_pred)
print(f"MAPE on validation data: {mape}%")

# (5) Make prediction of testing dataset and store the values in output_datalist

# Standardize using training mean and std
X_test = (testing_datalist.astype(float) - mean) / std

# No test row may be dropped (one prediction per row is required), so a missing feature is
# imputed with the training mean, which is exactly 0 after standardization. Without this,
# a single np.nan would turn that row's prediction into NaN.
X_test = np.where(np.isnan(X_test), 0.0, X_test)

# Make predictions on the standardized testing data
output_datalist = MakePrediction(w, X_test, degree)


### *Write the Output File*

Write the predictions to the output CSV file and upload the file to Kaggle.
> Format: 'Id', 'gripForce'


In [203]:
# output_datalist is assumed to be a list (or 1d array) with length = 100

with open(output_dataroot, mode='w', newline='', encoding="utf-8") as out_file:
    csv_writer = csv.writer(out_file)
    # Header row, then one (zero-based index, prediction) row per element
    csv_writer.writerow(['Id', 'gripForce'])
    csv_writer.writerows(enumerate(output_datalist))


# 2. Advanced Part (45%)
In the second part, you need to implement regression differently from the basic part to improve your grip force predictions. You must use more than two features.

You can choose either matrix inversion or gradient descent for this part.

We have provided `lab1_advanced_training.csv` for your training.

> Notice: Be cautious of the "gender" attribute, as it is represented by "F"/"M" rather than a numerical value.

Please save the prediction result in a CSV file and submit it to Kaggle

In [204]:
training_dataroot = 'lab1_advanced_training.csv' # Path of the training CSV for the advanced part
testing_dataroot = 'lab1_advanced_testing.csv'   # Path of the testing CSV for the advanced part
output_dataroot = 'lab1_advanced.csv' # Path of the CSV that the predictions are written to

training_datalist =  [] # Placeholder for the training data; meant to hold a numpy array once loaded
testing_datalist =  [] # Placeholder for the testing data; meant to hold a numpy array once loaded

output_datalist =  [] # Placeholder for the predictions; expected to end up with 3000 elements

In [205]:
# Read the input CSVs into numpy arrays.
# pandas opens and closes each file itself from the path it is given, so no
# surrounding `open()` is needed (the old file handles were opened but never read).
training_datalist = pd.read_csv(training_dataroot).to_numpy()
testing_datalist = pd.read_csv(testing_dataroot).to_numpy()

In [None]:
# Advanced part: split the loaded training data, encode gender, clean both splits, and fit.
split_ratio = 0.8  # Fraction of rows used for training; the rest is used for validation
degree = 2         # Polynomial degree
# (1) Split data
training_data, validation_data = SplitData(training_datalist, split_ratio)

# (2) Preprocess data
# Numeric codes for the gender labels; any other value (e.g. np.nan) is left unchanged
mapping = {'F': 1, 'M': 2}


def EncodeGender(data, column=1):
    """
    Replace the 'F'/'M' labels in one column of `data` with the codes from `mapping`.

    Parameters:
    - data (numpy.ndarray): 2D array holding the gender labels in `column`; modified in place.
    - column (int): Index of the gender column (second column by default).

    Returns:
    - numpy.ndarray: The same array, for convenient reassignment.
    """
    labels = data[:, column]
    data[:, column] = np.where(labels == 'F', mapping['F'],
                               np.where(labels == 'M', mapping['M'], labels))
    return data


# Gender must be numeric before PreprocessData converts everything to float
training_data = EncodeGender(training_data)
validation_data = EncodeGender(validation_data)

# PreprocessData is called once per split, so each call works only from the rows it is given.
training_data = PreprocessData(training_data)
validation_data = PreprocessData(validation_data)

# (3) Train regression model
# `mean` and `std` are kept so that later inputs can be standardized the same way.
w, mean, std = Regression(training_data, degree)

# (4) Predict validation dataset's answer, calculate MAPE comparing to the ground truth
def MAPE(y_true, y_pred):
    """
    Return the Mean Absolute Percentage Error (MAPE) between true and predicted values.

    Parameters:
    - y_true (numpy.ndarray): Actual values (ground truth); each error is divided by these.
    - y_pred (numpy.ndarray): Predicted values.

    Returns:
    - float: MAPE value as a percentage.
    """
    # Magnitude of every error as a fraction of the corresponding true value
    absolute_ratio = np.abs((y_true - y_pred) / y_true)

    mean_ratio = np.mean(absolute_ratio)
    return mean_ratio * 100

# (4) Predict validation dataset's answer, calculate MAPE comparing to the ground truth
X_val = validation_data[:, :-1]  # All columns except the last one (features)
X_val = (X_val - mean) / std  # Standardize features with the training statistics
y_val = validation_data[:, -1]   # Last column (target)

# Make predictions on validation data
y_val_pred = MakePrediction(w, X_val, degree)

# Calculate MAPE
mape = MAPE(y_val, y_val_pred)
print(f"MAPE on validation data: {mape}%")

# (5) Make prediction of testing dataset and store the values in output_datalist

# Transform gender values
gender_column = testing_datalist[:, 1]  # Extract the gender column
# Apply mapping: map 'F' to 1, 'M' to 2, and leave np.nan unchanged
gender_column_mapped = np.where(gender_column == 'F', 1, np.where(gender_column == 'M', 2, gender_column))
testing_datalist[:, 1] = gender_column_mapped  # Update the dataset with mapped values

# A CSV that mixes text and numeric columns is loaded as an object-dtype array; now that
# gender is numeric, cast to float so standardization and prediction run on real numbers.
# Standardize using training mean and std.
X_test = (testing_datalist.astype(float) - mean) / std

# No test row may be dropped (one prediction per row is required), so a missing feature is
# imputed with the training mean, which is exactly 0 after standardization. Without this,
# a single np.nan would turn that row's prediction into NaN.
X_test = np.where(np.isnan(X_test), 0.0, X_test)

# Make predictions on the standardized testing data
output_datalist = MakePrediction(w, X_test, degree)

# Save the Code File
Please save your code and submit it as an ipynb file! (**Lab1.ipynb**)