# Non-Linear Regression

In [131]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Auxiliar functions

In [123]:
def sign(x):
    """Calculates the sign of a number."""
    if x > 0:
        return 1  
    elif x == 0:
        return 0
    else:
        return -1

def create_database(min_value, max_value, d, number_of_points):
    """Creates a database of random points with a specified range and number of dimensions."""
    return np.random.uniform(min_value, max_value, (number_of_points, d + 2))

def get_target_points(min_value, max_value, d):
    """Generates random points representing the endpoints of a line."""
    return np.random.uniform(min_value, max_value, (2, d))

def get_target_function(point1, point2):
    """Calculates the coefficients of the line passing through two random points."""
    a = (point2[1] - point1[1]) / (point2[0] - point1[0])  # Slope
    b = point1[1] - a * point1[0]  # Intercept
    return [a, b]

def is_above_line(point):
    """Checks if a point is above a line defined by its coefficients."""
    x = point[0]
    y = point[1]
    return np.sign(x**2 + y**2 - 0.6)

def target_function_to_database(database, coefficients):
    """Assigns labels to points in the database based on their position relative to a line."""
    for i in range(len(database)):
        x = database[i][1]
        y = database[i][2]

        database[i][0] = 1 # Add x0

        database[i][-1] = is_above_line([x, y], coefficients) # Add yn
    
    return database

def add_noise(database, noise_ratio=0.1):
    """Adds noise to the labels of a given ratio of the database."""
    num_noisy_points = int(noise_ratio * len(database))
    noisy_indices = np.random.choice(len(database), num_noisy_points, replace=False)
    database[noisy_indices, -1] = -1 * database[noisy_indices, -1]
    return database

def calculate_out_of_sample_error(g, test_database):
    """Calculates the out-of-sample error of a given hypothesis on a test database."""
    X_test = test_database[:, :3]  
    y_test = test_database[:, -1]  

    predictions = np.sign(X_test @ g)
    error_outside = np.mean(predictions != y_test)
    return error_outside

def linear_regression(d, target_train_database):
    """Calculates the linear regression of a given database."""
    
    X = target_train_database[:, :d+1]  
    y = target_train_database[:, -1]    

    # Calculates (X^T X)^-1 X^T y
    X_transpose = X.T
    pseudo_inverse = np.linalg.inv(X_transpose @ X) @ X_transpose

    g = pseudo_inverse @ y
    predictions = np.sign(X @ g)
    error_inside = np.mean(predictions != y)
    
    return g, error_inside

def run_non_linear_regression(d, N, number_of_executions, transformation = False, hypotheses=None):
    """Executes the non-linear regression algorithm, accepting a transformation of the features."""
    sum_error_inside = 0
    sum_ws = np.zeros(6)
    for _ in range(number_of_executions):
        # Criação do banco de dados
        train_database = create_database(-1, 1, d, N)
        x1 = train_database[:, 1]
        x2 = train_database[:, 2]

        # Aplicação da função target
        train_database[:, 3] = is_above_line([x1, x2])

        # Adição de ruído
        train_database = add_noise(train_database)

        if transformation:
            train_database = transform_features(x1, x2)
            w, _ = linear_regression(5, train_database)
            sum_ws += w

            predicted_values = np.sign(hypotheses[0] + hypotheses[1] * x1 + hypotheses[2] * x2 + hypotheses[3] * x1 * x2 + hypotheses[4] * x1**2 + hypotheses[5] * x2**2)
            real_values = train_database[:, -1]

            error_inside = np.mean(predicted_values != real_values)

        else:
            w, error_inside = linear_regression(2, train_database)
       
        sum_error_inside += error_inside


    if transformation:
        distance = compare_hypotheses(sum_ws / number_of_executions, hypotheses)
        return distance, sum_error_inside / number_of_executions

    return sum_error_inside / number_of_executions

def transform_features(x1, x2):
    """Transform the features of the database."""
    return np.column_stack((np.ones_like(x1), x1, x2, x1 * x2, x1**2, x2**2, is_above_line([x1, x2])))

def compare_hypotheses(w, hypothese):
    """Calculates the distance between two hypotheses."""
    distance = np.linalg.norm(w - hypothese)
    return distance

def estimate_e_out(hypothesis, num_points=1000, num_executions=1000):
    """Estimates the out-of-sample error of a given hypothesis."""
    sum_error_outside = 0

    for _ in range(num_executions):
        # Gerar um novo conjunto de 1000 pontos
        x1 = np.random.uniform(-1, 1, size=num_points)
        x2 = np.random.uniform(-1, 1, size=num_points)
        
        # Calcular os valores previstos usando a hipótese fornecida
        predicted_values = np.sign(-1 - 0.05 * x1 + 0.08 * x2 + 0.13 * x1 * x2 + 1.5 * x1**2 + 1.5 * x2**2)
        
        # Gerar os rótulos reais com base na função alvo
        real_values = np.sign(x1**2 + x2**2 - 0.6)
        
        # Add noise
        noise_database = add_noise(np.column_stack((x1, x2, real_values)))
        real_values = noise_database[:, -1]
        
        # Calcular o erro de classificação fora-da-amostra
        error_outside = np.mean(predicted_values != real_values)
        
        sum_error_outside += error_outside
    
    return sum_error_outside / num_executions

## 1 - Execute a Regressão Linear sem nenhuma transformação, usando o vetor de atributos(1, x2, x2) para encontrar o peso w. Qual é o valor aproximado de classificação do erro médio dentro-de-amostra Ein (medido ao longo de 1000 execuções)?

In [124]:
mean_in_error = run_non_linear_regression(2, 1000, 1000, transformation=False)
print("Non-linear regression for N = 1000")
print("Mean in-sample error: ", mean_in_error)

Non-linear regression for N = 1000
Mean in-sample error:  0.47979100000000063


## 2 - Agora, transforme os N = 1000 dados de treinamento seguindo o vetor de atributos não-linear (1, x2, x2, x1x2, x1<sup>2</sup>, x2<sup>2</sup>). Encontre o vetor we que corresponda à solução da Regressão Linear. Quais das hipóteses a seguir é a mais próxima à que você encontrou? Avalie o resultado médio obtido após 1000 execuções.

### (a) g(x1, x2) = sign(-1 - 0.05x1 + 0.08x2 + 0.13x1x2 + 1.5x1^2 + 1.5x2^2)

In [125]:
distance, error_inside = run_non_linear_regression(2, 1000, 1000, transformation=True, hypotheses=np.array([-1, -0.05, 0.08, 0.13, 1.5, 1.5]))
print("Distance between mean weights and target function (a): ", distance)
print("Mean in-sample error: ", error_inside)

Distance between mean weights and target function (a):  0.6966509863924774
Mean in-sample error:  0.05354699999999978


### (b) g(x1, x2) = sign(-1 - 0.05x1 + 0.08x2 + 0.13x1x2 + 1.5x1^2 + 15x2^2)

In [126]:
distance, error_inside  = run_non_linear_regression(2, 1000, 1000, transformation=True, hypotheses=np.array([-1, -0.05, 0.08, 0.13, 1.5, 15]))
print("Distance between mean weights and target function (b): ", distance)
print("Mean in-sample error: ", error_inside)

Distance between mean weights and target function (b):  13.059743542757625
Mean in-sample error:  0.31115599999999977


### (c) g(x1, x2) = sign(-1 - 0.05x1 + 0.08x2 + 0.13x1x2 + 15x1^2 + 1.5x2^2)

In [127]:
distance, error_inside = run_non_linear_regression(2, 1000, 1000, transformation=True, hypotheses=np.array([-1, -0.05, 0.08, 0.13, 15, 1.5]))
print("Distance between mean weights and target function (c): ", distance)
print("Mean in-sample error: ", error_inside)

Distance between mean weights and target function (c):  13.064652795918663
Mean in-sample error:  0.31104799999999966


### (d) g(x1, x2) = sign(-1 - 1.5x1 + 0.08x2 + 0.13x1x2 + 0.05x1^2 + 0.05x2^2)

In [128]:
distance, error_inside = run_non_linear_regression(2, 1000, 1000, transformation=True, hypotheses=np.array([-1, -1.5, 0.08, 0.13, 0.05, 0.05]))
print("Distance between mean weights and target function (d): ", distance)
print("Mean in-sample error: ", error_inside)

Distance between mean weights and target function (d):  3.0886899516425976
Mean in-sample error:  0.38540699999999994


### (e) g(x1, x2) = sign(-1 - 0.05x1 + 0.08x2 + 1.5x1x2 + 0.15x1^2 + 0.15x2^2)

In [129]:
distance, error_inside = run_non_linear_regression(2, 1000, 1000, transformation=True, hypotheses=np.array([-1, -0.05, 0.08, 1.5, 0.15, 0.15]))
print("Distance between mean weights and target function (e): ", distance)
print("Mean in-sample error: ", error_inside)

Distance between mean weights and target function (e):  2.9629257131505056
Mean in-sample error:  0.4679380000000008


## 3 - Qual o valor mais próximo do erro de classificação fora-de-amostra Eout de sua hipótese na questão anterior? (Estime-o gerando um novo conjunto de 1000 pontos e usando 1000 execuções diferentes, como antes).

In [130]:
error_outside = estimate_e_out(np.array([-1, -0.05, 0.08, 0.13, 1.5, 1.5]))
print("Estimated out-of-sample error for hyphotesis (a): ", error_outside)

Estimated out-of-sample error for hyphotesis (a):  0.14252900000000038
