In [None]:
# Install required packages

!pip install tensorflow==2.18.0
!pip install keras==3.7.0
!pip install torch==2.5.1
!pip install torchvision==0.20.1

!pip install numpy==2.0.2
!pip install scipy==1.14.1
!pip install pandas==2.2.3

!pip install scikit-learn==1.5.2

!pip install matplotlib==3.9.2

!pip install joblib==1.4.2
!pip install python-dateutil==2.9.0.post0

!pip install sympy==1.13.1
!pip install opt-einsum==3.4.0

!pip install tensorboard==2.18.0
!pip install protobuf==5.29.0
!pip install threadpoolctl==3.5.0
!pip install packaging==24.2


# 1. Import Necessary Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor

* numpy: For numerical computations, particularly for generating and manipulating arrays.

* matplotlib.pyplot: Used for plotting graphs, such as loss curves or complexity analysis.

* DecisionTreeRegressor: Implements a decision tree model for regression tasks.

* mean_squared_error: Calculates the error between predicted and true values.

* train_test_split: Splits the dataset into training and testing subsets to evaluate model performance.

# 2. Generate Synthetic Dataset

In [None]:
# Build a noisy one-dimensional regression problem: y = sin(2*pi*x) + noise.
np.random.seed(42)  # seed NumPy's global RNG so every run draws the same data

n_samples = 100
noise_scale = 1  # multiplier applied to the standard-normal noise term

# Inputs: a single feature column drawn uniformly from [-1, 1).
X = np.random.uniform(low=-1, high=1, size=(n_samples, 1))

# Targets: the clean sine signal flattened to 1-D, plus the scaled noise.
signal = np.sin(2 * np.pi * X).ravel()
noise = noise_scale * np.random.normal(size=n_samples)
y = signal + noise

* np.random.seed(42): Sets a fixed seed for reproducibility.
* Dataset:
   * X: Input data, uniformly distributed between -1 and 1.
   * y: Output data generated using a sine function, with added Gaussian noise ($\sigma = 1$).
   * Purpose: Mimics a regression problem, where a noisier dataset adds realism.

# 3. Split the Dataset

In [None]:
# Partition the data into train and test subsets. An integer `test_size` is an
# absolute sample count, so 10 samples are held out for testing; the fixed
# `random_state` makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=10,
    random_state=42,
)


* Splits the data into training and testing sets.
    * Training set: Used to fit the model.
    * Testing set: Used to evaluate the model's generalization.
* test_size=10: Ensures the test set contains 10 samples.
* random_state=42: Ensures reproducibility of splits.

# 4. Initialize Storage for Results

In [None]:
# Containers for the mean squared error recorded at each polynomial degree.
train_errors, test_errors = [], []

# Model complexities to evaluate: every polynomial degree from 1 through 199.
max_degree = 199
degrees = np.arange(1, max_degree + 1)

* train_errors and test_errors: Lists to store mean squared errors for training and testing datasets across polynomial degrees.
* degrees: Specifies the range of polynomial degrees to test, increasing model complexity incrementally from 1 to 199.


# 5. Iterate Over Polynomial Degrees

In [None]:
# Sweep over the polynomial degrees, fitting one model per degree and
# recording its mean squared error on the training and test splits.

# Start from empty result lists so that re-running this cell does not append a
# second sweep to the first one, which would leave the lists longer than
# `degrees` and make the plot fail on mismatched lengths.
train_errors = []
test_errors = []

for degree in degrees:
    # Expand the input into polynomial terms up to `degree`. The transformer is
    # fitted on the training inputs only and then applied to the test inputs.
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    # alpha=0.0 turns the ridge penalty off, so this is an unregularized
    # least-squares fit on the polynomial features.
    # NOTE(review): the scikit-learn Ridge documentation advises against
    # alpha=0 for numerical reasons and recommends LinearRegression instead;
    # left as-is here so the recorded errors do not change.
    model = Ridge(alpha=0.0)
    model.fit(X_train_poly, y_train)

    # Record the mean squared error on both splits for this degree.
    y_train_pred = model.predict(X_train_poly)
    y_test_pred = model.predict(X_test_poly)
    train_errors.append(mean_squared_error(y_train, y_train_pred))
    test_errors.append(mean_squared_error(y_test, y_test_pred))

1. Data Transformation:

* PolynomialFeatures: Expands input features into polynomial terms of the specified degree.
* fit_transform: Fits and transforms the training data.
* transform: Applies the same transformation to the test data.

2. Model Training:

* Ridge(alpha=0.0): Ridge regression with the penalty strength set to zero, which makes it ordinary least-squares regression with no regularization. Large coefficients are not penalized at all, so nothing stabilizes the fit at high polynomial degrees.

3. Predictions:

* y_train_pred: Predictions on the training set.
* y_test_pred: Predictions on the test set.

4. Error Calculation:

* Mean squared errors for both training and test sets are computed and stored.

# 6. Visualize Results

In [None]:
# Draw the train and test mean squared error against polynomial degree,
# using a log-scaled y-axis so both small and very large errors stay visible.
fig, ax = plt.subplots(figsize=(10, 6))

for errors, label in ((train_errors, 'Train Loss'), (test_errors, 'Test Loss')):
    ax.plot(degrees, errors, label=label, marker='o')

ax.set_yscale('log')
ax.set_xlabel('Model Complexity (Polynomial Degree)')
ax.set_ylabel('Mean Squared Error (Log Scale)')
ax.set_title('Double Descent in Polynomial Regression (Improved Setup)')
ax.legend()
ax.grid()
plt.show()

Plot Setup:

* X-Axis: Polynomial degree (model complexity).
* Y-Axis: Mean squared error in logarithmic scale for better visualization of trends.

Data Representation:

* train_errors: Plotted as a line with markers to show the training loss at each degree.
* test_errors: Plotted similarly for test loss.
Title and Labels: Clearly describe the relationship between model complexity and generalization performance.

Log Scale: Helps highlight smaller variations in mean squared errors.