# In [None]:  (Jupyter cell prompt left over from the notebook export)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import time

# --- 1. Data Generation ---
# Builds a synthetic regression dataset: uniformly distributed features, a
# known linear ground truth, and additive Gaussian noise on the target.

N_SAMPLES = 1_000_000  # Number of instances (1 million)
N_FEATURES = 5         # Number of features
NOISE_LEVEL = 0.5      # Standard deviation of the noise
RANDOM_SEED = 42       # Seed for the synthetic data (same value as the random_state used for sampling/splitting)

# y = 1.5*F1 + (-2.2)*F2 + 0.5*F3 + 3.0*F4 + (-0.8)*F5 + noise
TRUE_WEIGHTS = np.array([1.5, -2.2, 0.5, 3.0, -0.8])
if TRUE_WEIGHTS.shape != (N_FEATURES,):
    # Fail early with a clear message instead of a shape error inside the matmul.
    raise ValueError(
        f"TRUE_WEIGHTS must have exactly {N_FEATURES} entries, got shape {TRUE_WEIGHTS.shape}"
    )

print(f"Generating {N_SAMPLES} samples with {N_FEATURES} features...")
# A dedicated seeded Generator makes the dataset reproducible from run to run
# without touching NumPy's global random state.
rng = np.random.default_rng(RANDOM_SEED)
X = rng.random((N_SAMPLES, N_FEATURES))  # uniform on [0, 1)
noise = rng.normal(loc=0.0, scale=NOISE_LEVEL, size=N_SAMPLES)
y = X @ TRUE_WEIGHTS + noise

# Create DataFrame: one column per feature plus the noisy target
feature_cols = [f'Feature_{i+1}' for i in range(N_FEATURES)]
df = pd.DataFrame(X, columns=feature_cols)
df['Target'] = y

print("Data generation complete.")

# --- 2. Data Visualization & Outlier Check (on a sample) ---

print("\n--- Starting Exploratory Data Analysis (EDA) on a 5000-point sample ---")
# Drawing every row of the full dataset would be slow and unreadable, so all
# plots below work on a fixed-seed random subset instead.
SAMPLE_SIZE = 5000
df_sample = df.sample(n=SAMPLE_SIZE, random_state=42)
target_sample = df_sample['Target']

# a) One scatter panel per feature, each plotted against the target
print("Generating scatter plots (Feature vs. Target)...")
fig, axes = plt.subplots(nrows=1, ncols=N_FEATURES, figsize=(25, 5))
for ax, col in zip(axes, feature_cols):
    sns.scatterplot(x=df_sample[col], y=target_sample, ax=ax, alpha=0.5, s=10)
    ax.set_title(f'{col} vs. Target', fontsize=10)
    ax.set_xlabel(col)
    ax.set_ylabel('Target')
# y > 1 places the figure title above the row of panel titles.
fig.suptitle('Feature vs. Target Scatter Plots', fontsize=16, y=1.02)
fig.tight_layout()
plt.show()

# b) One boxplot per column (features and target) to eyeball outliers
print("Generating boxplots for outlier detection...")
all_cols = feature_cols + ['Target']
fig, axes = plt.subplots(nrows=1, ncols=len(all_cols), figsize=(20, 6))
for ax, col in zip(axes, all_cols):
    sns.boxplot(y=df_sample[col], ax=ax)
    ax.set_title(f'Boxplot of {col}', fontsize=10)
fig.suptitle('Outlier Check via Boxplots', fontsize=16, y=1.02)
fig.tight_layout()
plt.show()

print("EDA complete. Note: Outliers are expected due to the normal noise distribution.")

# --- 3. Data Split (10% Test) ---

print(f"\nSplitting full dataset ({N_SAMPLES} instances) into 90% train / 10% test...")
# Separate the response column from the predictors.
y = df['Target']
X = df.drop(columns=['Target'])

# Every row takes part: a fixed-seed shuffle holds out one tenth for testing
# and leaves the rest for training.
splits = train_test_split(X, y, test_size=0.10, random_state=42)
X_train, X_test, y_train, y_test = splits

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size:     {X_test.shape[0]}")

# --- 4. Model 1: Decision Tree Regressor ---
# Fits a default (unconstrained) decision tree on the raw training features,
# times the fit, and reports MSE/MAE on the held-out test set.

print("\n--- Training Decision Tree Regressor ---")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock, which can be adjusted
# while training is running.
start_time = time.perf_counter()

# Decision Trees do not require feature scaling
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

dt_duration = time.perf_counter() - start_time
print(f"Decision Tree training complete in {dt_duration:.2f} seconds.")

# Predictions and Evaluation on the held-out test set
y_pred_dt = dt_model.predict(X_test)
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_mae = mean_absolute_error(y_test, y_pred_dt)

print(f"Decision Tree Test MSE: {dt_mse:.4f}")
print(f"Decision Tree Test MAE: {dt_mae:.4f}")

# --- 5. Model 2: Fully Connected Neural Network (FCNN) ---
# Standardizes the features, trains a small two-hidden-layer network with
# Adam on an MSE loss, times the fit, and reports MSE/MAE on the test set.

print("\n--- Training Fully Connected Neural Network (FCNN) ---")

# FCNNs require feature scaling for good performance.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the FCNN model
fcnn_model = Sequential([
    Dense(64, activation='relu', input_shape=(N_FEATURES,)),
    Dense(32, activation='relu'),
    # Output layer: 1 neuron, linear activation (default) for regression
    Dense(1)
])

# Compile the model
fcnn_model.compile(optimizer=Adam(learning_rate=0.001),
                   loss='mse',  # Use Mean Squared Error as the loss function
                   metrics=['mae']) # Also track Mean Absolute Error

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
fcnn_model.summary()

# Train the FCNN
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike the adjustable wall clock behind time.time().
start_time = time.perf_counter()
history = fcnn_model.fit(
    X_train_scaled, y_train,
    epochs=10, # 10 epochs is often enough for this kind of simple problem
    batch_size=256,
    validation_split=0.1, # Use 10% of the training data for validation
    verbose=1
)
fcnn_duration = time.perf_counter() - start_time
print(f"FCNN training complete in {fcnn_duration:.2f} seconds.")

# Predictions and Evaluation on the held-out test set
y_pred_fcnn = fcnn_model.predict(X_test_scaled).flatten() # flatten to 1D array
fcnn_mse = mean_squared_error(y_test, y_pred_fcnn)
fcnn_mae = mean_absolute_error(y_test, y_pred_fcnn)

print(f"FCNN Test MSE: {fcnn_mse:.4f}")
print(f"FCNN Test MAE: {fcnn_mae:.4f}")


# --- 6. Performance Comparison ---

print("\n--- Model Performance Comparison ---")
# Numbers are rendered to fixed precision up front so the printed table shows
# a uniform number of decimals per column.
results = {
    'Model': ['Decision Tree', 'FCNN'],
    'Training Time (s)': [f"{seconds:.2f}" for seconds in (dt_duration, fcnn_duration)],
    'Test MSE': [f"{mse:.4f}" for mse in (dt_mse, fcnn_mse)],
    'Test MAE': [f"{mae:.4f}" for mae in (dt_mae, fcnn_mae)],
}
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

# Closing commentary; the noise variance is the irreducible error floor that
# the second line quotes.
noise_variance = NOISE_LEVEL ** 2
interpretation = (
    "The FCNN likely performs significantly better (lower MSE/MAE).",
    f"The theoretical 'best possible' MSE is the variance of the noise, which is {noise_variance:.4f}.",
    "The FCNN's MSE should be very close to this value, as it is excellent at finding the underlying linear relationship.",
    "The Decision Tree, while fast, struggles to perfectly model this linear sum and is more sensitive to the noise.",
)
print("\n--- Interpretation ---")
print(*interpretation, sep="\n")