In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# 1) Read CSV (make sure path is correct)
df = pd.read_csv("Concrete_Data.csv")

# Define the target column. The spelling with a trailing space is tried first
# (presumably that is how the raw CSV header is written; verify against the file).
target_col = "Concrete_compressive_strength "

# Double-check that target_col is present. If the exact spelling is missing,
# fall back to a whitespace-insensitive match so a header with or without the
# stray space is accepted, and remember the spelling actually used in the file.
if target_col not in df.columns:
    wanted = target_col.strip()
    candidates = [col for col in df.columns if str(col).strip() == wanted]
    if not candidates:
        raise KeyError(
            "Target column not found. Please verify your CSV column names. "
            f"Looked for {wanted!r}; available columns: {list(df.columns)}"
        )
    target_col = candidates[0]

In [None]:
# 2) Split into Train (60%), Validation (20%), Test (20%)
# The second split takes 25% of the remaining 80%, i.e. 20% of the full data.
df_temp, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_temp, test_size=0.25, random_state=42)

# Re-index each split into a fresh frame instead of mutating the split outputs
# in place; this keeps the cell idempotent and avoids chained-assignment warnings.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# 3) Fill missing values with median from train (if any)
# Medians come from the TRAIN split only, so validation/test statistics never
# leak into preprocessing. numeric_only=True skips non-numeric columns instead
# of raising on them.
train_medians = df_train.median(numeric_only=True)
df_train = df_train.fillna(train_medians)
df_val = df_val.fillna(train_medians)
df_test = df_test.fillna(train_medians)

In [None]:
# 4) Rank every non-target column by its correlation with the target (TRAIN set only)
target_series = df_train[target_col]
corrs = [
    (name, df_train[name].corr(target_series))
    for name in df_train.columns
    if name != target_col
]

# Largest absolute correlation first
corrs_sorted = sorted(corrs, key=lambda pair: abs(pair[1]), reverse=True)

# The two leading (feature, correlation) pairs drive the rest of the notebook
feature1, corr1 = corrs_sorted[0]
feature2, corr2 = corrs_sorted[1]
print(f"Top 2 features: {feature1} (corr: {corr1:.3f}), {feature2} (corr: {corr2:.3f})")

In [None]:
#############################################
# PROBLEM STATEMENT 1: TOP-2 FEATURES
#############################################

In [None]:
# 1. Multiple Linear Regression (Top 2 Features)
# Target and two-column design matrix, both taken from the training split.
y_train = df_train[target_col]
X_train = df_train[[feature1, feature2]]

# Ordinary least-squares fit on the two selected features.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Validation / test inputs use the same two columns the model was trained on
X_val, y_val = df_val[[feature1, feature2]], df_val[target_col]
X_test, y_test = df_test[[feature1, feature2]], df_test[target_col]

# Predictions of the fitted linear model on each split
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# RMSE = sqrt(MSE), computed per split in train / validation / test order
rmse_train, rmse_val, rmse_test = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in (
        (y_train, y_train_pred),
        (y_val, y_val_pred),
        (y_test, y_test_pred),
    )
)

In [None]:
# Plot the best-fit plane on the TRAINING data
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_train[feature1], X_train[feature2], y_train, alpha=0.7)

# Create a 20x20 mesh spanning the observed range of both training features
x_surf = np.linspace(X_train[feature1].min(), X_train[feature1].max(), 20)
y_surf = np.linspace(X_train[feature2].min(), X_train[feature2].max(), 20)
x_surf, y_surf = np.meshgrid(x_surf, y_surf)

# The model was fitted on a DataFrame, so predict on a DataFrame carrying the
# same column names; a bare ndarray makes scikit-learn warn about missing
# feature names.
mesh_frame = pd.DataFrame(
    np.column_stack((x_surf.ravel(), y_surf.ravel())),
    columns=X_train.columns,
)
z_surf = model.predict(mesh_frame).reshape(x_surf.shape)

ax.plot_surface(x_surf, y_surf, z_surf, alpha=0.3)
ax.set_xlabel(feature1)
ax.set_ylabel(feature2)
ax.set_zlabel(target_col)
ax.set_title("Best-Fit Plane (Training Data)")
plt.show()

In [None]:
# Actual vs. predicted strength on the TEST split, with the y = x reference line
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_test_pred, alpha=0.7)

# Dashed diagonal: where the points would fall if every prediction were exact
diag_lo, diag_hi = y_test.min(), y_test.max()
ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'k--')

ax.set_xlabel("Actual Strength (MPa)")
ax.set_ylabel("Predicted Strength (MPa)")
ax.set_title("Actual vs Predicted (Test Data)")
plt.show()

In [None]:
# Summary of the two-feature linear model, one line per split
print("Linear Regression with Top 2 Features:")
for split_name, split_rmse in (
    ("Train", rmse_train),
    ("Validation", rmse_val),
    ("Test", rmse_test),
):
    print(f"{split_name} RMSE: {split_rmse:.3f}")


In [None]:
# 2. Polynomial Regression (Top 2 Features)
# For each degree: expand the two features, fit a linear model on the expanded
# features, record train/validation RMSE, and plot the fitted surface.
degrees = [2, 3, 4, 5]
train_rmse_poly = []
val_rmse_poly = []
models_poly = []  # (fitted PolynomialFeatures, fitted LinearRegression) per degree

# The plotting mesh depends only on the training-feature ranges, so build it once.
x_surf = np.linspace(X_train[feature1].min(), X_train[feature1].max(), 20)
y_surf = np.linspace(X_train[feature2].min(), X_train[feature2].max(), 20)
x_surf, y_surf = np.meshgrid(x_surf, y_surf)

# Keep the training column names on the mesh: each transformer is fitted on a
# DataFrame and would warn about missing feature names on a bare ndarray.
mesh_points = pd.DataFrame(
    np.column_stack((x_surf.ravel(), y_surf.ravel())),
    columns=X_train.columns,
)

for d in degrees:
    # Create polynomial features (fit on train only, reuse on validation).
    # The test split is transformed later, once the best degree is chosen.
    poly = PolynomialFeatures(degree=d)
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)

    # Train model
    model_poly = LinearRegression().fit(X_train_poly, y_train)
    models_poly.append((poly, model_poly))

    # Compute RMSE
    y_train_pred_poly = model_poly.predict(X_train_poly)
    y_val_pred_poly = model_poly.predict(X_val_poly)

    rmse_train_poly = np.sqrt(mean_squared_error(y_train, y_train_pred_poly))
    rmse_val_poly = np.sqrt(mean_squared_error(y_val, y_val_pred_poly))

    train_rmse_poly.append(rmse_train_poly)
    val_rmse_poly.append(rmse_val_poly)

    # Plot best-fit surface for each degree
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X_train[feature1], X_train[feature2], y_train, alpha=0.7)

    # Predict on the mesh grid
    mesh_points_poly = poly.transform(mesh_points)
    z_surf = model_poly.predict(mesh_points_poly).reshape(x_surf.shape)

    ax.plot_surface(x_surf, y_surf, z_surf, alpha=0.3)
    ax.set_xlabel(feature1)
    ax.set_ylabel(feature2)
    ax.set_zlabel(target_col)
    ax.set_title(f"Polynomial Regression (Degree {d}) - Training Data")
    plt.show()
    # Release the figure so repeated iterations do not accumulate open figures.
    plt.close(fig)

In [None]:
# Model selection: keep the degree with the lowest validation RMSE
best_degree_idx = np.argmin(val_rmse_poly)
best_poly, best_model_poly = models_poly[best_degree_idx]
best_degree = degrees[best_degree_idx]

# Score the selected transformer + model pair on the test split
X_test_poly = best_poly.transform(X_test)
y_test_pred_poly = best_model_poly.predict(X_test_poly)
test_mse_poly = mean_squared_error(y_test, y_test_pred_poly)
rmse_test_poly = np.sqrt(test_mse_poly)

In [None]:
# Validation RMSE per polynomial degree, one bar per degree tried
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(degrees, val_rmse_poly, color='skyblue')
ax.set_xticks(degrees)  # tick only at the degrees that were evaluated
ax.set_xlabel('Polynomial Degree')
ax.set_ylabel('Validation RMSE')
ax.set_title('Validation RMSE vs Polynomial Degree')
plt.show()

In [None]:
# Actual vs. predicted strength on the TEST split for the selected polynomial model
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_test_pred_poly, alpha=0.7)

# Dashed y = x line marks perfect predictions
diag_lo, diag_hi = y_test.min(), y_test.max()
ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'k--')

ax.set_xlabel("Actual Strength (MPa)")
ax.set_ylabel("Predicted Strength (MPa)")
ax.set_title(f"Polynomial Regression (Degree {best_degree}) - Test Data")
plt.show()

In [None]:
# Summary of the selected polynomial model (one line per item)
print(
    "\nPolynomial Regression with Top 2 Features:",
    f"Best Polynomial Degree: {best_degree}",
    f"Test RMSE: {rmse_test_poly:.3f}",
    sep="\n",
)

In [None]:
# 3. Neural Network (Top 2 Features)
# Feature frames become float tensors, one per split (train, validation, test).
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.FloatTensor(features.values) for features in (X_train, X_val, X_test)
)

# Targets are reshaped into a single column so they line up with the
# network's one-unit output.
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.FloatTensor(target.values).view(-1, 1)
    for target in (y_train, y_val, y_test)
)

In [None]:
# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)

# Shuffle mini-batches with a dedicated, seeded generator so the batch order is
# reproducible after a kernel restart (seed 42 matches the data splits).
# The generator's state advances as the loader is iterated, so re-run this cell
# before re-running training to get the same batch order again.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

In [None]:
# Single-hidden-layer regression network used for every hidden-size experiment
class NeuralNet(nn.Module):
    """Feed-forward regressor: Linear -> Sigmoid -> Linear with one output unit.

    Args:
        input_size: number of input features per sample.
        hidden_dim: number of units in the hidden layer.
    """

    def __init__(self, input_size=2, hidden_dim=32):
        super().__init__()
        hidden_layer = nn.Linear(input_size, hidden_dim)
        output_layer = nn.Linear(hidden_dim, 1)
        # Stored under `network` so parameter names stay `network.<index>.*`.
        self.network = nn.Sequential(hidden_layer, nn.Sigmoid(), output_layer)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        prediction = self.network(x)
        return prediction

In [None]:
# Train one NeuralNet per hidden-layer width, record its per-epoch training
# loss and final validation RMSE, and plot the surface it learned.
hidden_dims = [8, 16, 32, 64]
val_rmse_nn = []
models_nn = []
train_losses_list = []

input_size = 2  # For top 2 features
epochs = 1000
criterion = nn.MSELoss()  # stateless, so one instance serves every model

# Seed torch's global RNG so weight initialisation (and batch shuffling, when
# the loader draws from the global RNG) is the same on every fresh run.
# 42 matches the seed used for the data splits.
torch.manual_seed(42)

# NOTE(review): inputs are fed to the network unscaled. With sigmoid hidden
# units and plain SGD this may limit what the model can learn; consider
# standardising the features (fit on train only) - TODO confirm.

# The plotting mesh depends only on the training-feature ranges, so build it once.
x_surf = np.linspace(X_train[feature1].min(), X_train[feature1].max(), 20)
y_surf = np.linspace(X_train[feature2].min(), X_train[feature2].max(), 20)
x_surf, y_surf = np.meshgrid(x_surf, y_surf)
mesh_points = np.column_stack((x_surf.ravel(), y_surf.ravel()))
mesh_tensor = torch.FloatTensor(mesh_points)

for hidden_dim in hidden_dims:
    model_nn = NeuralNet(input_size=input_size, hidden_dim=hidden_dim)
    optimizer = optim.SGD(model_nn.parameters(), lr=0.01)

    # Training loop
    train_losses = []

    for epoch in range(epochs):
        model_nn.train()
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model_nn(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        # Record training loss on the full training set after each epoch
        model_nn.eval()
        with torch.no_grad():
            outputs = model_nn(X_train_tensor)
            train_loss = criterion(outputs, y_train_tensor).item()
        train_losses.append(train_loss)

    train_losses_list.append(train_losses)

    # Evaluate on validation set and predict the surface in a single no-grad pass
    model_nn.eval()
    with torch.no_grad():
        val_pred = model_nn(X_val_tensor)
        z_surf = model_nn(mesh_tensor).numpy().reshape(x_surf.shape)
    val_rmse = np.sqrt(mean_squared_error(y_val_tensor.numpy(), val_pred.numpy()))
    val_rmse_nn.append(val_rmse)

    models_nn.append(model_nn)

    # Plot best-fit surface
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X_train[feature1], X_train[feature2], y_train, alpha=0.7)
    ax.plot_surface(x_surf, y_surf, z_surf, alpha=0.3)
    ax.set_xlabel(feature1)
    ax.set_ylabel(feature2)
    ax.set_zlabel(target_col)
    ax.set_title(f"Neural Network (Hidden Dim: {hidden_dim}) - Training Data")
    plt.show()
    # Release the figure so repeated iterations do not accumulate open figures.
    plt.close(fig)


In [None]:
# Model selection: the hidden width with the lowest validation RMSE wins
best_nn_idx = np.argmin(val_rmse_nn)
best_hidden_dim, best_model_nn = hidden_dims[best_nn_idx], models_nn[best_nn_idx]

In [None]:
# Score the selected network on the held-out test split
best_model_nn.eval()
with torch.no_grad():
    test_pred_nn = best_model_nn(X_test_tensor)

# RMSE is computed on NumPy copies of the target and prediction tensors
test_mse_nn = mean_squared_error(y_test_tensor.numpy(), test_pred_nn.numpy())
rmse_test_nn = np.sqrt(test_mse_nn)

In [None]:
# Per-epoch training MSE of the selected network
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_losses_list[best_nn_idx])
ax.set_xlabel('Epoch')
ax.set_ylabel('MSE Loss')
ax.set_title(f'Training Loss vs Epochs (Hidden Dim: {best_hidden_dim})')
plt.show()

In [None]:
# Actual vs. predicted strength on the TEST split for the selected network
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test.values, test_pred_nn.numpy(), alpha=0.7)

# Dashed y = x line marks perfect predictions
diag_lo, diag_hi = y_test.min(), y_test.max()
ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], 'k--')

ax.set_xlabel("Actual Strength (MPa)")
ax.set_ylabel("Predicted Strength (MPa)")
ax.set_title(f"Neural Network (Hidden Dim: {best_hidden_dim}) - Test Data")
plt.show()

In [None]:
# Summary of the selected neural network (one line per item)
print(
    "\nNeural Network with Top 2 Features:",
    f"Best Hidden Dimension: {best_hidden_dim}",
    f"Test RMSE: {rmse_test_nn:.3f}",
    sep="\n",
)