In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# 1) Load the dataset. The path below is machine-specific; point it at your
#    local copy of Concrete_Data.csv.
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\AI lab\\FinalPracticelab\\Concrete_Data.csv")

# Name of the regression target column. The trailing space is kept exactly as
# written here; presumably it mirrors the CSV header -- TODO confirm.
target_col = "Concrete_compressive_strength "

# Resolve the target column. An exact match is preferred; otherwise accept a
# single header that differs only by leading/trailing whitespace, and rebind
# target_col to the name actually present in the DataFrame so later lookups
# succeed. Anything else is an error that lists the available columns.
if target_col not in df.columns:
    whitespace_matches = [
        name for name in df.columns if str(name).strip() == target_col.strip()
    ]
    if len(whitespace_matches) != 1:
        raise KeyError(
            f"Column '{target_col}' not found. Please verify your CSV column names. "
            f"Available columns: {list(df.columns)}"
        )
    target_col = whitespace_matches[0]

# 2) Partition the rows into train (60%), validation (20%) and test (20%).
#    First hold out 20% as the test set; 25% of the remaining 80% is another
#    20% of the full data, which becomes the validation set.
df_temp, df_test = train_test_split(df, random_state=42, test_size=0.2)
df_train, df_val = train_test_split(df_temp, random_state=42, test_size=0.25)

# Give every split a fresh 0..n-1 index, discarding the shuffled row labels.
for split_frame in (df_train, df_val, df_test):
    split_frame.reset_index(drop=True, inplace=True)

# 3) Impute missing values with medians learned from the TRAIN split only, so
#    no statistics from validation/test leak into training.
#    - Medians are computed once and only for numeric columns; a median is
#      undefined for text columns and asking for one would raise.
#    - Each frame is rebound to the result of fillna() instead of being
#      assigned into column by column, which avoids pandas'
#      SettingWithCopyWarning on frames produced by train_test_split.
#    NOTE: the target column is imputed too if it contains gaps.
train_medians = df_train.median(numeric_only=True)
df_train = df_train.fillna(train_medians)
df_val = df_val.fillna(train_medians)
df_test = df_test.fillna(train_medians)

# 4) Rank candidate features by the absolute value of their correlation with
#    the target, using the TRAIN split only.
target_values = df_train[target_col]

# Only numeric columns can be correlated; the target itself is excluded.
candidate_cols = [
    col
    for col in df_train.select_dtypes(include="number").columns
    if col != target_col
]
corrs = [(col, df_train[col].corr(target_values)) for col in candidate_cols]

# A constant column yields a NaN correlation. NaN does not order reliably in a
# sort and is meaningless as a ranking score, so drop those entries.
corrs = [(col, r) for col, r in corrs if pd.notna(r)]

# Sort by absolute correlation in descending order (stable for ties).
corrs_sorted = sorted(corrs, key=lambda pair: abs(pair[1]), reverse=True)

if len(corrs_sorted) < 2:
    raise ValueError(
        "Need at least two numeric features with a defined correlation to "
        f"'{target_col}', but found {len(corrs_sorted)}."
    )

# Extract the two strongest features together with their correlations.
(feature1, corr1), (feature2, corr2) = corrs_sorted[:2]

# 5) Fit a linear regression on the two selected features (TRAIN split only).
#    The same two columns, in the same order, are sliced from every split.
top_features = [feature1, feature2]
X_train, y_train = df_train[top_features], df_train[target_col]
X_val, y_val = df_val[top_features], df_val[target_col]
X_test, y_test = df_test[top_features], df_test[target_col]

model = LinearRegression()
model.fit(X_train, y_train)

# 6) Predict on each split, then score each with RMSE (square root of MSE).
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

# 7) Plot the fitted regression plane over the TRAINING points in 3D.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_train[feature1], X_train[feature2], y_train, alpha=0.7)

# Build a 20x20 grid spanning the observed range of both features.
x_axis = np.linspace(X_train[feature1].min(), X_train[feature1].max(), 20)
y_axis = np.linspace(X_train[feature2].min(), X_train[feature2].max(), 20)
x_surf, y_surf = np.meshgrid(x_axis, y_axis)

# The model was fitted on a DataFrame, so predict on a DataFrame carrying the
# same column names and order. Passing a bare ndarray makes scikit-learn warn
# that X has no valid feature names.
grid_points = pd.DataFrame(
    np.column_stack((x_surf.ravel(), y_surf.ravel())),
    columns=X_train.columns,
)
z_surf = model.predict(grid_points).reshape(x_surf.shape)

ax.plot_surface(x_surf, y_surf, z_surf, alpha=0.3)
ax.set_xlabel(feature1)
ax.set_ylabel(feature2)
ax.set_zlabel(target_col)
ax.set_title("Best-Fit Plane (Training Data)")
plt.show()

# 8) Actual vs. predicted target values on the TEST split.
#    Drawn on its own figure/axes; separate names are used so the 3D plot's
#    `fig` and `ax` are left untouched.
fig_test, ax_test = plt.subplots()
ax_test.scatter(y_test, y_test_pred, alpha=0.7)
ax_test.set_xlabel("Actual Strength (MPa)")
ax_test.set_ylabel("Predicted Strength (MPa)")
ax_test.set_title("Actual vs Predicted (Test Data)")
plt.show()

# Report the RMSE of every split followed by the two selected features.
# The lines are assembled first and emitted with a single print call; the
# blank line before the feature summary comes from its leading "\n".
summary_lines = [
    f"Train RMSE: {rmse_train:.3f}",
    f"Validation RMSE: {rmse_val:.3f}",
    f"Test RMSE: {rmse_test:.3f}",
    f"\nTop 2 features used: {feature1}, {feature2}",
]
print("\n".join(summary_lines))
