In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

# Read the raw cement dataset from disk.
df = pd.read_csv("cement_data_large.csv")

# Discard incomplete rows, but only when at least one cell is actually missing.
if df.isna().any().any():
    df = df.dropna()

# The target is the "Compressive Strength" column; every other column is a feature.
y = df["Compressive Strength"]
X = df.drop(columns="Compressive Strength")

# Summary statistics for the (cleaned) frame.
print("Dataset Description:", df.describe(), sep="\n")

# Pairwise correlations between all columns, target included.
print("\nCorrelation Matrix:", df.corr(), sep="\n")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling parameters from the training split only, then apply
# that same fitted scaler to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the same transformation can be reloaded later.
with open("scaler.pkl", "wb") as f:
    f.write(pickle.dumps(scaler))

Dataset Description:
           Cement        Slag     Fly Ash       Water  Superplasticizer  \
count  500.000000  500.000000  500.000000  500.000000        500.000000   
mean   349.280856  144.585418  103.511624  184.541944         14.995308   
std    149.344204   85.648037   59.438587   37.311281          8.574275   
min    102.530792    1.389607    0.987996  120.418374          0.046953   
25%    220.639845   68.729774   48.245610  151.339655          8.043366   
50%    356.581874  141.546470  107.947667  186.155874         14.880374   
75%    478.062441  217.901046  155.468752  215.858913         22.298801   
max    596.482398  299.915302  199.882745  249.785176         29.863125   

       Coarse Aggregate  Fine Aggregate         Age  Compressive Strength  
count        500.000000      500.000000  500.000000            500.000000  
mean        1001.987137      649.352033  102.106000             87.295841  
std          118.289318       84.896280  130.409678             21.115730  