In [194]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, FunctionTransformer
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline


In [248]:
data=pd.read_csv("concrete_data.csv")
data.sample(5)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
950,326.5,0.0,137.9,199.0,10.8,801.1,792.5,28,38.63
946,149.5,236.0,0.0,175.8,12.6,846.8,892.7,28,32.96
514,424.0,22.0,132.0,168.0,8.9,822.0,750.0,56,74.36
1024,166.0,259.7,0.0,183.2,12.7,858.8,826.8,28,37.92
776,339.0,0.0,0.0,185.0,0.0,1069.0,754.0,7,21.16


In [197]:
data.replace(0.0,np.nan,inplace=True)

In [200]:
data.isnull().mean()*100

Cement                 0.000000
Blast Furnace Slag    45.728155
Fly Ash               54.951456
Water                  0.000000
Superplasticizer      36.796117
Coarse Aggregate       0.000000
Fine Aggregate         0.000000
Age                    0.000000
Strength               0.000000
dtype: float64

In [202]:
data.shape

(1030, 9)

In [204]:
data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,559.0,464.0,1030.0,651.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,136.158676,120.288793,181.567282,9.816897,972.918932,773.580485,45.662136,35.817961
std,104.506364,72.351823,33.67547,21.354219,4.580328,77.753954,80.17598,63.169912,16.705742
min,102.0,11.0,24.5,121.8,1.7,801.0,594.0,1.0,2.33
25%,192.375,95.0,97.85,164.9,6.95,932.0,730.95,7.0,23.71
50%,272.9,135.7,121.4,185.0,9.4,968.0,779.5,28.0,34.445
75%,350.0,189.0,141.0,192.0,11.6,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [205]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cement              1030 non-null   float64
 1   Blast Furnace Slag  559 non-null    float64
 2   Fly Ash             464 non-null    float64
 3   Water               1030 non-null   float64
 4   Superplasticizer    651 non-null    float64
 5   Coarse Aggregate    1030 non-null   float64
 6   Fine Aggregate      1030 non-null   float64
 7   Age                 1030 non-null   int64  
 8   Strength            1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.6 KB


In [208]:
data.corr()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
Cement,1.0,-0.344538,-0.37793,-0.081587,0.363703,-0.109349,-0.222718,0.081946,0.497832
Blast Furnace Slag,-0.344538,1.0,0.018471,0.232275,0.069789,0.012455,-0.227038,-0.043894,-0.100259
Fly Ash,-0.37793,0.018471,1.0,-0.058028,0.053901,-0.017262,-0.173397,-0.008452,-0.231618
Water,-0.081587,0.232275,-0.058028,1.0,-0.538191,-0.182294,-0.450661,0.277618,-0.289633
Superplasticizer,0.363703,0.069789,0.053901,-0.538191,1.0,-0.202265,0.20765,0.009493,0.284473
Coarse Aggregate,-0.109349,0.012455,-0.017262,-0.182294,-0.202265,1.0,-0.178481,-0.003016,-0.164935
Fine Aggregate,-0.222718,-0.227038,-0.173397,-0.450661,0.20765,-0.178481,1.0,-0.156095,-0.167241
Age,0.081946,-0.043894,-0.008452,0.277618,0.009493,-0.003016,-0.156095,1.0,0.328873
Strength,0.497832,-0.100259,-0.231618,-0.289633,0.284473,-0.164935,-0.167241,0.328873,1.0


In [210]:
#handling outlier
colm=["Blast Furnace Slag","Fly Ash","Water","Superplasticizer","Fine Aggregate","Age"]
for x in colm:
    q3=data[x].quantile(0.75)
    q1=data[x].quantile(0.25)
    IQR=q3-q1
    upper_limit=q3+(1.5*IQR)
    lower_limit=q1-(1.5*IQR)
    data[x]=np.where(data[x]>upper_limit,upper_limit,np.where(data[x]<lower_limit,lower_limit,data[x]))
    

In [212]:
data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,559.0,464.0,1030.0,651.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,136.010197,120.567619,181.543252,9.557488,972.918932,773.439587,38.070388,35.817961
std,104.506364,71.926875,32.906494,21.225052,3.728667,77.753954,79.815303,35.782271,16.705742
min,102.0,11.0,33.125,124.25,1.7,801.0,594.0,1.0,2.33
25%,192.375,95.0,97.85,164.9,6.95,932.0,730.95,7.0,23.71
50%,272.9,135.7,121.4,185.0,9.4,968.0,779.5,28.0,34.445
75%,350.0,189.0,141.0,192.0,11.6,1029.4,824.0,56.0,46.135
max,540.0,330.0,200.1,232.65,18.575,1145.0,963.575,129.5,82.6


In [214]:
X=data.iloc[:,:-1]
Y=data["Strength"]

In [216]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3)

In [218]:
x_train.columns

Index(['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer',
       'Coarse Aggregate', 'Fine Aggregate', 'Age'],
      dtype='object')

In [224]:
def safe_log1p(X):
    X = X.astype(float)  # Ensure X is float
    return np.log1p(X, where=(X >= 0))

# Convert x_train and x_test to NumPy arrays if they are DataFrames
if isinstance(x_train, pd.DataFrame):
    x_train = x_train.values
if isinstance(x_test, pd.DataFrame):
    x_test = x_test.values

In [226]:
impu=ColumnTransformer([("Imputer",SimpleImputer(strategy="median"),[1, 2, 4])],remainder="passthrough")

In [228]:
functrans=ColumnTransformer([("function transf",FunctionTransformer(np.log1p),[5,6])],remainder="passthrough")

In [230]:
powtrans=ColumnTransformer([("power transfromer",PowerTransformer(method="yeo-johnson"),[0,2,3,7])],remainder="passthrough")

In [232]:
model_fit=LinearRegression()

In [234]:
pipe=Pipeline([("Imputation",impu),
               ("Function Transformer",functrans),
               ("Power Transformer", powtrans),
               ("Model training", model_fit)])

In [236]:
print(pipe)

Pipeline(steps=[('Imputation',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('Imputer',
                                                  SimpleImputer(strategy='median'),
                                                  [1, 2, 4])])),
                ('Function Transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('function transf',
                                                  FunctionTransformer(func=<ufunc 'log1p'>),
                                                  [5, 6])])),
                ('Power Transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('power transfromer',
                                                  PowerTransformer(),
                                                  [0, 2, 3, 7])])),
                ('Model training', LinearRegression())])


In [238]:
train_model=pipe.fit(x_train,y_train)

In [240]:
# After preprocessing
pred1=pipe.predict(x_test)
acu=r2_score(y_test,pred1)
acu

0.7727418583474868

In [242]:
import pickle
with open("model31.pkl", "wb") as file:
    pickle.dump(pipe,file)