In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer

In [None]:
# Load the concrete dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer='concrete_data.csv')

I chose this dataset because some of its features are not normally distributed.

In [None]:
# Peek at five randomly chosen rows. The fixed seed keeps the displayed
# sample identical across "Restart Kernel -> Run All" executions.
df.sample(5, random_state=42)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Descriptive summary of the frame's columns, using pandas' default describe().
df.describe()

In [None]:
# Separate predictors from the regression target:
# 'Strength' is the target, every other column is kept as a feature.
X = df.drop(columns=['Strength'])
y = df['Strength']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Apply linear regression without any transformation

In [None]:
# Baseline: ordinary least squares on the untransformed features.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Held-out error and goodness of fit for the baseline model.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('MSE:', mse)
print('R2:', r2)

In [None]:
# Distribution and normal Q-Q plot of every untransformed training feature.
# sns.distplot is deprecated (seaborn >= 0.11); histplot(kde=True, stat="density")
# is the documented replacement for its histogram + KDE output.
for col in X_train.columns:
    fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(14, 4))

    # Left panel: density histogram with a KDE overlay.
    sns.histplot(X_train[col], kde=True, stat="density", ax=ax_hist)
    ax_hist.set_title(col)

    # Right panel: sample quantiles against theoretical normal quantiles.
    # probplot sets its own title, so the column name is applied afterwards.
    stats.probplot(X_train[col], dist="norm", plot=ax_qq)
    ax_qq.set_title(col)

    plt.show()

# Applying Box-Cox Transform

In [None]:
# Box-Cox power transform, fitted on the training split only.
# Box-Cox is defined only for strictly positive inputs (x > 0), so every value
# is shifted by 1e-6 to get rid of zeros; the same shift is applied to the
# test split before it is transformed with the already-fitted transformer.
pt = PowerTransformer(method='box-cox')

X_train_transformed = pt.fit_transform(X_train + 1e-6)
X_test_transformed = pt.transform(X_test + 1e-6)

# Fitted lambda for each feature.
pd.DataFrame({'cols': X_train.columns, 'box_cox_lambdas': pt.lambdas_})

# Applying linear regression on the transformed data

In [None]:
# Same linear model as the baseline, now trained on the Box-Cox features.
lr = LinearRegression().fit(X_train_transformed, y_train)
y_pred = lr.predict(X_test_transformed)

# Held-out error and goodness of fit after the transform.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print('MSE:', mse)
print('R2:', r2)

# Using cross-validation score

In [None]:
# Cross-validated R2 for Box-Cox + linear regression.
# The transformer lives inside the pipeline so it is re-fitted on each training
# fold. Fitting it on all of X before cross-validating would let the validation
# folds influence the lambdas and the standardisation (data leakage), and would
# also overwrite the `pt` transformer fitted on the training split.
box_cox_pipeline = make_pipeline(
    PowerTransformer(method='box-cox'),
    LinearRegression(),
)

# The 1e-6 shift keeps every value strictly positive, as Box-Cox requires.
np.mean(cross_val_score(box_cox_pipeline, X + 0.000001, y, scoring='r2'))

In [None]:
# Before/after comparison for the Box-Cox transform.
# The transformer returns a bare array, so it is wrapped in a DataFrame to
# allow column lookup by feature name.
X_train_transformed = pd.DataFrame(X_train_transformed, columns=X_train.columns)

# sns.distplot is deprecated (seaborn >= 0.11); histplot(kde=True, stat="density")
# is the documented replacement for its histogram + KDE output.
for col in X_train_transformed.columns:
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(14, 4))

    # Left panel: the feature as it appears in the raw training split.
    sns.histplot(X_train[col], kde=True, stat="density", ax=ax_before)
    ax_before.set_title(f"{col} (original)")

    # Right panel: the same feature after Box-Cox. The titles differ so the
    # two panels can be told apart when the figure is viewed on its own.
    sns.histplot(X_train_transformed[col], kde=True, stat="density", ax=ax_after)
    ax_after.set_title(f"{col} (Box-Cox)")

    plt.show()

# Apply Yeo-Johnson transform

In [None]:


# Yeo-Johnson is PowerTransformer's default method; it is spelled out here for
# clarity. No positivity shift is applied to the inputs in this cell.
pt1 = PowerTransformer(method='yeo-johnson')

# Fit on the training split only, then reuse the fitted transformer on test.
X_train_transformed2 = pt1.fit_transform(X_train)
X_test_transformed2 = pt1.transform(X_test)

lr = LinearRegression().fit(X_train_transformed2, y_train)
y_pred3 = lr.predict(X_test_transformed2)

# Held-out R2 after the Yeo-Johnson transform.
print(r2_score(y_test, y_pred3))

# Fitted lambda for each feature.
pd.DataFrame({'cols': X_train.columns, 'Yeo_Johnson_lambdas': pt1.lambdas_})

In [None]:
# Cross-validated R2 for Yeo-Johnson + linear regression.
# The transformer lives inside the pipeline so it is re-fitted on each training
# fold. Fitting it on all of X before cross-validating would let the validation
# folds influence the lambdas and the standardisation (data leakage), and would
# also overwrite the `pt` name with a Yeo-Johnson transformer.
yeo_johnson_pipeline = make_pipeline(
    PowerTransformer(),
    LinearRegression(),
)

np.mean(cross_val_score(yeo_johnson_pipeline, X, y, scoring='r2'))

In [None]:
# Wrap the Yeo-Johnson output in a DataFrame so columns can be looked up by
# feature name in the plots that follow.
X_train_transformed2 = pd.DataFrame(
    data=X_train_transformed2,
    columns=X_train.columns,
)

In [None]:
# Before/after comparison for the Yeo-Johnson transform.
# sns.distplot is deprecated (seaborn >= 0.11); histplot(kde=True, stat="density")
# is the documented replacement for its histogram + KDE output.
for col in X_train_transformed2.columns:
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(14, 4))

    # Left panel: the feature as it appears in the raw training split.
    sns.histplot(X_train[col], kde=True, stat="density", ax=ax_before)
    ax_before.set_title(f"{col} (original)")

    # Right panel: the same feature after Yeo-Johnson. The titles differ so the
    # two panels can be told apart when the figure is viewed on its own.
    sns.histplot(X_train_transformed2[col], kde=True, stat="density", ax=ax_after)
    ax_after.set_title(f"{col} (Yeo-Johnson)")

    plt.show()

# Side-by-side comparison of lambdas

In [None]:

# Side-by-side lambdas for both transforms, each fitted on the training split.
# Box-Cox is refitted here instead of reading the shared `pt` name, so this
# column always holds Box-Cox lambdas fitted on X_train no matter which cell
# last assigned `pt`. The 1e-6 shift keeps the inputs strictly positive, as
# Box-Cox requires.
box_cox_pt = PowerTransformer(method='box-cox').fit(X_train + 0.000001)

pd.DataFrame({
    'cols': X_train.columns,
    'box_cox_lambdas': box_cox_pt.lambdas_,
    'Yeo_Johnson_lambdas': pt1.lambdas_,
})