In [None]:
import pandas as pd
import numpy as np

# Load the concrete data set; the CSV's own header row is skipped and replaced
# by the short snake_case column names given here.
data = pd.read_csv(
    '../Training_Batch_Files/Concrete_Data.csv',
    names=[
        'cement',
        'blast_furnace_slag',
        'fly_ash',
        'water',
        'super_plasticizer',
        'coarse_aggregate',
        'fine_aggregate',
        'age',
        'compressive_strength',
    ],
    skiprows=[0],
)
data

In [None]:
# A notebook cell only displays its LAST expression, so the NaN counts used to be
# computed and silently thrown away. Show both sanity checks side by side instead.
pd.DataFrame({
    'missing': data.isna().sum(),
    'infinite': data.isin([np.inf, -np.inf]).sum(),
})

In [None]:
import dtale
# Hand the loaded frame to D-Tale for exploratory analysis and keep the returned handle.
sheet = dtale.show(data)
# Bare last expression so the notebook renders the object returned by dtale.show().
sheet

#### From the exploratory data analysis it can be found that
- cement, water, coarse aggregate and fine aggregate have approximately normal distributions
- The rest of the features are left skewed and can be normalised by log transformation
- Some of the features show a somewhat linear relation with the target variable, hence linear regression might be a good choice
- No high correlations are shown between the independent features, hence all features can be considered for model building
- Outliers are relatively few and hence need not be a concern.
- Scaling is a must because there are many features with huge differences in their magnitudes


In [None]:
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and newer
# releases dropped the old alias, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

y = data.compressive_strength

# Log transformation of every feature. np.log1p works element-wise on the whole
# frame, so there is no need for a slow row-by-row DataFrame.apply.
# (A per-column Box-Cox transform, stats.boxcox(col + 0.001), was sketched here as
# a commented-out alternative.)
x_tr = np.log1p(data.drop('compressive_strength', axis=1))

# One figure per feature: original distribution (left) vs. log-transformed (right).
for col in x_tr.columns:
    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    sns.distplot(data[col], hist=False, kde=True,
                 kde_kws={'shade': True, 'linewidth': 2},
                 label="original data", color="green", ax=ax[0])
    sns.distplot(x_tr[col], hist=False, kde=True,
                 kde_kws={'shade': True, 'linewidth': 2},
                 label="log normalised", color="blue", ax=ax[1])
    ax[0].legend()
    ax[1].legend()
    # plt.title() only labelled the right-hand axes; title the whole figure instead.
    fig.suptitle(col)
    plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the log-transformed features and wrap the result back into a
# DataFrame that carries the original column names.
# NOTE(review): the scaler is fitted on the full data set before any train/test
# split, so test-set statistics leak into the features - consider fitting it on
# the training split only.
scaler = StandardScaler()
x = pd.DataFrame(scaler.fit_transform(x_tr))
x.columns = x_tr.columns
y

##### Feature transformation and preprocessing completed. Let's try the following algorithms
1. Linear regression (degree 1), ridge, lasso, 
2. Linear regression (degree 2), ridge, lasso, 
3. Linear regression (degree 3), ridge, lasso,
4. Random Forest Regressor
5. Xgboost
6. Adaboost
7. SVM 


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

def fitAndPrintScore(model, model_name, x_train, y_train, x_test, y_test, deg=None):
    """Fit ``model`` on the training data and print its train and test scores.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``.
    model_name : str
        Label used in the printed line.
    x_train, y_train : training features and targets.
    x_test, y_test : held-out features and targets.
    deg : optional
        Polynomial degree to mention in the printed line; omitted when ``None``.

    Returns
    -------
    None
        The scores are only printed. The model is fitted in place.
    """
    model.fit(x_train, y_train)
    model_train_score = model.score(x_train, y_train)
    model_test_score = model.score(x_test, y_test)
    # Identity comparison is the correct way to test for "no degree given".
    label = model_name if deg is None else f"{model_name} (degree={deg})"
    print(f"for {label} train score :{model_train_score}, test score:{model_test_score}")

# Compare the four linear models on interaction-only polynomial expansions of
# degree 1, 2 and 3, using the same 80/20 split for every degree.
# NOTE(review): interaction_only=True leaves out pure power terms (x**2, x**3);
# confirm that this is the intended meaning of "degree" here.
poly_degs = [1, 2, 3]
models = {
    'Linear Regression': LinearRegression(),
    'Ridge regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Elastic net Regression': ElasticNet(),
}
for deg in poly_degs:
    poly = PolynomialFeatures(degree=deg, include_bias=False, interaction_only=True)
    x_p = poly.fit_transform(x)
    x_p_train, x_p_test, y_p_train, y_p_test = train_test_split(
        x_p,
        y,
        test_size=0.2,
        random_state=100,
    )
    for model_name, model in models.items():
        fitAndPrintScore(
            model,
            model_name,
            x_p_train,
            y_p_train,
            x_p_test,
            y_p_test,
            deg=deg,
        )
    # Blank separator between the result groups of consecutive degrees.
    print('\n')



##### considering the overfitting and test score polynomial ridge regression with degree 2 will be a good choice

In [None]:
import optuna
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score


# Degree-2 interaction features used for the Ridge hyper-parameter search.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
x_p = poly.fit_transform(x)
y_p = y

# The split is deterministic (random_state=100), so it is done once here instead
# of being recomputed inside every one of the 200 trials.
x_p_train, x_p_test, y_p_train, y_p_test = train_test_split(
    x_p, y_p, test_size=0.2, random_state=100
)


def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R^2 of Ridge on the training split.

    Hyper-parameters are scored with cross-validation on the training data only;
    selecting them on the test score would leak the test set into model selection
    and make the final test score optimistic.
    """
    params = {
        'alpha': trial.suggest_float('alpha', 1e-10, 10, log=True),
        'solver': trial.suggest_categorical(
            'solver', ['sparse_cg', 'cholesky', 'svd', 'lsqr', 'sag', 'saga']
        ),
    }
    # random_state only matters for the stochastic 'sag' / 'saga' solvers.
    ridge = Ridge(**params, random_state=100)
    return cross_val_score(ridge, x_p_train, y_p_train, cv=5).mean()


# Seed the sampler so the search itself is reproducible.
study = optuna.create_study(
    direction='maximize', sampler=optuna.samplers.TPESampler(seed=100)
)
study.optimize(objective, n_trials=200)


In [None]:
# Hyper-parameters of the best trial in the study that was run last.
params = study.best_params
params
# A previous run reported: {'alpha': 9.99081794015465, 'solver': 'svd'}


In [None]:
# Refit Ridge on the degree-2 features with hard-coded settings and report scores.
params = dict(alpha=10, solver='svd')
x_p_train, x_p_test, y_p_train, y_p_test = train_test_split(
    x_p,
    y_p,
    test_size=0.2,
    random_state=100,
)

ridge_reg = Ridge(**params)
fitAndPrintScore(
    ridge_reg,
    "Ridge regression",
    x_p_train,
    y_p_train,
    x_p_test,
    y_p_test,
    deg=2,
)

In [None]:
## random forest
from sklearn.ensemble import RandomForestRegressor
# 80/20 split of the scaled (non-polynomial) features; the cells below reuse it.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

# Seed the forest so the baseline scores are reproducible between runs.
rf = RandomForestRegressor(random_state=100)
fitAndPrintScore(rf, 'Random Forest regression', x_train, y_train, x_test, y_test)

# List the tunable hyper-parameter names through the public estimator API
# rather than the private `_get_param_names()` helper.
sorted(rf.get_params(deep=False))


In [None]:
# from sklearn.tree import DecisionTreeRegressor
# dt = DecisionTreeRegressor()
# path = dt.cost_complexity_pruning_path(x_train, y_train)
# ccp_alphas = path.ccp_alphas
# ccp_alphas

In [None]:
# parameter tuning of random forest

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R^2 of a random forest.

    Scoring happens on the training split only, so the held-out test set stays
    untouched for the final evaluation instead of being used for model selection.
    """
    params = {
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        # 1.0 (all features) is what 'auto' meant for regressors; the 'auto'
        # alias was deprecated in scikit-learn 1.1 and removed in 1.3.
        'max_features': trial.suggest_categorical('max_features', [1.0, 'sqrt', 'log2']),
        'max_depth': trial.suggest_int('max_depth', 2, 32, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 10, 200),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 16),
    }
    # Fixed seed: otherwise two trials with identical params can score differently.
    rf_reg = RandomForestRegressor(**params, random_state=100)
    return cross_val_score(rf_reg, x_train, y_train, cv=5).mean()


# Seed the sampler so the search itself is reproducible.
study = optuna.create_study(
    direction='maximize', sampler=optuna.samplers.TPESampler(seed=100)
)
study.optimize(objective, n_trials=200)

In [None]:
# Hyper-parameters of the best trial in the study that was run last.
params = study.best_params
params

In [None]:
# Hard-coded random-forest settings (presumably copied from an earlier tuning
# run - verify) used to refit the model and print its scores.
best_test_params = dict(
    bootstrap=False,
    max_features='sqrt',
    max_depth=24,
    n_estimators=111,
    min_samples_split=3,
)

rf = RandomForestRegressor(**best_test_params)
fitAndPrintScore(
    model=rf,
    model_name='RandomForestRegressor',
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

In [None]:
# XGBOOST 
from xgboost import XGBRegressor
xgb = XGBRegressor()
# Fit and score the XGBoost baseline itself. This call used to pass `rf` with the
# 'RandomForestRegressor' label (copy-paste slip), so `xgb` was never fitted.
fitAndPrintScore(xgb, 'xgboost regression', x_train, y_train, x_test, y_test)
xgb.get_xgb_params()

In [None]:
# tuning of xgbregressor
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R^2 of an XGBoost regressor.

    Scoring happens on the training split only, so the held-out test set stays
    untouched for the final evaluation instead of being used for model selection.
    """
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'eta': trial.suggest_float('eta', 0.005, 0.5, log=True),
        'gamma': trial.suggest_float('gamma', 0.01, 20, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 16, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'lambda': trial.suggest_float('lambda', 0.01, 10, log=True),
    }
    xgb_reg = XGBRegressor(**params)
    return cross_val_score(xgb_reg, x_train, y_train, cv=5).mean()


# Seed the sampler so the search itself is reproducible.
study = optuna.create_study(
    direction='maximize', sampler=optuna.samplers.TPESampler(seed=100)
)
study.optimize(objective, n_trials=100)

In [None]:
# Hyper-parameters of the best trial in the study that was run last.
params = study.best_params
params

In [None]:
# Hard-coded XGBoost settings (presumably copied from an earlier tuning run -
# verify) used to refit the model and print its scores.
params = {
    'booster': 'dart',
    'eta': 0.4632229410828487,
    'gamma': 7.077192780534438,
    'max_depth': 3,
    'subsample': 0.8092309111849526,
    'lambda': 0.015342068089733317,
}

xgb_r = XGBRegressor(**params)

fitAndPrintScore(
    model=xgb_r,
    model_name='xgboost regression',
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

In [None]:
# adaboost
from sklearn.ensemble import AdaBoostRegressor
# Baseline AdaBoost regressor with library defaults, scored on the shared split.
ab_reg = AdaBoostRegressor()
fitAndPrintScore(
    model=ab_reg,
    model_name='adaboost regression',
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

In [None]:
# parameter tuning adaboost
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R^2 of an AdaBoost regressor.

    Scoring happens on the training split only, so the held-out test set stays
    untouched for the final evaluation instead of being used for model selection.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 500, step=5),
        'loss': trial.suggest_categorical('loss', ['linear', 'square', 'exponential']),
        # The lower bound was written `10-5` (== 5), which limited the search to
        # [5, 10]; the intended log-scale range starts at 1e-5.
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 10, log=True),
    }
    # Fixed seed: otherwise two trials with identical params can score differently.
    adb = AdaBoostRegressor(**params, random_state=100)
    return cross_val_score(adb, x_train, y_train, cv=5).mean()


# Seed the sampler so the search itself is reproducible.
study = optuna.create_study(
    direction='maximize', sampler=optuna.samplers.TPESampler(seed=100)
)
study.optimize(objective, n_trials=200)

In [None]:
# Refit AdaBoost with the best trial's hyper-parameters and print its scores.
params = study.best_params
adaboost = AdaBoostRegressor(**params)
fitAndPrintScore(
    model=adaboost,
    model_name='adaboost regression',
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

In [None]:
# SVR
from sklearn.svm import SVR
# Baseline support-vector regressor with library defaults.
svr = SVR()
fitAndPrintScore(svr, "Support vector machine reggression", x_train, y_train, x_test, y_test)
# List the tunable hyper-parameter names through the public estimator API
# rather than the private `_get_param_names()` helper.
sorted(svr.get_params(deep=False))

In [None]:
#  tuning svr
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R^2 of a support-vector regressor.

    Scoring happens on the training split only, so the held-out test set stays
    untouched for the final evaluation instead of being used for model selection.
    """
    params = {
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
        'gamma': trial.suggest_float('gamma', 0.001, 1),
        'epsilon': trial.suggest_float('epsilon', 0.001, 1),
    }
    svr_model = SVR(**params)
    return cross_val_score(svr_model, x_train, y_train, cv=5).mean()


# Seed the sampler so the search itself is reproducible.
study = optuna.create_study(
    direction='maximize', sampler=optuna.samplers.TPESampler(seed=100)
)
study.optimize(objective, n_trials=200)

In [None]:
# Refit the SVR with the best trial's hyper-parameters and print its scores.
params = study.best_params
svr = SVR(**params)
fitAndPrintScore(
    model=svr,
    model_name="Support vector machine reggression",
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

* From the above experiments it can be decided to go with ridge regression with degree 2, random forest and xgboost for the model building, with parameter tuning, in the project. For clustering, k-means clustering can be adopted before choosing each algorithm according to its performance on each cluster.  

In [None]:
import imp
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor


def fitAndPrintScore(model, model_name, x_train, y_train, x_test, y_test, deg=None):
    """Fit ``model`` on the training data and print its train and test scores.

    This redefinition replaces the helper defined earlier in the notebook, so it
    keeps the optional ``deg`` argument; without it, re-running an earlier cell
    that passes ``deg=...`` would raise ``TypeError``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``.
    model_name : str
        Label used in the printed line.
    x_train, y_train : training features and targets.
    x_test, y_test : held-out features and targets.
    deg : optional
        Polynomial degree to mention in the printed line; omitted when ``None``.

    Returns
    -------
    None
        The scores are only printed. The model is fitted in place.
    """
    model.fit(x_train, y_train)
    model_train_score = model.score(x_train, y_train)
    model_test_score = model.score(x_test, y_test)
    label = model_name if deg is None else f"{model_name} (degree={deg})"
    print(
        f"for {label} train score :{model_train_score}, test score:{model_test_score}"
    )


# Standalone re-run of the chosen XGBoost model: load, transform, scale, split, fit.
# The bare `data.head()`, `x_tr.head(10)`, `y` and `x.head(5)` expressions were
# removed: a cell only displays its last expression, so they never showed anything.
data = pd.read_csv("../Training_Batch_Files/Concrete_Data.csv")

y = data.Concrete_compressive_strength
# Log transformation. np.log1p works element-wise on the whole frame, so there is
# no need for a slow row-by-row DataFrame.apply.
x_tr = np.log1p(data.drop("Concrete_compressive_strength", axis=1))

# StandardScaler is already imported at the top of this cell.
# NOTE(review): the scaler is fitted before the train/test split, so test-set
# statistics leak into the features - consider fitting on the training split only.
scaler = StandardScaler()
x = pd.DataFrame(scaler.fit_transform(x_tr), columns=x_tr.columns)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=100
)

print(x_train.shape)

# Hard-coded XGBoost settings (presumably taken from an earlier tuning run - verify).
params = {
    "booster": "dart",
    "eta": 0.4632229410828487,
    "gamma": 7.077192780534438,
    "max_depth": 3,
    "subsample": 0.8092309111849526,
    "lambda": 0.015342068089733317,
}

xgb_r = XGBRegressor(**params)

xgb_r.fit(x_train, y_train)
model_train_score = xgb_r.score(x_train, y_train)
model_test_score = xgb_r.score(x_test, y_test)
print(f"for xgb train score :{model_train_score}, test score:{model_test_score}")




# fitAndPrintScore(xgb_r, "xgboost regression", x_train, y_train, x_test, y_test)


In [None]:
# Display the second row (position 1) of the training features.
x_train.iloc[1]

In [None]:
# Predict for one training row and print its true target next to the prediction.
# NOTE(review): the hand-typed, rounded feature vector that used to be here looks
# like a copy of x_train.iloc[1] (the row shown in the previous cell) - confirm.
# Slicing the row from the frame keeps full precision and the column names.
sample = x_train.iloc[[1]]
print(xgb_r.predict(sample))
# y_train keeps the shuffled original index after train_test_split, so `y_train[0]`
# was a LABEL lookup: it raises KeyError when row 0 is in the test split and
# otherwise returns the target of an unrelated row. Use the same position instead.
print(y_train.iloc[1])

In [None]:
# Print the first two test-set predictions followed by their true targets.
y_pred = xgb_r.predict(x_test)[:2]
y_actual = y_test[:2]
print(y_pred, y_actual, sep="\n")