In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import xgboost as xg
import pickle
import matplotlib.pyplot as plt

In [2]:
def split_scalar(indep_X,dep_Y):
        X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size = 0.25, random_state = 0)
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)    
        return X_train, X_test, y_train, y_test

In [3]:
def r2_prediction(regressor,X_test,y_test):
     y_pred = regressor.predict(X_test)
     from sklearn.metrics import r2_score
     r2=r2_score(y_test,y_pred)
     return r2

In [4]:
def Linear(X_train,y_train,X_test):       
        # Fitting K-NN to the Training set
        from sklearn.linear_model import LinearRegression
        regressor = LinearRegression()
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [5]:
def svm_linear(X_train,y_train,X_test):
                
        from sklearn.svm import SVR
        regressor = SVR(kernel = 'linear')
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [6]:
def svm_NL(X_train,y_train,X_test):
                
        from sklearn.svm import SVR
        regressor = SVR(kernel = 'rbf')
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [7]:
def Decision(X_train,y_train,X_test):
        
        # Fitting K-NN to the Training setC
        from sklearn.tree import DecisionTreeRegressor
        regressor = DecisionTreeRegressor(random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [8]:
def random(X_train,y_train,X_test):       
        # Fitting K-NN to the Training set
        from sklearn.ensemble import RandomForestRegressor
        regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [9]:
def xgboost(X_train, y_train, X_test):
    from xgboost import XGBRegressor
    regressor = XGBRegressor(n_estimators=100, random_state=0)
    regressor.fit(X_train, y_train)
    r2 = r2_prediction(regressor, X_test, y_test)
    return r2

In [10]:
def rfeFeature(indep_X,dep_Y,n):
        rfelist=[]
        
        from sklearn.linear_model import LinearRegression
        lin = LinearRegression()
        
        from sklearn.svm import SVR
        SVRl = SVR(kernel = 'linear')
        
        from sklearn.svm import SVR
        #SVRnl = SVR(kernel = 'rbf')
        
        from sklearn.tree import DecisionTreeRegressor
        dec = DecisionTreeRegressor(random_state = 0)
        
        from sklearn.ensemble import RandomForestRegressor
        rf = RandomForestRegressor(n_estimators = 10, random_state = 0)

        from xgboost import XGBRegressor
        xgb = XGBRegressor(n_estimators=10, random_state=0)
        
        
        rfemodellist=[lin,SVRl,dec,rf,xgb] 
        for i in   rfemodellist:
            print(i)
            log_rfe = RFE(i, n_features_to_select=n)
            log_fit = log_rfe.fit(indep_X, dep_Y)
            log_rfe_feature=log_fit.transform(indep_X)
            rfelist.append(log_rfe_feature)
            selected_columns = indep_X.columns[log_rfe.get_support()]
        return rfelist, selected_columns

In [11]:
dataset1=pd.read_csv("Concrete_Data.csv",index_col=None)
df2=dataset1
df2 = pd.get_dummies(df2, drop_first=True)

In [12]:
indep_X=df2.drop('csMPa', axis=1)
dep_Y=df2['csMPa']

In [13]:
def rfe_regression(acclin,accsvml,accdes,accrf,accxgb): 
    
    rfedataframe=pd.DataFrame(index=['Linear','SVC','Random','DecisionTree','XGB'],columns=['Linear','SVMl',
                                                                                        'Decision','Random', 'XGB'])

    for number,idex in enumerate(rfedataframe.index):
        
        rfedataframe['Linear'][idex]=acclin[number]       
        rfedataframe['SVMl'][idex]=accsvml[number]
        rfedataframe['Decision'][idex]=accdes[number]
        rfedataframe['Random'][idex]=accrf[number]
        rfedataframe['XGB'][idex]=accxgb[number]
    return rfedataframe

In [14]:
rfelist,selected_columns=rfeFeature(indep_X,dep_Y,5)       
print("selected columns:", selected_columns)
acclin=[]
accsvml=[]
accdes=[]
accrf=[]
accxgb=[]

LinearRegression()
SVR(kernel='linear')
DecisionTreeRegressor(random_state=0)
RandomForestRegressor(n_estimators=10, random_state=0)
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=10,
             n_jobs=None, num_parallel_tree=None, ...)
selected columns: Index(['cement', 'slag', 'water', 'superplasticizer', 'age'], dtype='object')


In [15]:
for i in rfelist:   
    X_train, X_test, y_train, y_test=split_scalar(i,dep_Y)  
    r2_lin=Linear(X_train,y_train,X_test)
    acclin.append(r2_lin)
    
    r2_sl=svm_linear(X_train,y_train,X_test)    
    accsvml.append(r2_sl)
     
    r2_d=Decision(X_train,y_train,X_test)
    accdes.append(r2_d)
    
    r2_r=random(X_train,y_train,X_test)
    accrf.append(r2_r)

    r2_xgb= xgboost(X_train, y_train, X_test)
    accxgb.append(r2_xgb)
    
    
result=rfe_regression(acclin,accsvml,accdes,accrf,accxgb)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  rfedataframe['Linear'][idex]=acclin[number]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFram

In [16]:
#5
result

Unnamed: 0,Linear,SVMl,Decision,Random,XGB
Linear,0.60569,0.595759,0.818562,0.897536,0.91538
SVC,0.60569,0.595759,0.818562,0.897536,0.91538
Random,0.590297,0.590961,0.817975,0.882264,0.916733
DecisionTree,0.60569,0.595759,0.818562,0.897536,0.91538
XGB,0.60569,0.595759,0.818562,0.897536,0.91538


### Totally we have 8 Features and we tried from 4 to 8 and the overall good performance came from 5 to 8 features
### So am selecting 5 features and selecting XGB regression model

### Training the model

In [24]:
# from indep_X we are giving the seleceted features columns 
X_selected=indep_X[selected_columns]

In [25]:
X_selected

Unnamed: 0,cement,slag,water,superplasticizer,age
0,540.0,0.0,162.0,2.5,28
1,540.0,0.0,162.0,2.5,28
2,332.5,142.5,228.0,0.0,270
3,332.5,142.5,228.0,0.0,365
4,198.6,132.4,192.0,0.0,360
...,...,...,...,...,...
1025,276.4,116.0,179.6,8.9,28
1026,322.2,0.0,196.0,10.4,28
1027,148.5,139.4,192.7,6.1,28
1028,159.1,186.7,175.6,11.3,28


In [27]:
# For model we no need to give all the input values, so we are giving selected columns as input and doing scaler again

X_train, X_test, y_train, y_test = train_test_split(X_selected, dep_Y, test_size = 0.25, random_state = 0)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
param_grid={'n_estimators':[10,50,100,150],'learning_rate': [0.01, 0.1,0.2,0.3],'max_depth': [3, 5, 7],'subsample': [0.8, 1.0],
            'booster':['gbtree', 'dart'],'colsample_bytree': [0.8, 1.0]}
grid=GridSearchCV(XGBRegressor(), param_grid, refit=True, verbose=3, n_jobs=-1)
grid.fit(X_train_scaled,y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


In [31]:
re=grid.cv_results_
grid_predictions=grid.predict(X_test_scaled)
from sklearn.metrics import r2_score
rx_sc=r2_score(y_test,grid_predictions)
print("The R_score value for best parameter:{}".format(grid.best_params_),rx_sc)

The R_score value for best parameter:{'booster': 'gbtree', 'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8} 0.9225221712656226


In [None]:
table=pd.DataFrame.from_dict(re)
table

In [32]:
cement_input=float(input("Cement:"))
slag_input=float(input("Slag:"))
water_input=float(input("Water:"))
superplasticizer_input=float(input("Superplasticizer:"))
age_input=int(input("Age:"))

Cement: 500
Slag: 143
Water: 220
Superplasticizer: 3.1
Age: 45


In [33]:
Future_Prediction=grid.predict([[cement_input,slag_input,water_input,superplasticizer_input,age_input]])
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[65.94959]


In [36]:
pickle.dump(grid.best_estimator_, open("xgb_model.pkl", "wb"))
pickle.dump(scaler, open("scaler.pkl", "wb"))
pickle.dump(selected_columns, open("selected_features.pkl", "wb"))
filename="CS_Pre.sav"
pickle.dump(grid.best_estimator_,open(filename, "wb"))

In [None]:
model=pickle.load(open("CS_Pre.sav", "rb"))