## ARNAB MONDAL
## REG NO: 24MDT0177
## ML_LAB_8
## MSc DATA SCIENCE

In [13]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [14]:
df = pd.read_csv("Book1.csv")
df.head(2)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,furnishingstatus
0,13300000,7420,4,2,3,2,furnished
1,12250000,8960,4,4,4,3,furnished


In [15]:
df = df.drop(["furnishingstatus"], axis=1)
df.head(2)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
0,13300000,7420,4,2,3,2
1,12250000,8960,4,4,4,3


In [16]:
mms = preprocessing.MinMaxScaler()
data = mms.fit_transform(df)
df = pd.DataFrame(data=data, columns=df.columns)
df.head(2)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
0,1.0,0.356777,0.5,0.333333,0.666667,0.666667
1,0.880096,0.469597,0.5,1.0,1.0,1.0


In [17]:
x = df.iloc[:,1:].values
y = df.iloc[:,0].values
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=0)

## 1. DECISION TREE REGRESSOR

In [18]:
dtr = DecisionTreeRegressor()
dtr.fit(xtrain,ytrain)

In [19]:
y_pred = dtr.predict(xtest)
mse = mean_squared_error(ytest, y_pred)
mse

0.036702046125778126

## 2. GRADIENT BOOSTING REGRESSOR

In [20]:
from sklearn.ensemble import GradientBoostingRegressor

In [21]:
gbr = GradientBoostingRegressor(learning_rate=0.01,n_estimators=100,max_depth=3,random_state=0)
gbr.fit(xtrain,ytrain)

In [22]:
y_pred = gbr.predict(xtest)
mse = mean_squared_error(ytest, y_pred)
mse 

0.017811289322634986

#### The MSE is lesser for GBR.

## 3. HYPERPARAMETER TUNING (GBR)

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
model = GradientBoostingRegressor()
param_grid = {
    'n_estimators' : [i for i in range(100,1001,100)],
    'max_depth' : list(np.arange(1,11,1)),
    'learning_rate' : list(np.linspace(0.01,0.1,10))
}
grid_search = GridSearchCV(estimator=model,param_grid=param_grid,cv=5,n_jobs=-1)
grid_search.fit(xtrain,ytrain)

In [25]:
y_pred = grid_search.predict(xtest)

In [26]:
grid_search.best_params_

{'learning_rate': np.float64(0.07),
 'max_depth': np.int64(1),
 'n_estimators': 100}

In [28]:
grid_search.best_score_

np.float64(0.21237317725275967)

In [29]:
best_model = grid_search.best_estimator_  
y_pred = best_model.predict(xtest)
mse = mean_squared_error(ytest, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.016476533339920073


## 4. HYPERPARAMETER TUNING (DTR)

In [30]:
model = DecisionTreeRegressor()
param_grid = {
    'max_depth' : list(np.arange(1,11,1)),
    'min_samples_split': [i for i in range(1,11)],
    'min_samples_leaf': [i for i in range(1,11)]
}
grid_search = GridSearchCV(estimator=model,param_grid=param_grid,cv=5,n_jobs=-1)
grid_search.fit(xtrain,ytrain)

500 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
500 fits failed with the following error:
Traceback (most recent call last):
  File "e:\VIT Study Materials\SEM 2\Data Mining and ML\LAB\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\VIT Study Materials\SEM 2\Data Mining and ML\LAB\.venv\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "e:\VIT Study Materials\SEM 2\Data Mining and ML\LAB\.venv\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "e:\VIT Study Materials\SEM 2\Data Mining and M

In [31]:
y_pred = grid_search.predict(xtest)

In [32]:
grid_search.best_params_

{'max_depth': np.int64(4), 'min_samples_leaf': 10, 'min_samples_split': 2}

In [34]:
grid_search.best_score_

np.float64(0.2815179012541665)

In [35]:
best_model = grid_search.best_estimator_  
y_pred = best_model.predict(xtest)
mse = mean_squared_error(ytest, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.015803941214634


## 4. LIVER_PATIENT.CSV

In [36]:
df = pd.read_csv("liver_patient.csv")
df.head(2)

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,liver_disease
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1


In [37]:
df = df.drop(['Age','Gender'], axis=1)
df.tail(2)

Unnamed: 0,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,liver_disease
581,1.3,0.5,184,29,32,6.8,3.4,1.0,1
582,1.0,0.3,216,21,24,7.3,4.4,1.5,0


In [38]:
mms = preprocessing.MinMaxScaler()
data = mms.fit_transform(df)
df = pd.DataFrame(data=data, columns=df.columns)
df.head(2)

Unnamed: 0,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,liver_disease
0,0.004021,0.0,0.060576,0.003015,0.001626,0.594203,0.521739,0.24,1.0
1,0.140751,0.27551,0.310699,0.027136,0.018296,0.695652,0.5,0.176,1.0


In [39]:
x = df.iloc[:,:-1].values
y = df.iloc[:,-1].values
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=0)

## A. DECISION TREE CLASSIFIER

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [41]:
dtc = DecisionTreeClassifier()
dtc.fit(xtrain,ytrain)

In [42]:
y_pred = dtc.predict(xtest)
acc = accuracy_score(ytest, y_pred)
acc

0.6153846153846154

## B. GRADIENT BOOSTING CLASSIFIER

In [43]:
from sklearn.ensemble import GradientBoostingClassifier

In [44]:
gbc = GradientBoostingClassifier()
gbc.fit(xtrain,ytrain)

In [45]:
y_pred = gbc.predict(xtest)
acc = accuracy_score(ytest, y_pred)
acc

0.6837606837606838

## C. HYPERPARAMETER TUNING (GBC)

In [46]:
model = GradientBoostingRegressor()
param_grid = {
    'n_estimators' : [i for i in range(100,1001,100)],
    'max_depth' : list(np.arange(1,11,1)),
    'learning_rate' : list(np.linspace(0.01,0.1,10))
}
grid_search = GridSearchCV(estimator=model,param_grid=param_grid,cv=5,n_jobs=-1)
grid_search.fit(xtrain,ytrain)

In [47]:
y_pred = grid_search.predict(xtest)

In [48]:
grid_search.best_params_

{'learning_rate': np.float64(0.01),
 'max_depth': np.int64(1),
 'n_estimators': 500}

In [49]:
grid_search.best_score_

np.float64(0.1104811609103564)

In [50]:
best_model = grid_search.best_estimator_  
y_pred = best_model.predict(xtest)
mse = mean_squared_error(ytest, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.19527977242993994


## D. HYPERPARAMETER TUNING (DTC)

In [51]:
model = DecisionTreeRegressor()
param_grid = {
    'max_depth' : list(np.arange(1,11,1)),
    'min_samples_split': [i for i in range(1,11)],
    'min_samples_leaf': [i for i in range(1,11)]
}
grid_search = GridSearchCV(estimator=model,param_grid=param_grid,cv=5,n_jobs=-1)
grid_search.fit(xtrain,ytrain)

500 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
500 fits failed with the following error:
Traceback (most recent call last):
  File "e:\VIT Study Materials\SEM 2\Data Mining and ML\LAB\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\VIT Study Materials\SEM 2\Data Mining and ML\LAB\.venv\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "e:\VIT Study Materials\SEM 2\Data Mining and ML\LAB\.venv\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "e:\VIT Study Materials\SEM 2\Data Mining and M

In [52]:
y_pred = grid_search.predict(xtest)

In [53]:
grid_search.best_params_

{'max_depth': np.int64(2), 'min_samples_leaf': 1, 'min_samples_split': 2}

In [54]:
grid_search.best_score_

np.float64(0.03545784112387294)

In [55]:
best_model = grid_search.best_estimator_  
y_pred = best_model.predict(xtest)
mse = mean_squared_error(ytest, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.1911615612583772


## 5. RIDGE AND LASSO REGRESSION

In [56]:
from sklearn.linear_model import RidgeCV, LassoCV

In [57]:
df = pd.read_csv("Book1.csv")
df.head(2)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,furnishingstatus
0,13300000,7420,4,2,3,2,furnished
1,12250000,8960,4,4,4,3,furnished


In [58]:
df = df.drop(["furnishingstatus"], axis=1)
df.head(2)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
0,13300000,7420,4,2,3,2
1,12250000,8960,4,4,4,3


In [59]:
mms = preprocessing.MinMaxScaler()
data = mms.fit_transform(df)
df = pd.DataFrame(data=data, columns=df.columns)
df.head(2)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
0,1.0,0.356777,0.5,0.333333,0.666667,0.666667
1,0.880096,0.469597,0.5,1.0,1.0,1.0


In [60]:
x = df.iloc[:,1:].values
y = df.iloc[:,0].values
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=0)

In [61]:
alpha_values = np.logspace(-2,4,100)
alpha_values

array([1.00000000e-02, 1.14975700e-02, 1.32194115e-02, 1.51991108e-02,
       1.74752840e-02, 2.00923300e-02, 2.31012970e-02, 2.65608778e-02,
       3.05385551e-02, 3.51119173e-02, 4.03701726e-02, 4.64158883e-02,
       5.33669923e-02, 6.13590727e-02, 7.05480231e-02, 8.11130831e-02,
       9.32603347e-02, 1.07226722e-01, 1.23284674e-01, 1.41747416e-01,
       1.62975083e-01, 1.87381742e-01, 2.15443469e-01, 2.47707636e-01,
       2.84803587e-01, 3.27454916e-01, 3.76493581e-01, 4.32876128e-01,
       4.97702356e-01, 5.72236766e-01, 6.57933225e-01, 7.56463328e-01,
       8.69749003e-01, 1.00000000e+00, 1.14975700e+00, 1.32194115e+00,
       1.51991108e+00, 1.74752840e+00, 2.00923300e+00, 2.31012970e+00,
       2.65608778e+00, 3.05385551e+00, 3.51119173e+00, 4.03701726e+00,
       4.64158883e+00, 5.33669923e+00, 6.13590727e+00, 7.05480231e+00,
       8.11130831e+00, 9.32603347e+00, 1.07226722e+01, 1.23284674e+01,
       1.41747416e+01, 1.62975083e+01, 1.87381742e+01, 2.15443469e+01,
      

In [62]:
ridge_cv = RidgeCV(alphas=alpha_values, store_cv_values=True)
ridge_cv.fit(xtrain,ytrain)



In [63]:
ridge_pred = ridge_cv.predict(xtest)

In [64]:
ridge_cv.alpha_

np.float64(0.49770235643321115)

In [65]:
mean_squared_error(ytest, ridge_pred)

0.019544354076968272

In [66]:
ridge_cv.coef_

array([0.2916599 , 0.0894537 , 0.30197724, 0.13122731, 0.14923577])

In [67]:
lasso_cv = LassoCV(alphas=alpha_values,cv=5,random_state=0)
lasso_cv.fit(xtrain, ytrain)

In [68]:
lasso_pred = lasso_cv.predict(xtest)

In [69]:
lasso_cv.alpha_

np.float64(0.01)

In [70]:
lasso_cv.coef_

array([0.        , 0.        , 0.13504156, 0.07345709, 0.07836195])

In [71]:
mean_squared_error(ytest, lasso_pred)

0.01991781625293486

## 6. STACKING

In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

In [73]:
data = pd.read_csv('liver_patient.csv')
y = data.liver_disease
data.drop(['Age', 'Gender', 'liver_disease'], axis=1, inplace=True)

In [74]:
MM = preprocessing.MinMaxScaler()
X1 = MM.fit_transform(data)
x = pd.DataFrame(X1[:, 0:8])
xtrain,xtest,ytrain,ytest = train_test_split(x,y, test_size=0.10, random_state=0)

In [75]:
DT = DecisionTreeClassifier()
BC = BaggingClassifier(n_estimators=10, random_state=0)
PC = BaggingClassifier(n_estimators=10, bootstrap=True, random_state=0)
RFC = RandomForestClassifier(n_estimators=10, max_features="sqrt", random_state=0)
ABC = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), n_estimators=500, random_state=0)

DT.fit(xtrain, ytrain)
BC.fit(xtrain, ytrain)
PC.fit(xtrain, ytrain)
RFC.fit(xtrain, ytrain)
ABC.fit(xtrain, ytrain)

pred_DT = DT.predict(xtest)
pred_BC = BC.predict(xtest)
pred_PC = PC.predict(xtest)
pred_RFC = RFC.predict(xtest)
pred_ABC = ABC.predict(xtest)

In [76]:
print("Decision Tree Accuracy:",accuracy_score(ytest,pred_DT))
print("Bagging Accuracy:",accuracy_score(ytest,pred_BC))
print("Pasting Accuracy:",accuracy_score(ytest,pred_PC))
print("Random Forest Accuracy:",accuracy_score(ytest,pred_RFC))
print("AdaBoost Accuracy:",accuracy_score(ytest,pred_ABC))

Decision Tree Accuracy: 0.6779661016949152
Bagging Accuracy: 0.7288135593220338
Pasting Accuracy: 0.7288135593220338
Random Forest Accuracy: 0.7457627118644068
AdaBoost Accuracy: 0.7457627118644068


In [77]:
estimators=[('dt',DT),('bc',BC),('pc',PC),('rfc',RFC),('abc',ABC)]
stk=StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), passthrough=True)
stk.fit(xtrain,ytrain)
pred_stk=stk.predict(xtest)
print("Stacking Accuracy:",accuracy_score(ytest,pred_stk))

Stacking Accuracy: 0.7457627118644068
