<a href="https://www.kaggle.com/code/easyice/glass-classification-eda-models-logreg-knn-dt-rf?scriptVersionId=235912769" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import  plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

## Basic Summary.

In [None]:
# Load the glass dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/glass/glass.csv')
# Preview the first rows (the trailing expression is rendered as the cell output).
df.head()

In [None]:
# Dimensions of the dataset as (rows, columns).
df.shape

In [None]:
# Total number of cells in the dataset (rows * columns).
df.size

In [None]:
# Column labels of the dataset.
df.columns

In [None]:
# Per-column dtype and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## E.D.A.

In [None]:
# Histogram (with KDE overlay) of each of the first ten columns of df,
# laid out on a 4x3 subplot grid.
plt.figure(figsize=(15, 15))
for plotnumber, column in enumerate(df.columns[:10], start=1):
    ax = plt.subplot(4, 3, plotnumber)
    sns.histplot(df[column], color='violet', kde=True)
    plt.xlabel(column)
plt.tight_layout()

In [None]:
# Number of samples per value of the 'Type' column (the target used later).
sns.countplot(x='Type', data=df, palette='Greens')

In [None]:
# Pairwise relationships between all columns, coloured by 'Type'.
sns.pairplot(df, hue='Type')

In [None]:
# Box plot of each of the first ten columns of df on a 4x3 subplot grid,
# used to eyeball outliers.
plt.figure(figsize=(15, 15))
for plotnumber, column in enumerate(df.columns[:10], start=1):
    ax = plt.subplot(4, 3, plotnumber)
    sns.boxplot(df[column], color='violet')
    plt.xlabel(column)
plt.tight_layout()

In [None]:
# Yeo-Johnson power transform of the nine feature columns.
# The transform is applied to a copy so the original df is left untouched.
from sklearn.preprocessing import PowerTransformer

col_trans = ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']
pt = PowerTransformer(method='yeo-johnson')
df1 = df.copy()
transformed = pt.fit_transform(df1.loc[:, col_trans])

In [None]:
# Wrap the transformed array in a DataFrame, labelled with the same
# feature columns that were fed to the transformer.
trans_df = pd.DataFrame(data=transformed, columns=col_trans)
trans_df.head()

In [None]:
# Histogram of each of the first nine power-transformed columns
# on a 3x3 subplot grid.
plt.figure(figsize=(15, 15))
for plotnumber, column in enumerate(trans_df.columns[:9], start=1):
    ax = plt.subplot(3, 3, plotnumber)
    sns.histplot(trans_df[column], color='violet')
    plt.xlabel(column)
plt.tight_layout()

In [None]:
# Correlation heatmap of the feature columns (target 'Type' excluded).
plt.figure(figsize=(8, 6))
feature_corr = df.drop(columns='Type').corr()
sns.heatmap(feature_corr, annot=True, fmt='.2f')

## Train Test Split and Scaling.

In [None]:
# Feature matrix: every column except the target; target: the 'Type' column.
X = df.drop(columns='Type')
y = df['Type']

In [None]:
# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse its statistics on the
# test split, so no test information leaks into the scaling.
std_scale = StandardScaler()
X_train_std = std_scale.fit_transform(X_train)
X_test_std = std_scale.transform(X_test)

# Keep the original row index: y_train / y_test still carry it, and without it
# any index-aligned pandas operation (concat, boolean masks, joins) between the
# scaled features and the targets would silently pair up the wrong rows.
X_train_scaled = pd.DataFrame(X_train_std, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_std, columns=X.columns, index=X_test.index)

In [None]:
# Dimensions of the scaled training features as (rows, columns).
print(X_train_scaled.shape)

In [None]:
# Dimensions of the scaled test features as (rows, columns).
print(X_test_scaled.shape)

## Model Building.

### Logistic Regression.

In [None]:
# Logistic regression with default hyperparameters, fitted on the
# standardized training features.
LogReg = LogisticRegression()
LogReg.fit(X_train_scaled, y_train)

In [None]:
# LogReg was fitted on standardized features, so it must be evaluated on the
# standardized test set. Feeding it the raw X_test puts every feature on a
# different scale than the learned coefficients expect and corrupts the
# predictions (and every metric computed from them).
y_pred = LogReg.predict(X_test_scaled)
y_pred

In [None]:
# First five true labels of the test split.
y_test[:5]

In [None]:
# Evaluation of Logistic Regression on the test split: confusion matrix,
# accuracy, macro-averaged precision and the per-class report.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
ps = precision_score(y_test, y_pred, average='macro')
cr = classification_report(y_test, y_pred)

# Print each metric, with a divider line between consecutive sections.
for position, (title, value) in enumerate([
    ('Confusion matrix:\n', cm),
    ('Accuracy Score:', acc),
    ('Precision Score:', ps),
    ('Classification Report:\n', cr),
]):
    if position:
        print('----------------------')
    print(title, value)

In [None]:
# cross validation score
# The model above is trained on standardized features, so the cross-validated
# estimate must standardize too. Wrapping the scaler and the classifier in a
# pipeline re-fits the scaler on the training part of every fold, which keeps
# the held-out part of each fold out of the scaling statistics.
from sklearn.pipeline import make_pipeline

# cross_val_score clones the estimator, so the fitted LogReg is not modified.
lr_cv_pipeline = make_pipeline(StandardScaler(), LogReg)
lr_cv_scores = cross_val_score(lr_cv_pipeline, X, y, cv=5, scoring='accuracy')
print("Logistic Regression-cv scores", lr_cv_scores)
print("Logistic Regression - Cross Validation Accuracy:",lr_cv_scores.mean())

### Decision Tree.

In [None]:
# Baseline decision tree with default hyperparameters, fitted on the
# unscaled training features.
dt_class = DecisionTreeClassifier()
dt_class.fit(X_train, y_train)

In [None]:
# Predict the test split with the baseline decision tree.
y_pred = dt_class.predict(X_test)

In [None]:
# Evaluation of Decision Tree on the test split: confusion matrix,
# accuracy, macro-averaged precision and the per-class report.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
ps = precision_score(y_test, y_pred, average='macro')
cr = classification_report(y_test, y_pred)

# Print each metric, with a divider line between consecutive sections.
for position, (title, value) in enumerate([
    ('Confusion matrix:\n', cm),
    ('Accuracy Score:', acc),
    ('Precision Score:', ps),
    ('Classification Report:\n', cr),
]):
    if position:
        print('----------------------')
    print(title, value)

In [None]:
# Accuracy of the baseline tree on the data it was trained on
# (compare against the test accuracy to judge overfitting).
y_pred_train = dt_class.predict(X_train)
train_ac = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print("Training Accuracy:", train_ac)

In [None]:
# 5-fold cross-validated accuracy of the decision tree on the full dataset.
dt_cv_scores = cross_val_score(estimator=dt_class, X=X, y=y, scoring='accuracy', cv=5)
print("Decision Tree-cv scores", dt_cv_scores)
print("Decision Tree - Cross Validation Accuracy:", np.mean(dt_cv_scores))

In [None]:
# Draw the fitted tree and save it to disk. The very large canvas keeps the
# node labels legible on a deep, unpruned tree.
plt.figure(figsize=(200,100))
tree = plot_tree(
    decision_tree=dt_class,
    # A plain list is accepted by every scikit-learn version; a pandas Index
    # is rejected by the parameter validation of some releases.
    feature_names=list(X.columns),
    # Take the labels from the fitted model (sorted, as the tree stores them)
    # instead of hard-coding them, so they cannot drift from the training data.
    class_names=[str(label) for label in dt_class.classes_],
    filled=True,
)
plt.savefig("DT_Glass20.png")

### Hyperparameter Tuning of Decision Tree.

In [None]:
# Grid Search CV: exhaustive 5-fold search over the split criterion and the
# depth / split / leaf size limits of the decision tree.
search_range = np.arange(2, 10)
hyperparameters = dict(
    criterion=["gini", "entropy"],
    max_depth=search_range,
    min_samples_split=search_range,
    min_samples_leaf=search_range,
)
dt_class = DecisionTreeClassifier()
gridcv_dt = GridSearchCV(estimator=dt_class, param_grid=hyperparameters, cv=5)
gridcv_dt.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
gridcv_dt.best_params_

In [None]:
# Build the tuned tree from the parameters the grid search actually selected.
# Copying them by hand goes stale as soon as the search is re-run, because the
# unseeded trees can make the search pick a different winner each time.
best_dt = DecisionTreeClassifier(**gridcv_dt.best_params_)
best_dt.fit(X_train, y_train)

In [None]:
# Train vs. test accuracy of the grid-search-tuned tree.
y_train_pred = best_dt.predict(X_train)
y_test_pred = best_dt.predict(X_test)
train_ac = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_ac = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Testing Accuracy after Hyperparameter Tuning:", test_ac)
print("Training Accuracy after Hyperparameter Tuning:", train_ac)

In [None]:
# Randomized search CV: evaluates a random sample of the parameter grid
# with 5-fold cross-validation instead of trying every combination.

hyperparameters = {'criterion':['gini','entropy'], 'max_depth': np.arange(2,10), 'min_samples_leaf': np.arange(2,10),
                 'min_samples_split': np.arange(2,10)}
dt_class = DecisionTreeClassifier()
# Seed the sampler so the same candidate combinations are drawn on every run;
# without it the set of candidates tried changes from run to run.
random_cv = RandomizedSearchCV(dt_class, hyperparameters, cv = 5, random_state=42)
random_cv.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination among the sampled candidates.
random_cv.best_params_

In [None]:
# Build the tuned tree from the parameters the randomized search actually
# selected, rather than values copied by hand from a previous run (which go
# stale whenever the search samples different candidates).
best_dt1 = DecisionTreeClassifier(**random_cv.best_params_)
best_dt1.fit(X_train, y_train)

In [None]:
# Train vs. test accuracy of the randomized-search-tuned tree.
y_pred_train = best_dt1.predict(X_train)
y_pred_test = best_dt1.predict(X_test)
train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)
test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Testing Accuracy after Randomized Search CV:", test_acc)
print("Training Accuracy after Randomized Search CV:", train_acc)

### Pruning.
- Pruning means cutting back a grown tree.
- Pruning: it reduces the size of the decision tree by removing branches that provide little power to classify.
- Weak nodes are removed.
- `ccp_alpha` >>> cost-complexity pruning parameter (larger values prune more of the tree).


In [None]:
# Unpruned reference tree (ccp_alpha of 0.0), fitted on the training split;
# its pruning path is computed in the next step.
dt_class = DecisionTreeClassifier(ccp_alpha = 0.0)
dt_class.fit(X_train, y_train)

In [None]:
# Compute the cost-complexity pruning path on the training split and keep
# the candidate alpha values from it.
values = dt_class.cost_complexity_pruning_path(X_train, y_train)
print(len(values))  # number of entries in the returned container
ccp_alphas = values.ccp_alphas

In [None]:
# Number of candidate alpha values on the pruning path.
len(ccp_alphas)

In [None]:
# Fit one tree per candidate alpha and record its train and test accuracy.
# NOTE(review): the alpha is later chosen by test accuracy, so the test split
# doubles as a validation set here - consider a separate validation split.
train_accuracy, test_accuracy = [], []
for alpha in ccp_alphas:
    pruned_tree = DecisionTreeClassifier(ccp_alpha=alpha, random_state=10)
    pruned_tree.fit(X_train, y_train)
    train_accuracy.append(pruned_tree.score(X_train, y_train))
    test_accuracy.append(pruned_tree.score(X_test, y_test))
    

In [None]:
# One accuracy entry was recorded per candidate alpha.
len(train_accuracy)

In [None]:
# Best training accuracy over all candidate alphas.
max(train_accuracy)

In [None]:
# Best test accuracy over all candidate alphas.
max(test_accuracy)

In [None]:
# Position of the first alpha that reaches the best test accuracy.
test_accuracy.index(max(test_accuracy))

In [None]:
# Training accuracy at the alpha with the best test accuracy. The position is
# looked up rather than hard-coded, so it stays correct if the pruning path
# (and therefore the list of alphas) changes between runs.
train_accuracy[test_accuracy.index(max(test_accuracy))]

In [None]:
# Alpha value with the best test accuracy. The position is looked up rather
# than hard-coded, so it stays correct if the pruning path changes between runs.
ccp_alphas[test_accuracy.index(max(test_accuracy))]

In [None]:
# Train and test accuracy as a function of the pruning strength alpha.
fig, ax = plt.subplots()
ax.plot(ccp_alphas, train_accuracy, label = "Train")
ax.plot(ccp_alphas, test_accuracy, label = "Test")
ax.set_xlabel("Alpha")
ax.set_ylabel("Accuracy")  # fixed axis-label typo ("Accuarcy")
ax.legend()
plt.show()

In [None]:
# Refit the tree at the alpha that scored best in the sweep. The alpha is
# looked up from the sweep results instead of being pasted in as a literal,
# and the seed matches the one used in the sweep so this is the same model
# whose accuracy was recorded there.
best_ccp_alpha = ccp_alphas[test_accuracy.index(max(test_accuracy))]
dt_class = DecisionTreeClassifier(ccp_alpha=best_ccp_alpha, random_state=10)
dt_class.fit(X_train, y_train)

In [None]:
# Train vs. test accuracy of the pruned tree.
y_predi_test = dt_class.predict(X_test)
y_predi_train = dt_class.predict(X_train)
ac_test = accuracy_score(y_true=y_test, y_pred=y_predi_test)
ac_train = accuracy_score(y_true=y_train, y_pred=y_predi_train)
print("Accuracy of test after Pruning:", ac_test)
print("Accuracy of train after Pruning:", ac_train)

### KNN.

In [None]:
# Baseline k-nearest-neighbours classifier with default hyperparameters,
# fitted on the standardized training features.
knn_class = KNeighborsClassifier()
knn_class.fit(X_train_scaled, y_train)

In [None]:
# Predict the standardized test split with the baseline KNN model.
y_pred = knn_class.predict(X_test_scaled)

In [None]:
# Evaluation of KNN on the test split: confusion matrix, accuracy,
# macro-averaged precision and the per-class report.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
ps = precision_score(y_test, y_pred, average='macro')
cr = classification_report(y_test, y_pred)

# Print each metric, with a divider line between consecutive sections.
for position, (title, value) in enumerate([
    ('Confusion matrix:\n', cm),
    ('Accuracy Score:', acc),
    ('Precision Score:', ps),
    ('Classification Report:\n', cr),
]):
    if position:
        print('----------------------')
    print(title, value)

In [None]:
# Hyperparameter tuning of KNN with an exhaustive 5-fold grid search over the
# neighbour count and the Minkowski power p (1 or 2).
hyperparameter = dict(n_neighbors=np.arange(2, 15), p=[1, 2])
knn_class = KNeighborsClassifier()
grid_cv = GridSearchCV(estimator=knn_class, param_grid=hyperparameter, cv=5)
grid_cv.fit(X_train_scaled, y_train)

In [None]:
# Mean cross-validated score of the best KNN parameter combination.
grid_cv.best_score_

In [None]:
# KNN parameter combination with the best mean cross-validated score.
grid_cv.best_params_

In [None]:
# Build the tuned KNN model from the parameters the grid search selected,
# instead of literals copied by hand that can drift from the search result.
knn_class = KNeighborsClassifier(**grid_cv.best_params_)
knn_class.fit(X_train_scaled, y_train)

In [None]:
# Predict the standardized test split with the grid-search-tuned KNN model.
y_pred_grid = knn_class.predict(X_test_scaled)

In [None]:
# Evaluation of KNN after grid search on the test split: confusion matrix,
# accuracy, macro-averaged precision and the per-class report.
cm = confusion_matrix(y_test, y_pred_grid)
acc = accuracy_score(y_test, y_pred_grid)
ps = precision_score(y_test, y_pred_grid, average='macro')
cr = classification_report(y_test, y_pred_grid)

# Print each metric, with a divider line between consecutive sections.
for position, (title, value) in enumerate([
    ('Confusion matrix:\n', cm),
    ('Accuracy Score:', acc),
    ('Precision Score:', ps),
    ('Classification Report:\n', cr),
]):
    if position:
        print('----------------------')
    print(title, value)

In [None]:
# Random Search cv: evaluates a random sample of the KNN parameter grid
# with 5-fold cross-validation.
hyperparameter = {'n_neighbors': np.arange(2,15), 'p': [1,2]}
knn_class = KNeighborsClassifier()
# Seed the sampler so the same candidates are drawn (and the same winner is
# found) on every run; unseeded, the result changes from run to run.
random_cv = RandomizedSearchCV(knn_class, hyperparameter, cv=5, scoring='accuracy', random_state=42)
random_cv.fit(X_train_scaled, y_train)

In [None]:
# Mean cross-validated accuracy of the best sampled KNN candidate.
random_cv.best_score_

In [None]:
# Best KNN parameter combination among the sampled candidates.
random_cv.best_params_

In [None]:
# Build the tuned KNN model from the parameters the randomized search
# selected, instead of literals copied by hand from an earlier run.
knn_class = KNeighborsClassifier(**random_cv.best_params_)
knn_class.fit(X_train_scaled, y_train)

In [None]:
# Predict the standardized test split with the randomized-search-tuned KNN model.
y_pred_random = knn_class.predict(X_test_scaled)

In [None]:
# Evaluation of KNN after randomized search on the test split: confusion
# matrix, accuracy, macro-averaged precision and the per-class report.
cm = confusion_matrix(y_test, y_pred_random)
acc = accuracy_score(y_test, y_pred_random)
ps = precision_score(y_test, y_pred_random, average='macro')
cr = classification_report(y_test, y_pred_random)

# Print each metric, with a divider line between consecutive sections.
for position, (title, value) in enumerate([
    ('Confusion matrix:\n', cm),
    ('Accuracy Score:', acc),
    ('Precision Score:', ps),
    ('Classification Report:\n', cr),
]):
    if position:
        print('----------------------')
    print(title, value)

### Random Forest.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest (seeded with random_state=42), fitted on the
# unscaled training features.
rf_class = RandomForestClassifier(random_state=42)
rf_class.fit(X_train, y_train)

In [None]:
# Predict the test split with the baseline random forest.
y_pred_rf = rf_class.predict(X_test)

In [None]:
# Evaluation of random forest on the test split: confusion matrix,
# accuracy, macro-averaged precision and the per-class report.
cm = confusion_matrix(y_test, y_pred_rf)
acc = accuracy_score(y_test, y_pred_rf)
ps = precision_score(y_test, y_pred_rf, average='macro')
cr = classification_report(y_test, y_pred_rf)

# Print each metric, with a divider line between consecutive sections.
for position, (title, value) in enumerate([
    ('Confusion matrix:\n', cm),
    ('Accuracy Score:', acc),
    ('Precision Score:', ps),
    ('Classification Report:\n', cr),
]):
    if position:
        print('----------------------')
    print(title, value)

In [None]:
# 7-fold cross-validated accuracy of the random forest on the full dataset.
cv_score_rf = cross_val_score(estimator=rf_class, X=X, y=y, scoring='accuracy', cv=7)
print("Random Forest - CV Scores:", cv_score_rf)
print("Random Forest - Cross Validation Accuracy:", np.mean(cv_score_rf))

In [None]:
# hyperparameter tuning: exhaustive 5-fold grid search over the forest size
# and the per-tree depth / split / leaf limits.
params = {'max_depth': [7,10,20], # maximum depth of trees.
         'n_estimators': [20,50,100], # number of trees in the forest.
         'min_samples_split': [2,5,7,10], # minimum no of samples (rows) required to split a node.
         'min_samples_leaf': [2,5,7]} # minimum no of samples required in a leaf node.
# Seeded like the baseline forest, so the search result is reproducible and the
# tuned-vs-baseline comparison is not blurred by run-to-run randomness.
rf_class = RandomForestClassifier(random_state=42)
grid_CV = GridSearchCV(estimator=rf_class, param_grid = params, cv=5, scoring='accuracy')
grid_CV.fit(X_train, y_train)

In [None]:
# Mean cross-validated accuracy of the best random-forest parameter combination.
grid_CV.best_score_

In [None]:
# Random-forest parameter combination with the best mean cross-validated accuracy.
grid_CV.best_params_

In [None]:
# Build the tuned forest from the parameters the grid search actually selected
# (hand-copied literals go stale whenever the search is re-run), and seed it
# like the baseline forest so the reported metrics are reproducible.
best_rf_model = RandomForestClassifier(**grid_CV.best_params_, random_state=42)
best_rf_model.fit(X_train, y_train)

# Evaluate the tuned forest on the held-out test split.
y_pred = best_rf_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
ps = precision_score(y_test, y_pred, average='macro')
cr = classification_report(y_test,y_pred)
print('Confusion matrix:\n',cm)
print('----------------------')
print('Accuracy Score:',acc)
print('----------------------')
print('Precision Score:',ps)
print('----------------------')
print('Classification Report:\n',cr)

- Best scores Achieved.  **THANK YOU**!!