# Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load data

In [None]:
# Read the glass dataset from 'glass.csv' (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv('glass.csv')
df.head()

In [None]:
# EDA and Data Visualization

In [None]:
# Summarize the columns: dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics (count, mean, spread, quartiles) for the numeric columns.
df.describe()

In [None]:
# Count missing values in each column.
df.isnull().sum()

In [None]:
# Draw a 15-bin histogram for each numeric column on one figure, then title
# and tidy that figure through an explicit handle.
df.hist(figsize=(15, 10), bins=15)
hist_fig = plt.gcf()
hist_fig.suptitle("Histograms of Glass Dataset", fontsize=16)
hist_fig.tight_layout()
plt.show()

In [None]:
# One box per column on a shared axis, to compare spreads and spot outliers.
box_fig, box_ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=df, palette="Set2", ax=box_ax)
box_ax.set_title("Box Plots of Glass Dataset Features")
plt.xticks(rotation=45)  # tilt the column names so they do not overlap
plt.show()


# Outlier Detection

K, Ba, and Fe show a strong presence of outliers, especially values much greater than the median.

Ca also shows some high-value outliers, though they are less extreme.

The distributions of RI, Na, and Si are tighter, indicating more stable measurement ranges.


In [None]:
# Pairwise correlation heatmap across the dataset's columns.
# numeric_only=True (already used for the mean imputation in this notebook)
# keeps the cell from raising if the frame ever carries a non-numeric column;
# for an all-numeric frame the result is unchanged.
plt.figure(figsize=(10, 8))
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()


#Ca and K, and Al and Na, show strong positive correlations.
Mg shows a negative correlation with Al and Si.


In [None]:
# Data Preprocessing

In [None]:
# Missing-value counts per column, checked before imputation.
df.isnull().sum()

In [None]:
# Impute missing *feature* values with each column's mean.
# 'Type' is the class label: averaging it would invent a fractional label that
# the later integer cast silently truncates to some class, so rows without a
# label are dropped instead of being filled.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.
feature_columns = df.columns.drop('Type')
df = df.dropna(subset=['Type']).copy()
df[feature_columns] = df[feature_columns].fillna(
    df[feature_columns].mean(numeric_only=True)
)

In [None]:
# Re-check missing-value counts after imputation.
df.isnull().sum()

In [None]:
# Inspect column dtypes before casting the target column.
print(df.dtypes)

In [None]:
# Cast the target column to integers so the class labels are discrete codes.
df['Type'] = df['Type'].astype(int)

In [None]:
# converted 'Type' to integer for model compatibility

In [None]:
# Confirm the dtypes after the 'Type' cast.
print(df.dtypes)

In [None]:
# Separate the target vector ('Type') from the feature matrix (every other column).
y = df['Type']
X = df.drop(columns=['Type'])

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test data never informs the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Oversample the scaled training split with SMOTE so the classes are balanced
# for fitting; the test split is not resampled.
smote = SMOTE(random_state=42)
balanced_split = smote.fit_resample(X_train_scaled, y_train)
X_train_balanced, y_train_balanced = balanced_split

In [None]:
# Handle imbalanced data using SMOTE.

In [None]:
# Random Forest Model Implementation

In [None]:
# Random forest with 100 trees; the fixed seed keeps runs reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train on the scaled, SMOTE-balanced training data.
rf.fit(X_train_balanced, y_train_balanced)

In [None]:
# Predict on the scaled test split (which was not resampled).
y_pred = rf.predict(X_test_scaled)

In [None]:
# Per-class precision/recall/F1 for the random forest on the held-out test set.
# zero_division=0 scores a class that received no predictions as 0 without
# emitting an undefined-metric warning, consistent with evaluate_model below.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Headline metrics for the random forest. Macro averaging weights every class
# equally regardless of its size. zero_division=0 keeps a class with no
# predicted (or no true) samples at 0 without an undefined-metric warning,
# consistent with the classification reports in this notebook.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro', zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average='macro', zero_division=0))
print("F1 Score:", f1_score(y_test, y_pred, average='macro', zero_division=0))

In [None]:
# Bagging and Boosting Methods

In [None]:
# Bagging ensemble of 100 decision-tree base estimators, with a fixed seed
# for reproducibility.
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    random_state=42
)

In [None]:
# Train the bagging ensemble on the scaled, SMOTE-balanced training data.
bagging_model.fit(X_train_balanced, y_train_balanced)

In [None]:
# Predict on the scaled test split.
y_pred_bag = bagging_model.predict(X_test_scaled)

In [None]:
# AdaBoost with 100 boosting rounds: fit on the balanced training data, then
# predict the scaled test split.
ada_model = AdaBoostClassifier(random_state=42, n_estimators=100)
ada_model.fit(X_train_balanced, y_train_balanced)
y_pred_ada = ada_model.predict(X_test_scaled)

In [None]:
# Gradient boosting with 100 stages and a 0.1 learning rate: fit on the
# balanced training data, then predict the scaled test split.
gb_model = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)
gb_model.fit(X_train_balanced, y_train_balanced)
y_pred_gb = gb_model.predict(X_test_scaled)

In [None]:
def evaluate_model(name, y_true, y_pred):
    """Print a titled classification report for one model's predictions.

    Args:
        name: Label for the model, shown in the report heading.
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.

    Returns:
        None. The heading and the report are written to stdout.
    """
    heading = f"\n {name} Classification Report"
    print(heading)
    # zero_division=0: a class with no predicted samples scores 0 instead of
    # triggering an undefined-metric warning.
    report_text = classification_report(y_true, y_pred, zero_division=0)
    print(report_text)

# Report every ensemble against the same held-out labels, in a fixed order.
for model_label, model_predictions in (
    ("Bagging (Decision Tree)", y_pred_bag),
    ("AdaBoost", y_pred_ada),
    ("Gradient Boosting", y_pred_gb),
):
    evaluate_model(model_label, y_test, model_predictions)


#Bagging (Decision Tree) gave the best overall performance, with high accuracy and a strong F1-score.

Gradient Boosting was a close second, showing better handling of minority classes than AdaBoost.

AdaBoost struggled with small class sizes, making it less suitable for imbalanced multi-class datasets.


# Interview Questions

#1. Explain Bagging and Boosting methods. How are they different from each other?


#Bagging and Boosting are methods that combine multiple models to improve accuracy.
Bagging builds models independently on random data samples to reduce variance, like in Random Forest.
Boosting builds models one after another, each fixing the last model’s mistakes, to reduce bias, like in AdaBoost.
The key difference is that Bagging trains its models in parallel, while Boosting trains them in sequence.


#2. Explain how to handle imbalance in the data.


#To handle imbalanced data, we can use techniques such as
resampling (oversampling the minority class or undersampling the majority class),
SMOTE (synthetic minority-sample generation), or class weights that give more importance to the minority class.
These methods help the model learn from all classes more fairly and improve performance on underrepresented classes.
