# Glass Classification using Random Forest Classifier with Enhanced Visualizations

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler

# Read the glass CSV into a DataFrame; every later cell works off `data`.
data = pd.read_csv(filepath_or_buffer='/mnt/data/glass.csv')

# Preview the top of the table (5 rows) to sanity-check the load.
data.head(5)
    


### Handling Missing Values:
We will check and handle any missing values in the dataset, particularly in the target column `Type`.
    

In [None]:

# Per-column count of missing entries before any cleaning.
missing_values = data.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

# Keep only the rows whose target label 'Type' is present; missing values
# in the feature columns are left untouched here.
has_label = data['Type'].notna()
data = data[has_label]

# Re-count so the effect of the filtering is visible in the output.
remaining_missing = data.isna().sum()
print("Missing values after handling:", remaining_missing, sep="\n")
    


### Exploratory Data Analysis (EDA) with Enhanced Visualizations:
We will explore the distributions of numerical variables and visualize relationships between features; feature scaling is applied in a later section.
    

In [None]:

# 1. Pairplot: pairwise relationships between the numeric columns, coloured
#    by glass type, with KDE curves on the diagonal.
sns.pairplot(data, hue='Type', diag_kind='kde')
# y slightly above 1 lifts the title clear of the top row of axes.
plt.suptitle('Pairplot of Numerical Features with Type of Glass', y=1.02)
plt.show()

# 2. Overlaid KDE plots for three continuous variables (RI, Na, Mg).
#    `fill=True` is the supported spelling of the deprecated `shade=True`,
#    which seaborn has scheduled for removal.
plt.figure(figsize=(10, 6))
for feature in ('RI', 'Na', 'Mg'):
    sns.kdeplot(data[feature], fill=True, label=feature)
plt.title('KDE Plot of Continuous Features')
plt.legend()
plt.show()

# 3. Bar chart of how many samples fall into each glass type.
plt.figure(figsize=(8, 6))
sns.countplot(x='Type', data=data)
plt.title('Distribution of Glass Types')
plt.show()
    


### Correlation Matrix and Outlier Detection:
We will analyze correlations between features and detect any potential outliers using boxplots.
    

In [None]:

# Heatmap of the pairwise correlations between all columns of `data`,
# with each cell annotated by its coefficient.
plt.figure(figsize=(10, 8))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# One boxplot per selected feature (RI and Na) to eyeball potential outliers.
boxplot_titles = (
    ('RI', 'Boxplot of Refractive Index (RI)'),
    ('Na', 'Boxplot of Sodium (Na)'),
)
for column, title in boxplot_titles:
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=data[column])
    plt.title(title)
    plt.show()
    


### Feature Scaling:
We will apply standardization to ensure that all features are on the same scale before training the model.
    

In [None]:

# Feature Scaling (Standardization)
# Separate the predictors from the target by name, so the code does not
# depend on 'Type' being the last column of the file.
feature_frame = data.drop('Type', axis=1)

scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

# Rebuild a DataFrame around the scaled array. Reusing the original index is
# essential: rows may have been dropped earlier, leaving gaps in `data.index`.
# With a fresh 0..n-1 index the 'Type' assignment below would align on the
# wrong labels and produce NaN or shifted targets.
scaled_data = pd.DataFrame(
    scaled_features,
    columns=feature_frame.columns,
    index=feature_frame.index,
)
scaled_data['Type'] = data['Type']
scaled_data.head()
    


### Random Forest Classifier:
We will split the cleaned data into training and testing sets, train a Random Forest classifier, and evaluate its performance.
    

In [None]:

# Predictors (X) are every scaled column except the label; y is the label.
X = scaled_data.drop(columns='Type')
y = scaled_data['Type']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a Random Forest with default hyper-parameters and a fixed seed.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

# Score the held-out set. Precision, recall and F1 are macro-averaged,
# i.e. the unweighted mean of the per-class scores.
y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the summary metrics, then the per-class breakdown.
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    sep="\n",
)
print(classification_report(y_test, y_pred))
    


### Feature Importance:
We will visualize the importance of features in the Random Forest model to understand which factors contribute most to the classification.
    

In [None]:

# Importance scores of the fitted forest, one per feature column of X.
importances = rf_clf.feature_importances_
# Positions of the features ordered from most to least important.
indices = np.flip(np.argsort(importances))

ranked_scores = importances[indices]
ranked_names = X.columns[indices]

# Horizontal bar chart with the most important feature at the top.
plt.figure(figsize=(12, 6))
sns.barplot(x=ranked_scores, y=ranked_names)
plt.title('Feature Importance from Random Forest Classifier')
plt.show()
    


### Bagging and Boosting Methods:
We will apply Bagging and Boosting techniques to improve model performance and compare the results with the original Random Forest classifier.
    

In [None]:

from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

# Bagging Classifier: an ensemble of 50 estimators modelled on the Random
# Forest above, each fit on a resampled view of the training set.
# The base model is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, while the positional form works on both.
bagging_clf = BaggingClassifier(rf_clf, n_estimators=50, random_state=42)
bagging_clf.fit(X_train, y_train)
y_bag_pred = bagging_clf.predict(X_test)
bagging_accuracy = accuracy_score(y_test, y_bag_pred)
print(f"Bagging Accuracy: {bagging_accuracy}")

# Boosting Classifier (AdaBoost) with its default base learner, evaluated on
# the same held-out split for a like-for-like accuracy comparison.
boosting_clf = AdaBoostClassifier(n_estimators=50, random_state=42)
boosting_clf.fit(X_train, y_train)
y_boost_pred = boosting_clf.predict(X_test)
boosting_accuracy = accuracy_score(y_test, y_boost_pred)
print(f"Boosting Accuracy: {boosting_accuracy}")
    