# Glass Classification using Random Forest Classifier with Missing Value Handling

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler

# Read the glass CSV into a DataFrame; later cells refer to it as `data`
data = pd.read_csv(filepath_or_buffer='/mnt/data/glass.csv')

# Preview the top rows (shown as the cell's output in a notebook)
data.head()
    


### Handling Missing Values:
We will count the missing values in every column and then drop the rows whose target column `Type` is missing. Missing values in feature columns are reported but not removed.
    

In [None]:

# Count the null entries per column and report them
missing_values = data.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

# Keep only the rows whose target 'Type' is present
data = data[data['Type'].notna()]

# Report the per-column null counts again after the filter.
# Only 'Type' was filtered, so feature columns may still show nulls here.
print("Missing values after handling:", data.isna().sum(), sep="\n")
    


### Exploratory Data Analysis (EDA) and Feature Engineering
We will visualize the distributions of the numeric features, check for outliers with boxplots, inspect the correlations between columns, and apply feature scaling.
    

In [None]:

# 1. One histogram per numeric column, all on a single figure
data.hist(figsize=(12, 10), bins=15)
plt.suptitle('Distribution of Numeric Features')
plt.show()

# 2. Boxplots for outlier detection: one figure per listed feature
#    (only RI and Na are plotted; add entries here to cover more columns)
boxplot_titles = {
    'RI': 'Boxplot of Refractive Index (RI)',
    'Na': 'Boxplot of Sodium (Na)',
}
for column, title in boxplot_titles.items():
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=data[column])
    plt.title(title)
    plt.show()

# 3. Annotated heatmap of the pairwise correlations between columns
plt.figure(figsize=(10, 8))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

# Feature Scaling (Standardization)
# Split off the predictors first so the scaled columns are labelled from the
# frame that is actually scaled, instead of assuming 'Type' is the last column.
feature_frame = data.drop('Type', axis=1)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

# Create a new DataFrame for the scaled features.
# Reuse data's index: rows may have been dropped earlier, so data's index can
# have gaps. Assigning data['Type'] aligns on index labels, and a fresh
# 0..n-1 index would yield NaN targets and labels attached to the wrong rows.
scaled_data = pd.DataFrame(
    scaled_features,
    columns=feature_frame.columns,
    index=data.index,
)
scaled_data['Type'] = data['Type']
scaled_data.head()
    


### Random Forest Classifier:
We will split the cleaned data into training and testing sets, train a Random Forest classifier, and evaluate its performance.
    

In [None]:

# Separate the predictors (X) from the target column (y)
y = scaled_data['Type']
X = scaled_data.drop(columns='Type')

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a Random Forest with default hyperparameters and a fixed seed
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

# Score the held-out rows; precision/recall/F1 are macro-averaged over classes
y_pred = rf_clf.predict(X_test)
macro_avg = {'average': 'macro'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **macro_avg)
recall = recall_score(y_test, y_pred, **macro_avg)
f1 = f1_score(y_test, y_pred, **macro_avg)

# Print the summary metrics, then the per-class report
metric_summary = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
)
for metric_label, metric_value in metric_summary:
    print(f"{metric_label}: {metric_value}")
print(classification_report(y_test, y_pred))
    