In [None]:
# data_visualization_glass.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# === PATH SETUP ===
# The data directory defaults to the original location but can be overridden
# through the GLASS_DATA_DIR environment variable, so the notebook is not tied
# to one machine's drive layout.
base_path = os.environ.get(
    "GLASS_DATA_DIR",
    r"D:\DATA SCIENCE\ASSIGNMENTS\14 random forest\Random Forest",
)
# Input workbook; expected to live directly inside base_path.
file_path = os.path.join(base_path, "glass.xlsx")

In [None]:
# === LOAD DATA ===
# Read the worksheet named "glass" from the workbook into a DataFrame.
df = pd.read_excel(io=file_path, sheet_name="glass")

In [None]:
# === BASIC INFO ===
# Quick orientation: dimensions, column names and the distinct class labels.
dataset_shape = df.shape
print("Shape of dataset:", dataset_shape)
column_names = df.columns.tolist()
print("Columns:", column_names)
glass_types = df['Type'].unique()
print("\nGlass Types:", glass_types)

In [None]:
# Separate features and target
# Features are selected by name rather than by position, so the target column
# is excluded even if 'Type' is not the last column of the sheet.
target = 'Type'
features = df.columns.drop(target)

In [None]:
# === HISTOGRAMS ===
# DataFrame.hist creates its own figure (sized via figsize), so no separate
# plt.figure() call is made: a preceding plt.figure() would only leave an
# empty figure behind that plt.close() below never closes.
df[features].hist(bins=15, figsize=(12, 8), color='skyblue', edgecolor='black')
plt.suptitle("Feature Distributions - Glass Dataset", fontsize=14)
plt.tight_layout()
plt.savefig(os.path.join(base_path, "histograms.png"))
# Close the histogram figure once it has been written to disk.
plt.close()

In [None]:
# === BOXPLOTS ===
# One boxplot per feature, laid out on a 3x3 grid of a single figure.
box_fig = plt.figure(figsize=(15, 10))
for position, column in enumerate(features, start=1):
    box_ax = box_fig.add_subplot(3, 3, position)
    sns.boxplot(y=df[column], color='lightcoral', ax=box_ax)
    box_ax.set_title(column, fontsize=10)
box_fig.tight_layout()
box_fig.savefig(os.path.join(base_path, "boxplots.png"))
plt.close(box_fig)

In [None]:
# === PAIRPLOT (Visualizing Feature Relationships by Glass Type) ===
# Scatter matrix of all columns, coloured by glass type, with histograms on
# the diagonal.
sns.pairplot(df, hue='Type', palette='husl', diag_kind='hist')
# The title is placed slightly above the figure (y=1.02) so it does not
# overlap the top row of panels.
plt.suptitle("Pairplot - Relationships Between Features and Glass Type", y=1.02, fontsize=14)
# bbox_inches="tight" expands the saved area to include the title; without it
# anything positioned outside the figure canvas is cut off in the PNG.
plt.savefig(os.path.join(base_path, "pairplot.png"), bbox_inches="tight")
plt.close()

In [None]:
# === CORRELATION HEATMAP ===
# Pairwise correlations between the feature columns, annotated to two decimals.
heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
corr = df[features].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=heat_ax)
heat_ax.set_title("Correlation Heatmap - Glass Dataset", fontsize=14)
heat_fig.tight_layout()
heat_fig.savefig(os.path.join(base_path, "correlation_heatmap.png"))
plt.close(heat_fig)

In [None]:
# Report where the figures were written and which files were produced.
saved_plots = ("histograms.png", "boxplots.png", "pairplot.png", "correlation_heatmap.png")
print("\n✅ Data Visualization Completed.")
print("Visual files saved in:", base_path)
for plot_name in saved_plots:
    print(" - " + plot_name)

In [None]:
# === BASIC INSIGHTS ===
print("\n--- ANALYSIS INSIGHTS ---")
print("1. Some features like 'K', 'Ba', and 'Fe' have strong skew (many zeros).")
print("2. 'RI', 'Na', 'Mg', and 'Ca' show wider variation across glass types.")
print("3. Pairplot reveals partial separation between certain glass types using 'Mg' and 'Al'.")
print("4. Correlation heatmap shows:")
print("   - 'Al' and 'Mg' are negatively correlated.")
print("   - 'RI' and 'Si' show mild inverse relationship.")
print("   - 'Ca' correlates positively with 'RI' and negatively with 'Al'.")
print("\nThese patterns will help Random Forest identify which elements drive glass classification.")

In [None]:
# preprocessing_glass_randomforest_safe.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os
import warnings

In [None]:
# === PATH SETUP ===
# The data directory defaults to the original location but can be overridden
# through the GLASS_DATA_DIR environment variable, so the notebook is not tied
# to one machine's drive layout.
base_path = os.environ.get(
    "GLASS_DATA_DIR",
    r"D:\DATA SCIENCE\ASSIGNMENTS\14 random forest\Random Forest",
)
# Input workbook and the CSV the preprocessing result is written to.
file_path = os.path.join(base_path, "glass.xlsx")
processed_path = os.path.join(base_path, "glass_processed.csv")

In [None]:
# === LOAD DATA ===
# Read the worksheet named "glass" from the workbook into a DataFrame.
df = pd.read_excel(io=file_path, sheet_name="glass")

In [None]:
# Confirm the load and show the number of missing values per column.
print("✅ Dataset loaded successfully.")
print("Shape:", df.shape)
print("\n--- Missing Values Check ---")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# =========================================================
# 1️⃣ HANDLE MISSING VALUES
# =========================================================
# Fill gaps with each column's median, but only when something is missing.
if df.isnull().sum().sum() == 0:
    print("\nNo missing values found — no imputation needed.")
else:
    print("\nMissing values detected. Applying median imputation.")
    # numeric_only=True restricts the medians to numeric columns; without it,
    # recent pandas versions raise a TypeError as soon as the frame contains a
    # non-numeric column. Non-numeric columns are left unchanged by fillna.
    df = df.fillna(df.median(numeric_only=True))

In [None]:
# =========================================================
# 2️⃣ ENCODE CATEGORICAL VARIABLES
# =========================================================
# Object-typed columns are one-hot encoded (first level dropped); if there are
# none, the frame is left as it is.
cat_cols = df.select_dtypes(include=['object']).columns
if len(cat_cols) == 0:
    print("\nNo categorical columns — encoding not required.")
else:
    print("\nCategorical columns found:", list(cat_cols))
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [None]:
# =========================================================
# 3️⃣ FEATURE SCALING
# =========================================================
# Target vector is the 'Type' column; the feature matrix is everything else.
y = df['Type']
X = df.drop('Type', axis=1)

In [None]:
# Standardize every feature column and wrap the resulting array back into a
# DataFrame carrying the original column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, columns=X.columns)

In [None]:
# Sanity check on the scaled features: per-column mean and standard deviation,
# rounded to three decimals.
print("\nFeature scaling (Standardization) applied successfully.")
scaled_means = X_scaled.mean().round(3)
print("Mean of scaled features (approx):\n", scaled_means)
scaled_stds = X_scaled.std().round(3)
print("Std dev of scaled features (approx):\n", scaled_stds)

In [None]:
# =========================================================
# 4️⃣ HANDLE IMBALANCED DATA (try SMOTE, else fallback)
# =========================================================
# Show how many samples each class has before any resampling.
print("\n--- Target Distribution Before Balancing ---")
counts_before_balancing = y.value_counts()
print(counts_before_balancing)

In [None]:
# Probe for imbalanced-learn. use_smote records whether SMOTE could be
# imported; any import-time failure is reported and treated as "not available".
try:
    from imblearn.over_sampling import SMOTE
except Exception as import_error:
    use_smote = False
    print("\nNote: imbalanced-learn / SMOTE not available or failed to import.")
    print("Reason:", str(import_error))
    print("Proceeding without SMOTE (data will remain unbalanced).")
else:
    use_smote = True

In [None]:
if not use_smote:
    # SMOTE is unavailable: keep the scaled, unbalanced data.
    final_X, final_y = X_scaled, y
else:
    # Attempt oversampling; any runtime failure (e.g. too few samples for
    # k_neighbors) falls back to the unbalanced data with a warning.
    try:
        from collections import Counter
        class_counts = Counter(y)
        n_min = min(class_counts.values())
        # Use 3 neighbours by default; for a smallest class of 3 or fewer
        # samples, shrink to n_min - 1 (but never below 1).
        k_neighbors = 3 if n_min > 3 else max(1, n_min - 1)
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_bal, y_bal = smote.fit_resample(X_scaled, y)
        print("\nSMOTE applied successfully.")
        print("--- Target Distribution After SMOTE Balancing ---")
        print(pd.Series(y_bal).value_counts())
        final_X, final_y = X_bal, y_bal
    except Exception as smote_error:
        warnings.warn(f"SMOTE failed at fit_resample: {smote_error}. Proceeding without SMOTE.")
        final_X, final_y = X_scaled, y

In [None]:
# =========================================================
# SAVE PROCESSED DATA
# =========================================================
# pd.concat(axis=1) aligns rows on index labels. The features and the target
# are in the same row order, so both indices are reset to pair them by
# position; otherwise differing index labels would yield NaN-padded rows.
features_out = pd.DataFrame(final_X, columns=X.columns).reset_index(drop=True)
target_out = pd.Series(final_y, name="Type").reset_index(drop=True)
processed_df = pd.concat([features_out, target_out], axis=1)
# Written without the index column.
processed_df.to_csv(processed_path, index=False)

In [None]:
# Final status: output location and dimensions of the saved frame.
print("\n✅ Data Preprocessing Completed Successfully.")
print("Processed file saved as:", processed_path)
final_shape = processed_df.shape
print("Final shape:", final_shape)

In [None]:
# eda_glass_randomforest.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# === PATH SETUP ===
# The data directory defaults to the original location but can be overridden
# through the GLASS_DATA_DIR environment variable, so the notebook is not tied
# to one machine's drive layout.
base_path = os.environ.get(
    "GLASS_DATA_DIR",
    r"D:\DATA SCIENCE\ASSIGNMENTS\14 random forest\Random Forest",
)
# Input workbook; expected to live directly inside base_path.
file_path = os.path.join(base_path, "glass.xlsx")

In [None]:
# === LOAD DATA ===
# Read the worksheet named "glass" from the workbook into a DataFrame.
df = pd.read_excel(io=file_path, sheet_name="glass")

In [None]:
# === BASIC STRUCTURE ===
print("Shape of dataset:", df.shape)
print("\n--- Dataset Info ---")
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# === CHECK FOR MISSING VALUES ===
# Count of missing entries in each column.
print("\n--- Missing Values ---")
missing_counts = df.isna().sum()
print(missing_counts)

In [None]:
# === CHECK FOR DUPLICATES ===
# Number of rows that exactly repeat an earlier row.
duplicate_row_count = df.duplicated().sum()
print("\nDuplicate rows:", duplicate_row_count)

In [None]:
# === DESCRIPTIVE STATISTICS ===
# describe() output transposed so each column of the data becomes one row.
print("\n--- Summary Statistics ---")
summary_table = df.describe().transpose()
print(summary_table)

In [None]:
# === TARGET DISTRIBUTION ===
# Number of samples per glass type.
print("\n--- Target Value Counts (Type) ---")
type_counts = df['Type'].value_counts()
print(type_counts)

In [None]:
# === HISTOGRAMS ===
# Feature columns are every column except the target, selected by name so the
# target is excluded even if 'Type' is not the last column of the sheet.
num_cols = df.columns.drop('Type')
# DataFrame.hist creates its own figure, which the pyplot calls below act on.
df[num_cols].hist(bins=15, figsize=(12, 8))
plt.suptitle("Feature Distributions - Glass Dataset", fontsize=14)
plt.savefig(os.path.join(base_path, "histograms.png"))
plt.close()

In [None]:
# === BOXPLOTS FOR OUTLIERS ===
# One boxplot per feature on a 3x3 grid, to make outliers visible.
outlier_fig = plt.figure(figsize=(14, 8))
for position, column in enumerate(num_cols, start=1):
    outlier_ax = outlier_fig.add_subplot(3, 3, position)
    sns.boxplot(y=df[column], color='skyblue', ax=outlier_ax)
    outlier_ax.set_title(column)
outlier_fig.tight_layout()
outlier_fig.savefig(os.path.join(base_path, "boxplots.png"))
plt.close(outlier_fig)

In [None]:
# === CORRELATION MATRIX ===
# Pairwise correlations between the feature columns, drawn as an annotated heatmap.
corr = df[num_cols].corr()
matrix_fig, matrix_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=matrix_ax)
matrix_ax.set_title("Correlation Matrix - Glass Dataset")
matrix_fig.savefig(os.path.join(base_path, "correlation_matrix.png"))
plt.close(matrix_fig)

# === PAIRPLOT (OPTIONAL - for visual patterns) ===
# The section header above must be a comment; as a bare line it is not valid
# Python and stops the cell from running.
# Scatter matrix of all columns, coloured by glass type.
sns.pairplot(df, hue='Type')
plt.savefig(os.path.join(base_path, "pairplot.png"))
plt.close()

In [None]:
# Report where the EDA figures were written and which files were produced.
eda_plot_files = ("histograms.png", "boxplots.png", "correlation_matrix.png")
print("\nEDA completed successfully.")
print("Plots saved in:", base_path)
print("Files created:")
for plot_file in eda_plot_files:
    print(" - " + plot_file)
# print(" - pairplot.png (optional, uncomment if needed)")