In [3]:

import pandas as pd

# Assuming glass.xlsx is in the current working directory.
# If it's in a different location, provide the full path.
try:
  df = pd.read_excel('glass.xlsx')
  print(df.head()) # Display the first few rows to confirm successful load
except FileNotFoundError:
  print("Error: 'glass.xlsx' not found. Please ensure the file exists in the correct location.")
except Exception as e:
  print(f"An error occurred: {e}")

  Prepare a model for glass classification using Random Forest
0                                  Data Description:          
1                              RI : refractive index          
2  Na: Sodium (unit measurement: weight percent i...          
3                                      Mg: Magnesium          
4                                       AI: Aluminum          


In [19]:
#  1. Exploratory Data Analysis (EDA):
# Perform exploratory data analysis to understand the structure of the dataset.
# Check for missing values, outliers, inconsistencies in the data.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

try:
  df = pd.read_excel('glass.xlsx')
  print(df.head()) # Display the first few rows to confirm successful load

  # Check for missing values
  print("\nMissing Values:")
  print(df.isnull().sum())

  # Summary statistics
  print("\nSummary Statistics:")
  print(df.describe())

  # Data types of each column
  print("\nData Types:")
  print(df.dtypes)

  # Explore potential outliers using box plots
  print("\nBox Plots for Numerical Features:")
  numerical_cols = df.select_dtypes(include=['number']).columns
  for col in numerical_cols:
      plt.figure(figsize=(8, 6))
      sns.boxplot(x=df[col])
      plt.title(f'Box Plot of {col}')
      plt.show()


  # Check for inconsistencies (e.g., unique values in categorical columns)
  print("\nUnique Values in Categorical Columns:")
  categorical_cols = df.select_dtypes(include=['object']).columns  # Identify object type columns
  for col in categorical_cols:
      print(f"\nColumn: {col}")
      print(df[col].value_counts())

except FileNotFoundError:
  print("Error: 'glass.xlsx' not found. Please ensure the file exists in the correct location.")
except Exception as e:
  print(f"An error occurred: {e}")


  Prepare a model for glass classification using Random Forest
0                                  Data Description:          
1                              RI : refractive index          
2  Na: Sodium (unit measurement: weight percent i...          
3                                      Mg: Magnesium          
4                                       AI: Aluminum          

Missing Values:
Prepare a model for glass classification using Random Forest    1
dtype: int64

Summary Statistics:
       Prepare a model for glass classification using Random Forest
count                                                  18          
unique                                                 18          
top                                     Data Description:          
freq                                                    1          

Data Types:
Prepare a model for glass classification using Random Forest    object
dtype: object

Box Plots for Numerical Features:

Unique Values in Categorical Co

In [21]:
#  2: Data Visualization:
# Create visualizations such as histograms, box plots, or pair plots to visualize the distributions and relationships between features.
# Analyze any patterns or correlations observed in the data.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


try:
    df = pd.read_excel('glass.xlsx')

    # Histograms
    print("\nHistograms for Numerical Features:")
    numerical_cols = df.select_dtypes(include=['number']).columns
    for col in numerical_cols:
        plt.figure(figsize=(8, 6))
        sns.histplot(df[col], kde=True)  # Added KDE for better visualization
        plt.title(f'Histogram of {col}')
        plt.show()

    # Pair plots
    print("\nPair Plot for Numerical Features:")
    sns.pairplot(df[numerical_cols], hue='Type', diag_kind='kde') #hue for better visualization
    plt.show()


    # Box plots (already present in your code)
    print("\nBox Plots for Numerical Features:")
    for col in numerical_cols:
        plt.figure(figsize=(8, 6))
        sns.boxplot(x=df[col])
        plt.title(f'Box Plot of {col}')
        plt.show()

except FileNotFoundError:
    print("Error: 'glass.xlsx' not found. Please ensure the file exists in the correct location.")
except Exception as e:
    print(f"An error occurred: {e}")



Histograms for Numerical Features:

Pair Plot for Numerical Features:
An error occurred: No variables found for grid columns.


In [23]:
#3: Data Preprocessing
# 1. Check for missing values in the dataset and decide on a strategy for handling them.Implement the chosen strategy (e.g., imputation or removal) and explain your reasoning.
# 2. If there are categorical variables, apply encoding techniques like one-hot encoding to convert them into numerical format.
# 3. Apply feature scaling techniques such as standardization or normalization to ensure that all features are on a similar scale. Handling the imbalance data.

import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE

try:
    df = pd.read_excel('glass.xlsx')

    # 1. Handling Missing Values (if any)
    # Check for missing values
    print("\nMissing Values Before Handling:")
    print(df.isnull().sum())

    # Impute missing values with the mean for numerical features
    numerical_cols = df.select_dtypes(include=['number']).columns
    imputer = SimpleImputer(strategy='mean')
    df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

    print("\nMissing Values After Handling:")
    print(df.isnull().sum())


    # 2. Encoding Categorical Features (if any)
    # One-hot encode the 'Type' column (assuming it's categorical)
    categorical_cols = ['Type']  # Specify your categorical columns here
    ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_cols)], remainder='passthrough')
    df = pd.DataFrame(ct.fit_transform(df), columns=ct.get_feature_names_out())

    # 3. Feature Scaling
    # Scale numerical features using standardization
    numerical_cols = [col for col in df.columns if col not in ct.get_feature_names_out(categorical_cols)]
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


    # 4. Handling Class Imbalance (if needed)
    # Separate features (X) and target (y)
    X = df.drop(columns=[col for col in df.columns if 'Type' in col]) # Assuming 'Type' columns are generated by OneHotEncoder
    y = df[[col for col in df.columns if 'Type' in col]]

    # Apply SMOTE to oversample the minority classes
    smote = SMOTE(random_state=42) # Use random_state for reproducibility
    X_resampled, y_resampled = smote.fit_resample(X, y)


    # Combine resampled features and target into a new DataFrame
    df_resampled = pd.concat([pd.DataFrame(X_resampled), y_resampled], axis=1)


    print("\nFirst few rows of the preprocessed DataFrame:")
    print(df_resampled.head())

except FileNotFoundError:
    print("Error: 'glass.xlsx' not found.")
except Exception as e:
    print(f"An error occurred: {e}")



Missing Values Before Handling:
Prepare a model for glass classification using Random Forest    1
dtype: int64
An error occurred: at least one array or dtype is required


In [25]:
#  4: Random Forest Model Implementation
# 1. Divide the data into train and test split.
# 2. Implement a Random Forest classifier using Python and a machine learning library like scikit-learn.
# 3. Train the model on the train dataset. Evaluate the performance on test data using metrics like accuracy, precision, recall, and F1-score.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# 5. Random Forest Model Training and Evaluation
rf_classifier = RandomForestClassifier(random_state=42) # You can tune hyperparameters here
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted') # Use 'weighted' for multi-class
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')


print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9473684210526315
Precision: 0.9475975861278741
Recall: 0.9473684210526315
F1-score: 0.9474488711132564

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93        64
           1       0.96      0.95      0.96       107

    accuracy                           0.95       171
   macro avg       0.94      0.95      0.94       171
weighted avg       0.95      0.95      0.95       171



In [27]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Load example dataset (replace with your own data if needed)
data = load_breast_cancer()
X, y = data.data, data.target

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Apply SMOTE to training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Bagging
bagging_classifier = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging_classifier.fit(X_train_resampled, y_train_resampled)
y_pred_bagging = bagging_classifier.predict(X_test)
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
print(f"Bagging Accuracy: {accuracy_bagging}")
print(f"Bagging Classification Report:\n{classification_report(y_test, y_pred_bagging)}")

# AdaBoost
ada_boost_classifier = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_boost_classifier.fit(X_train_resampled, y_train_resampled)
y_pred_adaboost = ada_boost_classifier.predict(X_test)
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
print(f"AdaBoost Accuracy: {accuracy_adaboost}")
print(f"AdaBoost Classification Report:\n{classification_report(y_test, y_pred_adaboost)}")

# Gradient Boosting
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_classifier.fit(X_train_resampled, y_train_resampled)
y_pred_gb = gb_classifier.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f"Gradient Boosting Accuracy: {accuracy_gb}")
print(f"Gradient Boosting Classification Report:\n{classification_report(y_test, y_pred_gb)}")

# Comparing the results
print("\nComparison of Accuracy:")
print(f"Bagging: {accuracy_bagging}")
print(f"AdaBoost: {accuracy_adaboost}")
print(f"Gradient Boosting: {accuracy_gb}")


Bagging Accuracy: 0.9473684210526315
Bagging Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.93        64
           1       0.98      0.93      0.96       107

    accuracy                           0.95       171
   macro avg       0.94      0.95      0.94       171
weighted avg       0.95      0.95      0.95       171





AdaBoost Accuracy: 0.9532163742690059
AdaBoost Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        64
           1       0.96      0.96      0.96       107

    accuracy                           0.95       171
   macro avg       0.95      0.95      0.95       171
weighted avg       0.95      0.95      0.95       171

Gradient Boosting Accuracy: 0.9649122807017544
Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95        64
           1       0.96      0.98      0.97       107

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171


Comparison of Accuracy:
Bagging: 0.9473684210526315
AdaBoost: 0.9532163742690059
Gradient Boosting: 0.9649122807017544


In [15]:
# Additional Notes:
# 1. Explain Bagging and Boosting methods. How is it different from each other.
# 2. Explain how to handle imbalance in the data.

# Explanations of Bagging and Boosting, and handling class imbalance

# 1. Bagging and Boosting

# Bagging (Bootstrap Aggregating):
# - Multiple base learners (e.g., decision trees) are trained on different subsets of the training data, created by random sampling with replacement (bootstrapping).
# - Each base learner makes predictions, and the final prediction is an aggregate (usually the average or majority vote) of the individual predictions.
# - Reduces variance by averaging out the errors of individual models.
# - Example: Random Forest.

# Boosting:
# - Base learners are trained sequentially.
# - Each subsequent learner focuses on the data points that the previous learners misclassified.
# - Data points are weighted, with more weight given to misclassified points.
# - Reduces bias by combining weak learners into a strong learner.
# - Example: AdaBoost, Gradient Boosting Machines (GBM), XGBoost, LightGBM, CatBoost.


# Differences:
# - Data sampling: Bagging uses random sampling with replacement, while boosting assigns weights to data points.
# - Learner training: Bagging trains learners independently in parallel, while boosting trains them sequentially.
# - Error handling: Bagging reduces variance by averaging out errors, while boosting reduces bias by focusing on misclassifications.


# 2. Handling Class Imbalance

# Class imbalance occurs when one class has significantly more examples than others in a dataset. This can lead to biased models that perform poorly on the minority class.

# Methods to handle imbalance:

# a. Oversampling: Increase the number of examples in the minority class.
#    - SMOTE (Synthetic Minority Over-sampling Technique): Creates synthetic examples by interpolating between existing minority class examples.  Used in the provided code.
#    - Random oversampling: Duplicates existing minority class examples.

# b. Undersampling: Decrease the number of examples in the majority class.
#    - Random undersampling: Randomly removes majority class examples.
#    - NearMiss: Selects majority class examples based on their distance to minority class examples.

# c. Hybrid methods: Combine oversampling and undersampling.

# d. Cost-sensitive learning: Assign different misclassification costs to different classes.  The model is penalized more heavily for misclassifying the minority class.

# e. Algorithm selection: Choose algorithms that are less sensitive to class imbalance (e.g., decision trees, random forests).


# In the provided code:
# - SMOTE is used for oversampling the minority classes before training the Random Forest Classifier.

# Further considerations:

# - Evaluate using appropriate metrics: Accuracy is not a good metric for imbalanced datasets.  Use precision, recall, F1-score, AUC-ROC, and AUC-PR instead.
# - Experiment with different methods and hyperparameters:  Try different oversampling/undersampling techniques and different algorithms.
# - Cross-validation: Use techniques like stratified k-fold cross-validation to get a better estimate of model performance.


