In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Step 1: Load the dataset
# The glass CSV is read over HTTP straight from the GitHub raw endpoint.
# NOTE(review): later cells index the frame by the 'Id' and 'Type' column
# names, so the file is assumed to ship a header row containing them — confirm.
url = "https://raw.githubusercontent.com/dsrscientist/dataset3/main/glass.csv"
glass_data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Step 2: Data analysis and preprocessing
print("Dataset Information:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the summary.
glass_data.info()

In [None]:
# Show how many rows fall into each value of the 'Type' column.
print("\nClass Distribution:", glass_data['Type'].value_counts(), sep="\n")

In [None]:
# Drop the 'Id' column so it is not handed to the model as a feature
# (every column other than 'Type' is later used as a predictor).
# errors='ignore' keeps this cell re-runnable: glass_data is reassigned here,
# so a second execution would otherwise raise KeyError because 'Id' has
# already been removed.
glass_data = glass_data.drop(columns=['Id'], errors='ignore')

In [None]:
# Step 3: Separate the target from the predictors
y = glass_data['Type']                 # target: the 'Type' column
X = glass_data.drop(columns='Type')    # predictors: every remaining column

In [None]:
# Step 4: Hold out 20% of the rows as a test set (fixed seed for repeatability)
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class distribution printed earlier turns out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Step 5: Feature scaling
# The scaler is fitted on the training rows only; the same fitted scaler is
# then applied to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Step 6: Hyperparameter tuning with cross-validation
# Candidate values per Random Forest setting; the search tries every combination.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Base estimator with a fixed seed, tuned by a 5-fold cross-validated grid
# search over the scaled training data.
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

In [None]:
# Report the parameter combination the grid search selected.
best_params = grid_search.best_params_
print("\nBest Hyperparameters:", best_params, sep="\n")

In [None]:
# Step 7: Obtain the Random Forest trained with the best hyperparameters
# GridSearchCV refits by default (refit=True): once the search finishes it
# clones the base estimator (random_state=42), applies best_params_ and fits
# it on the data passed to grid_search.fit(). That is the same model this cell
# used to rebuild by hand, so reuse it instead of training a second time.
rf_classifier = grid_search.best_estimator_
rf_classifier

In [None]:
# Step 8: Predict a class for every row of the scaled test set
y_pred = rf_classifier.predict(X=X_test_scaled)

In [None]:
# Step 9: Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)

In [None]:
# Step 10: Print the results
print("\nModel Performance:")
print("Accuracy:", accuracy)

In [None]:
print("Classification Report:")
print(classification_report)