In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd

In [2]:
# Load the raw dataset.
# NOTE(review): absolute, machine-specific path kept unchanged; consider a
# configurable data directory so the notebook runs on other machines.
data = pd.read_csv(r'D:\mini project\sem VI\project\bot\data\raw\data.csv')

# Pull the label out BEFORE one-hot encoding.
# pd.get_dummies() encodes every object/string/category column and appends the
# resulting dummy columns after the untouched ones. Encoding the whole frame
# (label included) therefore moves columns around: the last column may no
# longer be the label, a fixed positional index may point at a different
# column, and the label can end up inside the feature matrix (target leakage).
# Assumes the label is the last column of the CSV — TODO confirm against the data.
label_column = data.columns[-1]
labels = data[label_column]

# Apply One-Hot Encoding to the feature columns only.
features = pd.get_dummies(data.drop(columns=label_column), drop_first=True)

# Re-assemble `data` as "encoded features + label in the final column" so the
# positional slices below are consistent with each other by construction.
data = pd.concat([features, labels], axis=1)

# Split data into features (X) and target variable (y)
X = data.iloc[:, :-1].values  # All columns except the last one
y = data.iloc[:, -1].values   # Last column is the label

In [3]:
# Hold out 20% of the samples for the final evaluation; the fixed seed keeps
# the partition reproducible between runs.
# NOTE(review): the split is not stratified; if class sizes are uneven,
# passing stratify=y may give a more representative test set — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training portion only, then apply that same fitted
# transformation to both portions so the test set never influences the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# Base estimator handed to the grid search; seeded for repeatable results.
model = DecisionTreeClassifier(random_state=42)

# Candidate hyper-parameter values explored by the grid search
# (2 * 4 * 3 * 3 * 3 = 216 combinations).
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10, 15],          # cap tree depth to curb overfitting
    min_samples_split=[2, 5, 10],         # larger values -> fewer splits
    min_samples_leaf=[1, 2, 4],           # larger values -> bigger leaves
    max_features=[None, 'sqrt', 'log2'],  # features considered at each split
)

In [5]:
# Search every combination in param_grid with 5-fold cross-validation on the
# scaled training data (n_jobs=-1 requests all available processors).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

# Show the winning hyper-parameter combination.
print("Best Parameters:", grid_search.best_params_)

# Predict the held-out test set with the best estimator found by the search.
best_tree = grid_search.best_estimator_
y_pred = best_tree.predict(X_test_scaled)

# Fraction of test samples predicted correctly, reported as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Data: {accuracy * 100:.2f}%")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy on Test Data: 100.00%


In [6]:
# Print each evaluation summary under its heading: first the confusion
# matrix, then the classification report, both computed on the test set.
for heading, build_report in (
    ("Confusion Matrix:", confusion_matrix),
    ("Classification Report:", classification_report),
):
    print(heading)
    print(build_report(y_test, y_pred))

Confusion Matrix:
[[ 48   0   0   0   0   0]
 [  0  33   0   0   0   0]
 [  0   0   9   0   0   0]
 [  0   0   0 125   0   0]
 [  0   0   0   0  15   0]
 [  0   0   0   0   0  10]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        48
           1       1.00      1.00      1.00        33
           2       1.00      1.00      1.00         9
           3       1.00      1.00      1.00       125
           4       1.00      1.00      1.00        15
           5       1.00      1.00      1.00        10

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240

