In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
%matplotlib inline

In [47]:
X1 = np.random.randint(1, 11, size=1000)
X2 = np.random.randint(50, 101, size=1000)
X3 = np.random.randint(18, 50, size=1000)
X4 = np.random.randint(1, 6, size=1000)
prob = 1 / (1 + np.exp(-(0.4 * X1 + 0.03 * X2 + 0.1 * X3 + 0.2 * X4 - 10)))
Y = (np.random.rand(1000) < prob).astype(int)

data = pd.DataFrame({
    'Hours_Studied': X1,
    'Previous_Score': X2,
    'Age': X3,
    'Practice_Tests': X4,
    'Pass/Fail': Y
})
data.head()

Unnamed: 0,Hours_Studied,Previous_Score,Age,Practice_Tests,Pass/Fail
0,7,78,34,4,0
1,5,71,37,3,0
2,9,77,30,1,0
3,5,59,37,2,0
4,6,100,41,1,1


In [48]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X_train, y_train)
pred = tree.predict(X_test)
print(f'{confusion_matrix(y_test, pred)}')
print(f'{classification_report(y_test, pred)}')

[[94 16]
 [12 78]]
              precision    recall  f1-score   support

           0       0.89      0.85      0.87       110
           1       0.83      0.87      0.85        90

    accuracy                           0.86       200
   macro avg       0.86      0.86      0.86       200
weighted avg       0.86      0.86      0.86       200



In [49]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
tree = RandomForestClassifier(n_estimators=300, max_depth=5)
tree.fit(X_train, y_train)
pred = tree.predict(X_test)
print(f'{confusion_matrix(y_test, pred)}')
print(f'{classification_report(y_test, pred)}')

[[99  4]
 [19 78]]
              precision    recall  f1-score   support

           0       0.84      0.96      0.90       103
           1       0.95      0.80      0.87        97

    accuracy                           0.89       200
   macro avg       0.90      0.88      0.88       200
weighted avg       0.89      0.89      0.88       200



In [50]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
tree = GradientBoostingClassifier(max_depth=3)
tree.fit(X_train, y_train)
pred = tree.predict(X_test)
print(f'{confusion_matrix(y_test, pred)}')
print(f'{classification_report(y_test, pred)}')

[[91  7]
 [15 87]]
              precision    recall  f1-score   support

           0       0.86      0.93      0.89        98
           1       0.93      0.85      0.89       102

    accuracy                           0.89       200
   macro avg       0.89      0.89      0.89       200
weighted avg       0.89      0.89      0.89       200



Grid search

In [54]:
#Random Forest

forest = RandomForestClassifier()

param_grid = {
    'n_estimators': [50, 100, 200], 
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4], 
}

grid_search_rf = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search_rf.fit(X_train, y_train)

print(f"Best parameters for Random Forest: {grid_search_rf.best_params_}")
print(f"Best cross-validation accuracy for Random Forest: {grid_search_rf.best_score_:.4f}")

y_pred_rf = grid_search_rf.best_estimator_.predict(X_test)
print(f"Classification Report for Random Forest:\n{classification_report(y_test, y_pred_rf)}")

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters for Random Forest: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best cross-validation accuracy for Random Forest: 0.9137
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.83      0.93      0.88        98
           1       0.92      0.82      0.87       102

    accuracy                           0.88       200
   macro avg       0.88      0.88      0.87       200
weighted avg       0.88      0.88      0.87       200



In [56]:
#adjusted Random Forest
tree = RandomForestClassifier(n_estimators=100, max_depth=10, max_features='sqrt', min_samples_leaf=2, min_samples_split=5)
tree.fit(X_train, y_train)
pred = tree.predict(X_test)
print(f'{confusion_matrix(y_test, pred)}')
print(f'{classification_report(y_test, pred)}')

[[92  6]
 [19 83]]
              precision    recall  f1-score   support

           0       0.83      0.94      0.88        98
           1       0.93      0.81      0.87       102

    accuracy                           0.88       200
   macro avg       0.88      0.88      0.87       200
weighted avg       0.88      0.88      0.87       200



In [55]:
random_forest_gb = GradientBoostingClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200], 
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7], 
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_rf = GridSearchCV(estimator=random_forest_gb, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

grid_search_rf.fit(X_train, y_train)

print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best accuracy for Random Forest:", grid_search_rf.best_score_)

best_rf = grid_search_rf.best_estimator_
test_accuracy_rf = best_rf.score(X_test, y_test)
print(f"Test set accuracy for Random Forest: {test_accuracy_rf:.4f}")

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.7s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.9s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.9s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.9s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.7s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; 

In [57]:
#Adjusted parameters for Gradient Forest
tree_gbc = GradientBoostingClassifier(learning_rate=0.1, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50)
tree_gbc.fit(X_train, y_train)
pred = tree_gbc.predict(X_test)
print(f'{confusion_matrix(y_test, pred)}')
print(f'{classification_report(y_test, pred)}')

[[91  7]
 [16 86]]
              precision    recall  f1-score   support

           0       0.85      0.93      0.89        98
           1       0.92      0.84      0.88       102

    accuracy                           0.89       200
   macro avg       0.89      0.89      0.88       200
weighted avg       0.89      0.89      0.88       200

