In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold

# Load the dataset
data = load_iris()
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Grid Search object
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Perform Grid Search with cross-validation
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the Random Forest model with the best parameters
best_rf_classifier = RandomForestClassifier(**best_params, random_state=42)
best_rf_classifier.fit(X_train, y_train)

# Evaluate the model
accuracy = best_rf_classifier.score(X_test, y_test)
print("Test Accuracy:", accuracy)

# K-fold Cross-Validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(best_rf_classifier, X, y, cv=kfold)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Test Accuracy: 1.0
Cross-Validation Scores: [1.         0.96666667 0.93333333 1.         0.9       ]
Mean CV Accuracy: 0.9600000000000002


In [26]:
df = pd.read_csv('heart_v2.csv')
df

Unnamed: 0,age,sex,BP,cholestrol,heart disease
0,70,1,130,322,1
1,67,0,115,564,0
2,57,1,124,261,1
3,64,1,128,263,0
4,74,0,120,269,0
...,...,...,...,...,...
265,52,1,172,199,0
266,44,1,120,263,0
267,56,0,140,294,0
268,57,1,140,192,0


In [27]:
X = df.iloc[:,:-1].values
y = df.iloc[:,1].values


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [29]:
indices = np.random.choice(X_train,size=100,replace=True)
X_bootstrap = X_train[indices]
y_bootstrap = y_train[indices]

ValueError: a must be 1-dimensional

In [30]:
X_train

array([[ 49,   0, 134, 271],
       [ 59,   1, 170, 326],
       [ 53,   1, 130, 246],
       [ 50,   1, 140, 233],
       [ 54,   0, 110, 214],
       [ 59,   1, 160, 273],
       [ 46,   1, 140, 311],
       [ 45,   1, 142, 309],
       [ 58,   1, 132, 224],
       [ 59,   1, 135, 234],
       [ 52,   1, 128, 255],
       [ 51,   1, 140, 261],
       [ 51,   1, 110, 175],
       [ 41,   1, 135, 203],
       [ 41,   1, 110, 172],
       [ 76,   0, 140, 197],
       [ 64,   1, 110, 211],
       [ 67,   1, 120, 237],
       [ 55,   0, 135, 250],
       [ 60,   0, 158, 305],
       [ 69,   1, 160, 234],
       [ 62,   1, 128, 208],
       [ 41,   0, 105, 198],
       [ 58,   1, 100, 234],
       [ 68,   1, 118, 277],
       [ 42,   1, 130, 180],
       [ 51,   0, 130, 256],
       [ 57,   1, 110, 201],
       [ 44,   1, 130, 219],
       [ 54,   1, 124, 266],
       [ 58,   1, 125, 300],
       [ 62,   1, 120, 267],
       [ 54,   1, 125, 273],
       [ 42,   0, 120, 209],
       [ 56,  

In [31]:
X_train.shape[0]

216

In [32]:
n_trees = 10
trees = []
for i in range(n_trees):
    indices = np.random.choice(X_train.shape[0],size=100,replace=True)
    X_bootstrap = X_train[indices]
    y_bootstrap = y_train[indices]
    model = LogisticRegression()
    model.fit(X_bootstrap,y_bootstrap)
    trees.append(model)
    

In [53]:
predictions = [tree.predict(X_test) for tree in trees]
y_pred = np.round(np.mean(predictions, axis=0))
print(y_pred)


[1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1.
 1. 1. 0. 0. 1. 1.]


In [44]:
print(y_test)

[1 1 1 0 0 1 1 1 1 0 0 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 0 0 0 0 1 0 1 1 1
 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 1 1]


In [55]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test,y_pred)
ac = accuracy_score(y_test,y_pred)
print(cm)
print(ac)


[[22  0]
 [ 0 32]]
1.0
