## RANDOM FOREST vs DECISION TREE

In [136]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree, Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split #Function to split dataset into training set and test set
from sklearn import metrics #Metric module for accuracy calculations

In [137]:
# Load the diabetes dataset from the CSV file.
#
# The first line of the file is a header row. Passing header=0 makes pandas
# consume that line as the header, while `names` replaces its labels with the
# shorter column names used throughout this notebook. Reading with
# header=None instead would load the header text as a data row, which forces
# every column to object (string) dtype and requires dropping the row by hand.
columns = ["pregnancies", "glucose", "bp", 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
pima_dataset = pd.read_csv("diabetes.csv", header=0, names=columns)

# Preview the first rows to confirm the columns were parsed as expected.
pima_dataset.head()

Unnamed: 0,pregnancies,glucose,bp,skin,insulin,bmi,pedigree,age,label
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0
5,0,137,40,35,168,43.1,2.288,33,1


In [138]:
# Separate the target from the predictors.
#   y -> target (the 'label' column)
#   x -> features (every column except 'label')
y = pima_dataset["label"]
x = pima_dataset.drop(columns="label")

# Any missing feature value (NaN) is replaced with 0.
x = x.fillna(0)

# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [139]:
# DECISION TREE
# Build a Decision Tree classifier (entropy criterion, depth capped at 6)
# and fit it on the training split; fit() returns the estimator itself.
dt_clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=6,
    random_state=42,
).fit(x_train, y_train)

# Predicted labels for the held-out test split.
y_pred_dt = dt_clf.predict(x_test)


In [140]:
# RANDOM FOREST
# Build a Random Forest classifier of 150 trees (depth capped at 20) and fit
# it on the training split; fit() returns the estimator itself.
rf_clf = RandomForestClassifier(
    n_estimators=150,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(x_train, y_train)

# Predicted labels for the held-out test split.
y_pred_rf = rf_clf.predict(x_test)

In [141]:
# COMPARE THE ACCURACY OF THE TWO MODELS
# Each pair holds the caption to print and the predictions it describes.
for caption, predictions in (
    ("Decision Tree Accuracy:", y_pred_dt),
    ("Random Forest Accuracy:", y_pred_rf),
):
    print(caption, metrics.accuracy_score(y_test, predictions))

# CLASSIFICATION REPORT
# Per-class precision / recall / F1 for each model, in the same order.
for caption, predictions in (
    ("Decision Tree Classification Report:", y_pred_dt),
    ("Random Forest Classification Report:", y_pred_rf),
):
    print(caption)
    print(metrics.classification_report(y_test, predictions))

Decision Tree Accuracy: 0.7619047619047619
Random Forest Accuracy: 0.8051948051948052
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.88      0.82       146
           1       0.73      0.55      0.63        85

    accuracy                           0.76       231
   macro avg       0.75      0.72      0.73       231
weighted avg       0.76      0.76      0.75       231

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       146
           1       0.79      0.64      0.71        85

    accuracy                           0.81       231
   macro avg       0.80      0.77      0.78       231
weighted avg       0.80      0.81      0.80       231



## GRID SEARCH FOR MAXIMIZING THE PERFORMANCE OF THE RANDOM FOREST CLASSIFIER

In [142]:
from sklearn.model_selection import GridSearchCV

# Grid search over four Random Forest hyperparameters:
#   1) number of estimators
#   2) max depth
#   3) min samples split
#   4) min samples leaf

# Candidate values for each hyperparameter; every combination is evaluated.
grid = {
    'n_estimators': [50, 70, 90, 100],
    'max_depth': [None, 6, 8, 10, 15],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2],
}

# Base estimator; the fixed seed keeps each candidate forest reproducible.
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search, using all available cores (n_jobs=-1) and
# logging progress (verbose=2).
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [143]:
# Run the grid search on the training data.
grid_search.fit(x_train, y_train)

# Show the best hyperparameter combination that was found.
print("The best parameters are:", grid_search.best_params_)

# Predict on the test split with the best model found by the search.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(x_test)

# Compute and print the test-set accuracy.
tuned_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", tuned_accuracy)

# Per-class precision / recall / F1 of the tuned model.
print("Classification Report:")
tuned_report = metrics.classification_report(y_test, y_pred)
print(tuned_report)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
The best parameters are: {'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 50}
Accuracy: 0.8095238095238095
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.91      0.86       146
           1       0.81      0.64      0.71        85

    accuracy                           0.81       231
   macro avg       0.81      0.77      0.78       231
weighted avg       0.81      0.81      0.80       231

