In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report , accuracy_score
from sklearn.model_selection import train_test_split
import math
import csv

In [13]:
np.random.seed(42)
random.seed(42)

In [None]:
data = pd.read_csv('processed.cleveland.data' , header=None)
data.shape


(303, 14)

In [15]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [None]:
all_columns = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','target']
data.columns = all_columns
data.head()

In [18]:
data.shape

(303, 14)

In [19]:
data = data[~data.isin(['?']).any(axis=1)]
print(data.shape)
data['target'].value_counts()

(297, 14)


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,160
1,54
2,35
3,35
4,13


In [20]:
data['target'] = data['target'].replace([1,2,3,4],1)
data['target'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['target'] = data['target'].replace([1,2,3,4],1)


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,160
1,137


In [21]:
def scaling(x_train,x_test):
  columns = x_train.columns
  scaler = StandardScaler()
  x_train = scaler.fit_transform(x_train)
  x_test = scaler.transform(x_test)

  x_train = pd.DataFrame(x_train,columns=columns)
  x_test = pd.DataFrame(x_test,columns=columns)

  return x_train,x_test

In [22]:
def data_split(data , features):
  x = data[features]
  y = data['target']

  x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.3,random_state=42,stratify=y)
  return x_train , x_test , y_train , y_test

In [23]:
def pipeline(no_of_features , all_columns , data , n_estimators = 100 , max_depth=None):
  features = all_columns.copy()
  features.remove('target')
  random.shuffle(features)

  selected_features = features[:no_of_features]
  x_train , x_test , y_train , y_test = data_split(data,selected_features)
  x_train , x_test = scaling(x_train,x_test)

  classifier = RandomForestClassifier(n_estimators=n_estimators , max_depth = max_depth , random_state=42)
  classifier.fit(x_train,y_train)
  y_test_pred = classifier.predict(x_test)
  y_train_pred = classifier.predict(x_train)

  return accuracy_score(y_test,y_test_pred) , accuracy_score(y_train,y_train_pred) , selected_features


In [None]:
filename = "results_binary.csv"
header = ['n_estimators', 'max_depth', 'no_of_features', 'train_accuracy', 'test_accuracy', 'selected_features']

with open(filename, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()

with open(filename, mode='a', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=header)

    for j in range(1, 122, 10):
        for k in range(1, 11):
            for m in range(1, len(all_columns)):
                for i in range(20):
                    test_acc, train_acc, selected_subset = pipeline(m, all_columns, data , n_estimators=j, max_depth=k)
                    writer.writerow({
                        'n_estimators': j,
                        'max_depth': k,
                        'no_of_features': m,
                        'train_accuracy': train_acc,
                        'test_accuracy': test_acc,
                        'selected_features': selected_subset
                    })

print("Results have been appended to", filename)