<a href="https://colab.research.google.com/github/Vikkibala007/Newproject/blob/main/all_dataset_accuracy_single_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning
import warnings
import os

# Ignore warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Directory containing multiple CSV files
input_directory = "/content/combined data"

# List all CSV files in the directory
csv_files = [file for file in os.listdir(input_directory) if file.endswith('.csv')]

for file in csv_files:
    print(f"Processing file: {file}")

    # Load the dataset
    df = pd.read_csv(os.path.join(input_directory, file), header=None, names=['age', 'sex', 'cp', 'trestbps', 'chol', 'FBS', 'restecg', 'thalach', 'exang', 'Old peak', 'slope', 'ca', 'thal', 'num'])

    # Replace '?' with NaN
    df.replace('?', np.nan, inplace=True)

    # Convert columns to numeric
    df = df.apply(pd.to_numeric)

    # Impute missing values with mean
    df.fillna(df.mean(), inplace=True)

    # Define feature columns and target column
    feature_columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'FBS', 'restecg', 'thalach', 'exang', 'Old peak', 'slope', 'ca', 'thal']
    target_column = 'num'

    # Separate features and target
    X = df[feature_columns]
    y = df[target_column]

    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split the scaled dataset into training and testing sets (80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Define models and parameter grids for hyperparameter tuning
    models = {
        "Logistic Regression": (LogisticRegression(max_iter=2000), {'model__C': [0.1, 1, 10]}),
        "Decision Tree": (DecisionTreeClassifier(), {'model__max_depth': [None, 10, 20, 30]}),
        "Random Forest": (RandomForestClassifier(), {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 10, 20]}),  # Added max_depth hyperparameter
        "Naive Bayes": (GaussianNB(), {}),
        "Neural Network": (MLPClassifier(max_iter=1000), {'model__hidden_layer_sizes': [(100,), (50, 50), (20, 20, 20)]}),
        "Support Vector Machine": (SVC(), {'model__C': [0.1, 1, 10], 'model__kernel': ['linear', 'rbf']})
    }

    results = {}

    for model_name, (model, param_grid) in models.items():
        pipeline = Pipeline([
            ('feature_selection', RFE(estimator=DecisionTreeClassifier())),
            ('model', model)
        ])

        # Perform grid search with cross-validation
        grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
        grid_search.fit(X_train, y_train)

        # Best model from grid search
        best_model = grid_search.best_estimator_

        # Make predictions on training set
        y_train_pred = best_model.predict(X_train)

        # Calculate evaluation metrics
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_f1 = f1_score(y_train, y_train_pred, average='weighted')
        train_precision = precision_score(y_train, y_train_pred, average='weighted')
        train_recall = recall_score(y_train, y_train_pred, average='weighted')

        results[model_name] = {
            "Accuracy": train_accuracy,
            "F1 Score": train_f1,
            "Precision": train_precision,
            "Recall": train_recall
        }

    # Display results for each file
    print(f"Results for file: {file}")
    for model_name, metrics in results.items():
        print(f"Model: {model_name}")
        for metric_name, value in metrics.items():
            print(f"{metric_name}: {value}")
        print()
    print("------------------------------------")


Processing file: reprocessed.hungarian.csv
Results for file: reprocessed.hungarian.csv
Model: Logistic Regression
Accuracy: 0.6893617021276596
F1 Score: 0.6473931509435616
Precision: 0.6293161072455022
Recall: 0.6893617021276596

Model: Decision Tree
Accuracy: 1.0
F1 Score: 1.0
Precision: 1.0
Recall: 1.0

Model: Random Forest
Accuracy: 1.0
F1 Score: 1.0
Precision: 1.0
Recall: 1.0

Model: Naive Bayes
Accuracy: 0.676595744680851
F1 Score: 0.6541393376465879
Precision: 0.6530325519452862
Recall: 0.676595744680851

Model: Neural Network
Accuracy: 0.8680851063829788
F1 Score: 0.8610541688829204
Precision: 0.8657898130238555
Recall: 0.8680851063829788

Model: Support Vector Machine
Accuracy: 0.7191489361702128
F1 Score: 0.6492174040820036
Precision: 0.7686904376179676
Recall: 0.7191489361702128

------------------------------------
Processing file: processed.cleveland.csv
Results for file: processed.cleveland.csv
Model: Logistic Regression
Accuracy: 0.6157024793388429
F1 Score: 0.51604713931



Results for file: processed.switzerland.csv
Model: Logistic Regression
Accuracy: 0.4897959183673469
F1 Score: 0.4253328991170936
Precision: 0.410760667903525
Recall: 0.4897959183673469

Model: Decision Tree
Accuracy: 1.0
F1 Score: 1.0
Precision: 1.0
Recall: 1.0

Model: Random Forest
Accuracy: 1.0
F1 Score: 1.0
Precision: 1.0
Recall: 1.0

Model: Naive Bayes
Accuracy: 0.19387755102040816
F1 Score: 0.2223612109269649
Precision: 0.36411860029371557
Recall: 0.19387755102040816

Model: Neural Network
Accuracy: 1.0
F1 Score: 1.0
Precision: 1.0
Recall: 1.0

Model: Support Vector Machine
Accuracy: 0.8775510204081632
F1 Score: 0.8761904761904763
Precision: 0.8819241982507288
Recall: 0.8775510204081632

------------------------------------
Processing file: processed.va.csv
Results for file: processed.va.csv
Model: Logistic Regression
Accuracy: 0.425
F1 Score: 0.3880541069100391
Precision: 0.41388526162971734
Recall: 0.425

Model: Decision Tree
Accuracy: 1.0
F1 Score: 1.0
Precision: 1.0
Recall: 1.