In [15]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pandas as pd

def create_pipeline(numerical_features, categorical_features):
    # Create pipeline for preprocessing numerical features
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
        ('scaler', StandardScaler()),  # Scale numerical features
        ('selector', SelectKBest(k='all'))  # Select features
    ])

    # Create pipeline for preprocessing categorical features
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
    ])

    # Combine preprocessing pipelines
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    # Create full pipeline including preprocessing and model training
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier())  # Use DecisionTreeClassifier
    ])

    return full_pipeline

def ratio(array):
  class_counts = array.value_counts()  # Count occurrences of each class
  total_samples = class_counts.sum()  # Total number of samples in the dataset

  # Calculate percentages for each class
  percentage_0 = (class_counts[0] / total_samples) * 100
  percentage_1 = (class_counts[1] / total_samples) * 100

  return ("Percentage of class '0': {:.2f}%".format(percentage_0)),("Percentage of class '1': {:.2f}%".format(percentage_1))

def genpipeline(array):
  pipe = Pipeline(steps=['ratio', ratio(array)])
  return pipe
# Example usage:
# Define categorical and numerical features



In [8]:
import pandas as pd
df=pd.read_csv('/content/heart.csv')
print(df)

      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0      52    1   0       125   212    0        1      168      0      1.0   
1      53    1   0       140   203    1        0      155      1      3.1   
2      70    1   0       145   174    0        1      125      1      2.6   
3      61    1   0       148   203    0        1      161      0      0.0   
4      62    0   0       138   294    1        1      106      0      1.9   
...   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
1020   59    1   1       140   221    0        1      164      1      0.0   
1021   60    1   0       125   258    0        0      141      1      2.8   
1022   47    1   0       110   275    0        0      118      1      1.0   
1023   50    0   0       110   254    0        0      159      0      0.0   
1024   54    1   0       120   188    0        1      113      0      1.4   

      slope  ca  thal  target  
0         2   2     3       0  
1         0

In [9]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['target'])  # Features
y = df['target']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets to verify the split
print("Training data shape:", X_train.shape, y_train.shape)
print("Testing data shape:", X_test.shape, y_test.shape)

Training data shape: (820, 13) (820,)
Testing data shape: (205, 13) (205,)


In [10]:
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

val = create_pipeline(numerical_features,categorical_features)

In [12]:
val.fit(X_train,y_train)
accu = val.predict(X_test)
print(accu)



[1 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 1 0
 1 1 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 1 1 1 0 1 0 0 1 0 0 1 0 0 0 1
 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 0 1 0 1 0 1 0 1 0
 1 1 0 1 1 0 1 1 0 1 1 0 0 1 0 1 0 0 1 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1
 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 0 1 1 0 0 1 1 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1
 1 1 0 1 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 0]


In [19]:
gen = genpipeline(pd.DataFrame(accu))
print(gen)

Pipeline(steps=['ratio',
                ("Percentage of class '0': 51.22%",
                 "Percentage of class '1': 48.78%")])


In [20]:
score = val.score(X_test,y_test)
print(score)

0.9853658536585366
