In [1]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [2]:
# Load dataset
dataset = pd.read_csv("prep.csv")
df = pd.get_dummies(dataset, dtype=int, drop_first=True)

# Define input (independent variables) and output (dependent variable)
X = df.drop('classification_yes', axis=1)
y = df['classification_yes']


In [3]:
# Preprocess features (scaling)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [26]:
X_scaled

array([[-2.91575992e+00,  0.00000000e+00,  1.59943471e+00, ...,
         5.12501930e-01,  2.04494943e+00, -4.20703162e-01],
       [-2.85684653e+00,  0.00000000e+00,  8.37890020e-01, ...,
         5.12501930e-01, -4.89009647e-01, -4.20703162e-01],
       [-2.79793313e+00,  0.00000000e+00,  7.63453321e-02, ...,
         5.12501930e-01, -4.89009647e-01, -4.20703162e-01],
       ...,
       [-4.18604856e-16, -4.79397659e-01,  1.59943471e+00, ...,
        -1.95121217e+00, -4.89009647e-01, -4.20703162e-01],
       [-4.18604856e-16,  1.00481749e+00, -6.85199355e-01, ...,
         5.12501930e-01, -4.89009647e-01,  2.37697286e+00],
       [-4.18604856e-16,  2.62709917e-01, -6.85199355e-01, ...,
         5.12501930e-01, -4.89009647e-01, -4.20703162e-01]])

In [4]:
# Apply RFE with Random Forest
rfe_rf = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)
X_rf_rfe = rfe_rf.fit_transform(X_scaled, y)
selected_features_rf = X.columns[rfe_rf.support_].tolist()
selected_features_rf

['bgr', 'sc', 'hrmo', 'pcv', 'rc']

In [5]:
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf_rfe, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier()
rf_model.fit(X_train_rf, y_train_rf)
y_pred_rf = rf_model.predict(X_test_rf)
print(f"RandomForest Selected Features: {selected_features_rf}")
print(f"RandomForest Accuracy: {accuracy_score(y_test_rf, y_pred_rf):.2f}")
print(classification_report(y_test_rf, y_pred_rf))

RandomForest Selected Features: ['bgr', 'sc', 'hrmo', 'pcv', 'rc']
RandomForest Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        39
           1       1.00      0.95      0.97        41

    accuracy                           0.97        80
   macro avg       0.98      0.98      0.97        80
weighted avg       0.98      0.97      0.97        80



In [6]:
# Apply RFE with Decision Tree
rfe_dt = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)
X_dt_rfe = rfe_dt.fit_transform(X_scaled, y)
selected_features_dt = X.columns[rfe_dt.support_].tolist()
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X_dt_rfe, y, test_size=0.2, random_state=42)
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_dt, y_train_dt)
y_pred_dt = dt_model.predict(X_test_dt)
print(f"\nDecisionTree Selected Features: {selected_features_dt}")
print(f"DecisionTree Accuracy: {accuracy_score(y_test_dt, y_pred_dt):.2f}")
print(classification_report(y_test_dt, y_pred_dt))



DecisionTree Selected Features: ['hrmo', 'rc', 'sg_c', 'sg_d', 'htn_yes']
DecisionTree Accuracy: 0.99
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        39
           1       0.98      1.00      0.99        41

    accuracy                           0.99        80
   macro avg       0.99      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80



In [7]:
# Apply RFE with SVM (using linear kernel)
rfe_svm = RFE(estimator=SVC(kernel="linear"), n_features_to_select=5)
X_svm_rfe = rfe_svm.fit_transform(X_scaled, y)
selected_features_svm = X.columns[rfe_svm.support_].tolist()
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X_svm_rfe, y, test_size=0.2, random_state=42)
svm_model = SVC(kernel="linear")
svm_model.fit(X_train_svm, y_train_svm)
y_pred_svm = svm_model.predict(X_test_svm)
print(f"\nSVM Selected Features: {selected_features_svm}")
print(f"SVM Accuracy: {accuracy_score(y_test_svm, y_pred_svm):.2f}")
print(classification_report(y_test_svm, y_pred_svm))



SVM Selected Features: ['al', 'sc', 'hrmo', 'pcv', 'sg_d']
SVM Accuracy: 0.97
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        39
           1       0.98      0.98      0.98        41

    accuracy                           0.97        80
   macro avg       0.97      0.97      0.97        80
weighted avg       0.97      0.97      0.97        80



In [8]:
# Apply RFE with Logistic Regression
rfe_lr = RFE(estimator=LogisticRegression(), n_features_to_select=5)
X_lr_rfe = rfe_lr.fit_transform(X_scaled, y)
selected_features_lr = X.columns[rfe_lr.support_].tolist()
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_lr_rfe, y, test_size=0.2, random_state=42)
lr_model = LogisticRegression()
lr_model.fit(X_train_lr, y_train_lr)
y_pred_lr = lr_model.predict(X_test_lr)
print(f"\nLogistic Regression Selected Features: {selected_features_lr}")
print(f"Logistic Regression Accuracy: {accuracy_score(y_test_lr, y_pred_lr):.2f}")
print(classification_report(y_test_lr, y_pred_lr))



Logistic Regression Selected Features: ['al', 'hrmo', 'pcv', 'sg_c', 'dm_yes']
Logistic Regression Accuracy: 0.99
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        39
           1       0.98      1.00      0.99        41

    accuracy                           0.99        80
   macro avg       0.99      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80



In [9]:
# Apply RFE with XGBoost
rfe_xgb = RFE(estimator=XGBClassifier(use_label_encoder=False, eval_metric="logloss"), n_features_to_select=5)
X_xgb_rfe = rfe_xgb.fit_transform(X_scaled, y)
selected_features_xgb = X.columns[rfe_xgb.support_].tolist()
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(X_xgb_rfe, y, test_size=0.2, random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
xgb_model.fit(X_train_xgb, y_train_xgb)
y_pred_xgb = xgb_model.predict(X_test_xgb)
print(f"\nXGBoost Selected Features: {selected_features_xgb}")
print(f"XGBoost Accuracy: {accuracy_score(y_test_xgb, y_pred_xgb):.2f}")
print(classification_report(y_test_xgb, y_pred_xgb))



XGBoost Selected Features: ['al', 'hrmo', 'sg_c', 'sg_d', 'htn_yes']
XGBoost Accuracy: 0.97
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        39
           1       0.95      1.00      0.98        41

    accuracy                           0.97        80
   macro avg       0.98      0.97      0.97        80
weighted avg       0.98      0.97      0.97        80



In [10]:
# Apply RFE with AdaBoost using SAMME algorithm to avoid warning
rfe_ada = RFE(estimator=AdaBoostClassifier(algorithm="SAMME"), n_features_to_select=5)
X_ada_rfe = rfe_ada.fit_transform(X_scaled, y)
selected_features_ada = X.columns[rfe_ada.support_].tolist()
X_train_ada, X_test_ada, y_train_ada, y_test_ada = train_test_split(X_ada_rfe, y, test_size=0.2, random_state=42)
ada_model = AdaBoostClassifier(algorithm="SAMME")
ada_model.fit(X_train_ada, y_train_ada)
y_pred_ada = ada_model.predict(X_test_ada)
print(f"\nAdaBoost Selected Features: {selected_features_ada}")
print(f"AdaBoost Accuracy: {accuracy_score(y_test_ada, y_pred_ada):.2f}")
print(classification_report(y_test_ada, y_pred_ada))



AdaBoost Selected Features: ['al', 'hrmo', 'sg_c', 'sg_d', 'dm_yes']
AdaBoost Accuracy: 0.96
              precision    recall  f1-score   support

           0       0.97      0.95      0.96        39
           1       0.95      0.98      0.96        41

    accuracy                           0.96        80
   macro avg       0.96      0.96      0.96        80
weighted avg       0.96      0.96      0.96        80



In [11]:
# Save Decision Tree model, scaler, and selected features
with open('ckd_model_dt.pkl', 'wb') as model_file, open('scaler.pkl', 'wb') as scaler_file, open('selected_features_dt.pkl', 'wb') as feature_file:
    pickle.dump(dt_model, model_file)
    pickle.dump(scaler, scaler_file)
    pickle.dump(selected_features_dt, feature_file)


In [12]:
import pandas as pd
import pickle

# Load the saved Decision Tree model and scaler
with open('ckd_model_dt.pkl', 'rb') as model_file, open('scaler.pkl', 'rb') as scaler_file:
    dt_model = pickle.load(model_file)
    scaler = pickle.load(scaler_file)

In [13]:
# Select the required features for prediction
required_features = ['hrmo', 'rc', 'sg_c', 'sg_d', 'htn_yes']
new_data_selected = df[required_features]  # This will include only the specified columns


In [14]:
new_data_selected

Unnamed: 0,hrmo,rc,sg_c,sg_d,htn_yes
0,12.518156,4.705597,1,0,0
1,10.700000,4.705597,1,0,0
2,12.000000,4.705597,0,0,0
3,8.100000,4.705597,0,1,0
4,11.800000,4.705597,1,0,0
...,...,...,...,...,...
394,12.500000,4.400000,0,0,0
395,8.700000,4.705597,1,0,1
396,9.100000,3.400000,1,0,1
397,8.500000,4.705597,0,0,1


In [19]:
import pandas as pd
import pickle

# Load the saved Decision Tree model, scaler, and selected features
with open('ckd_model_dt.pkl', 'rb') as model_file, open('scaler.pkl', 'rb') as scaler_file:
    dt_model = pickle.load(model_file)
    scaler = pickle.load(scaler_file)

# Example new data (replace with actual values)
new_data = pd.DataFrame({
    'hrmo': [12],   # Replace value_hrmo with the actual value
    'rc': [4],       # Replace value_rc with the actual value
    'sg_c': [1],   # Replace value_sg_c with the actual value
    'sg_d': [0],   # Replace value_sg_d with the actual value
    'htn_yes': [0]  # Replace value_htn with the actual value (0 or 1)
})



In [38]:
# The original features used for fitting the scaler (same order)
required_features = ['hrmo', 'rc', 'sg_c', 'sg_d', 'htn_yes']

In [39]:
# Select only the required features (ensuring they are in the right order)
new_data_selected = new_data[required_features]


In [40]:
#Scale the new data using the fitted scaler
new_data_scaled = scaler.transform(new_data_selected)


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- age
- al
- ane_yes
- appet_yes
- ba_present
- ...
