In [1]:
from sklearn.feature_selection import SelectKBest,RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
def rfe_best(indep,dep,n,random_state=None):
    """Select ``n`` features with RFE using three different estimators.

    Recursive feature elimination is run once per estimator, in this order:
    logistic regression, decision tree, random forest.

    Args:
        indep: Feature table. Must expose ``.columns`` (it is indexed with the
            integer positions of the selected features to recover their names).
        dep: Target values aligned with ``indep``.
        n: Number of features each RFE run keeps.
        random_state: Optional seed forwarded to the decision-tree and
            random-forest estimators. They are stochastic, so with the default
            ``None`` (the original behaviour) the features they select can
            differ between runs.

    Returns:
        tuple: ``(best_features, features_name, log_x, dt_x, rf_x)`` where
        ``best_features`` is the list of the three reduced feature sets,
        ``features_name`` is the list of the three selected column-name
        indexes, and ``log_x``/``dt_x``/``rf_x`` are the entries of
        ``best_features`` in estimator order.
    """
    estimators=[
        LogisticRegression(max_iter=1000),
        DecisionTreeClassifier(random_state=random_state),
        RandomForestClassifier(random_state=random_state),
    ]
    best_features=[]
    features_name=[]
    for estimator in estimators:
        selector=RFE(estimator,n_features_to_select=n).fit(indep,dep)
        best_features.append(selector.transform(indep))
        # Integer positions of the kept columns, mapped back to their names.
        kept_positions=selector.get_support(indices=True)
        features_name.append(indep.columns[kept_positions])
    log_x,dt_x,rf_x=best_features
    return best_features,features_name,log_x,dt_x,rf_x

def split_scale(rfe_x,dep):
    """Split into train/test sets and standardise the features.

    One third of the rows is held out for testing, with a fixed
    ``random_state=0``. The scaler is fitted on the training rows only and
    then applied to both splits.

    Returns:
        tuple: ``(x_train_sc, x_test_sc, y_train, y_test, sc)`` where ``sc``
        is the fitted ``StandardScaler``.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    x_train,x_test,y_train,y_test=train_test_split(
        rfe_x,dep,test_size=1/3,random_state=0
    )
    sc=StandardScaler().fit(x_train)
    return sc.transform(x_train),sc.transform(x_test),y_train,y_test,sc

def accuracy(model,x_train_sc,x_test_sc,y_train,y_test):
    """Return the test-set accuracy of an already fitted classifier.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_train_sc: Unused; kept so existing callers keep working.
        x_test_sc: Scaled test features passed to ``model.predict``.
        y_train: Unused; kept so existing callers keep working.
        y_test: True labels for ``x_test_sc``.

    Returns:
        The value of ``accuracy_score(y_test, y_pred)``.
    """
    y_pred=model.predict(x_test_sc)
    # accuracy_score expects (y_true, y_pred); pass them in that order and
    # avoid a local variable that shadows this function's own name.
    return accuracy_score(y_test,y_pred)
    

def logistic(x_train_sc,x_test_sc,y_train,y_test):
    """Fit a default ``LogisticRegression`` and score it on the test split.

    Returns:
        tuple: ``(test accuracy from accuracy(), fitted estimator)``.
    """
    from sklearn.linear_model import LogisticRegression

    classifier=LogisticRegression().fit(x_train_sc,y_train)
    return accuracy(classifier,x_train_sc,x_test_sc,y_train,y_test),classifier

def svml(x_train_sc,x_test_sc,y_train,y_test):
    """Fit a linear-kernel ``SVC`` and score it on the test split.

    Returns:
        tuple: ``(test accuracy from accuracy(), fitted estimator)``.
    """
    from sklearn.svm import SVC

    classifier=SVC(kernel='linear').fit(x_train_sc,y_train)
    return accuracy(classifier,x_train_sc,x_test_sc,y_train,y_test),classifier

def svm_nl(x_train_sc,x_test_sc,y_train,y_test):
    """Fit an RBF-kernel ``SVC`` and score it on the test split.

    Returns:
        tuple: ``(test accuracy from accuracy(), fitted estimator)``.
    """
    from sklearn.svm import SVC

    classifier=SVC(kernel='rbf').fit(x_train_sc,y_train)
    return accuracy(classifier,x_train_sc,x_test_sc,y_train,y_test),classifier

def naive(x_train_sc,x_test_sc,y_train,y_test):
    """Fit a ``GaussianNB`` classifier and score it on the test split.

    Returns:
        tuple: ``(test accuracy from accuracy(), fitted estimator)``.
    """
    from sklearn.naive_bayes import GaussianNB

    classifier=GaussianNB().fit(x_train_sc,y_train)
    return accuracy(classifier,x_train_sc,x_test_sc,y_train,y_test),classifier
   

def knn(x_train_sc,x_test_sc,y_train,y_test):
    """Fit a 5-nearest-neighbours classifier and score it on the test split.

    The distance is Minkowski with ``p=2``.

    Returns:
        tuple: ``(test accuracy from accuracy(), fitted estimator)``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    classifier=KNeighborsClassifier(
        n_neighbors=5,metric='minkowski',p=2
    ).fit(x_train_sc,y_train)
    return accuracy(classifier,x_train_sc,x_test_sc,y_train,y_test),classifier

def Decision(x_train_sc,x_test_sc,y_train,y_test,random_state=None):
    """Fit an entropy-criterion decision tree and score it on the test split.

    Args:
        x_train_sc: Scaled training features.
        x_test_sc: Scaled test features.
        y_train: Training labels.
        y_test: Test labels.
        random_state: Optional seed for the tree. The default ``None`` keeps
            the original unseeded behaviour, where results can differ between
            runs; pass an integer for reproducible results.

    Returns:
        tuple: ``(test accuracy from accuracy(), fitted estimator)``.
    """
    from sklearn.tree import DecisionTreeClassifier
    model=DecisionTreeClassifier(criterion='entropy',random_state=random_state)
    model.fit(x_train_sc,y_train)
    decision_accu=accuracy(model,x_train_sc,x_test_sc,y_train,y_test)
    return decision_accu,model

def random_forest(x_train_sc,x_test_sc,y_train,y_test,random_state=None):
    """Fit a 10-tree entropy-criterion random forest and score it on the test split.

    Args:
        x_train_sc: Scaled training features.
        x_test_sc: Scaled test features.
        y_train: Training labels.
        y_test: Test labels.
        random_state: Optional seed for the forest. The default ``None`` keeps
            the original unseeded behaviour, where results can differ between
            runs; pass an integer for reproducible results.

    Returns:
        tuple: ``(test accuracy from accuracy(), fitted estimator)``.
    """
    from sklearn.ensemble import RandomForestClassifier
    model=RandomForestClassifier(
        n_estimators=10,criterion='entropy',random_state=random_state
    )
    model.fit(x_train_sc,y_train)
    rf_accu=accuracy(model,x_train_sc,x_test_sc,y_train,y_test)
    return rf_accu,model

def best_combo(indep,dep,n):
    """Score seven classifiers on each of the three RFE feature subsets.

    ``rfe_best`` produces one feature subset per RFE estimator. Each subset is
    split and scaled with ``split_scale`` and then used to train, in order:
    logistic regression, linear SVM, RBF SVM, naive Bayes, KNN, decision tree
    and random forest.

    Returns:
        tuple: ``(result, accuracy_data_frame, selected_column_name, sc,
        log_model, svml_model, svmnl_model, naive_model, knn_model,
        decision_model, rf_model, log_x, dt_x, rf_x)``.

        ``result`` holds one list per classifier with one accuracy per feature
        subset; ``accuracy_data_frame`` is the same data labelled by
        classifier (rows) and RFE estimator (columns).

        NOTE: ``sc`` and the seven returned models are the ones from the LAST
        loop iteration only, i.e. fitted on the final subset returned by
        ``rfe_best`` — not on ``log_x`` or ``dt_x``.
    """
    subsets,column,log_x,dt_x,rf_x=rfe_best(indep,dep,n)

    # (row label, training function), in the order the models are fitted.
    trainers=(
        ('Logistic',logistic),
        ('SVML',svml),
        ('SVMNL',svm_nl),
        ('Naive',naive),
        ('KNN',knn),
        ('Decision Tree',Decision),
        ('Random Forest',random_forest),
    )
    classifier_labels=[label for label,_ in trainers]
    selector_labels=['Logistic','Decision Tree','Random Forest']

    scores={label:[] for label in classifier_labels}
    fitted={}
    for subset in subsets:
        x_train_sc,x_test_sc,y_train,y_test,sc=split_scale(subset,dep)
        for label,train in trainers:
            score,estimator=train(x_train_sc,x_test_sc,y_train,y_test)
            scores[label].append(score)
            # Overwritten on every subset: only the last fit survives.
            fitted[label]=estimator

    result=[scores[label] for label in classifier_labels]
    import pandas as pd
    accuracy_data_frame=pd.DataFrame(result,columns=selector_labels,index=classifier_labels)
    selected_column_name=pd.DataFrame(column,index=selector_labels)
    return (
        result,
        accuracy_data_frame,
        selected_column_name,
        sc,
        fitted['Logistic'],
        fitted['SVML'],
        fitted['SVMNL'],
        fitted['Naive'],
        fitted['KNN'],
        fitted['Decision Tree'],
        fitted['Random Forest'],
        log_x,
        dt_x,
        rf_x,
    )




In [3]:
import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [4]:
import pandas as pd

# Read the CSV and one-hot encode it, dropping the first level of each
# encoded column.
dataset=pd.get_dummies(pd.read_csv("CKD.csv"),drop_first=True)

# Target column first, then every remaining column as the feature table.
dep=dataset['classification_yes']
indep=dataset.drop(columns='classification_yes')

In [5]:
# Compare all classifiers on the 5-feature RFE subsets.
(
    result,
    accuracy_data_frame,
    selected_column_name,
    sc,
    log_model,
    svml_model,
    svmnl_model,
    naive_model,
    knn_model,
    decision_model,
    rf_model,
    log_x,
    dt_x,
    rf_x,
) = best_combo(indep,dep,5)


In [6]:
# Accuracy table: rows are the classifiers that were trained, columns are the
# RFE estimator that selected the feature subset they were trained on.
accuracy_data_frame

Unnamed: 0,Logistic,Decision Tree,Random Forest
Logistic,0.977444,0.992481,0.947368
SVML,0.977444,0.992481,0.969925
SVMNL,0.977444,0.984962,0.969925
Naive,0.977444,0.93985,0.909774
KNN,0.977444,0.992481,0.947368
Decision Tree,0.977444,0.969925,0.969925
Random Forest,0.977444,0.977444,0.954887


In [7]:
# Selected feature names: one row per RFE estimator.
selected_column_name

Unnamed: 0,0,1,2,3,4
Logistic,al,sg_c,sg_d,htn_yes,dm_yes
Decision Tree,hrmo,rc,sg_c,sg_d,htn_yes
Random Forest,bgr,sc,hrmo,pcv,rc


In [8]:
# log columns
# `dataset` already holds the one-hot-encoded CKD.csv loaded earlier in the
# notebook, so it is reused here instead of re-reading and re-encoding the file.
indep_log=log_x
dep=dataset['classification_yes']

In [9]:
# Same split settings as split_scale(): one third held out, random_state=0.
x_train,x_test,y_train,y_test=train_test_split(log_x,dep,test_size=1/3,random_state=0)
# Use a fresh scaler rather than refitting the one returned by best_combo(),
# which was fitted on a different feature subset. The name `sc` is kept
# because the prediction and joblib cells below use it.
sc=StandardScaler()
x_train_sc=sc.fit_transform(x_train)
x_test_sc=sc.transform(x_test)

In [10]:
# Train the final logistic-regression model on the scaled features and
# report its accuracy on the held-out rows.
new_log_model=LogisticRegression().fit(x_train_sc,y_train)
y_pred=new_log_model.predict(x_test_sc)
score=accuracy_score(y_true=y_test,y_pred=y_pred)
score

0.9774436090225563

In [11]:
# Heuristic thresholds for the fit diagnosis below.
MAX_TRAIN_TEST_GAP=0.1  # training score may exceed test score by at most this
MIN_TRAIN_SCORE=0.8     # assumed floor for an acceptable training score; tune per problem

train_score=new_log_model.score(x_train_sc,y_train)
test_score=new_log_model.score(x_test_sc,y_test)
print("Training Score :{}\nTest Score: {}".format(train_score,test_score))
score_diff=train_score-test_score
print("Score Difference:",score_diff)
# The previous chain tested `<=0.1` and then `>0.1`, which left the
# underfitting branch unreachable. Overfitting is a large train-over-test gap;
# underfitting is diagnosed from a low training score instead.
if score_diff>MAX_TRAIN_TEST_GAP:
    print("The Model is overfitted")
elif train_score<MIN_TRAIN_SCORE:
    print("The Model is Underfitted")
else:
    print("Model Trained Well on both training and test set")

Training Score :0.9661654135338346
Test Score: 0.9774436090225563
Score Difference: -0.011278195488721776
Model Trained Well on both training and test set


In [12]:
# Decision Tree Model

In [13]:
from sklearn.preprocessing import StandardScaler

# Features picked by the decision-tree RFE, with the same target column.
indep_dt=dt_x
dep=dataset['classification_yes']

x_train,x_test,y_train,y_test=train_test_split(
    indep_dt,dep,test_size=1/3,random_state=0
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler=StandardScaler().fit(x_train)
x_train_sc=scaler.transform(x_train)
x_test_sc=scaler.transform(x_test)

In [14]:
# Seed the tree (same seed as the train/test split) so the reported score is
# reproducible on Restart & Run All.
new_model_dt=DecisionTreeClassifier(random_state=0)
new_model_dt.fit(x_train_sc,y_train)

In [15]:
# Accuracy of the decision tree on the held-out rows.
y_pred=new_model_dt.predict(x_test_sc)
score=accuracy_score(y_true=y_test,y_pred=y_pred)
score

0.9699248120300752

In [16]:
# Features chosen by the logistic-regression RFE, in model input order: al, sg_c, sg_d, htn_yes, dm_yes

In [17]:
# Prompt for the five features chosen by the logistic-regression RFE, in the
# column order the model was trained on: al, sg_c, sg_d, htn_yes, dm_yes.
al=float(input("al:"))
sg_c=float(input("sg_c:"))
sg_d=float(input("sg_d:"))
htn_yes=float(input("htn_yes:"))
# The fifth model input is dm_yes. It used to be prompted and stored as "wc",
# which is not one of the selected features and invited the wrong value.
dm_yes=float(input("dm_yes:"))
pre_input = [[al, sg_c, sg_d, htn_yes, dm_yes]]
# Scale with the scaler fitted on the logistic feature subset before predicting.
input_sc=sc.transform(pre_input)
new_prediction=new_log_model.predict(input_sc)
print(new_prediction)

al: 1
sg_c: 1
sg_d: 1
htn_yes: 1
wc: 1


[ True]


In [18]:
# Show the fitted logistic-regression estimator.
new_log_model

In [19]:
import joblib

# Persist the fitted logistic-regression model.
filename='RFE_CKD_log.joblib'
joblib.dump(value=new_log_model,filename=filename)

['RFE_CKD_log.joblib']

In [20]:
# Persist the scaler that goes with the saved logistic-regression model.
filename='log_CKD_sc.joblib'
joblib.dump(value=sc,filename=filename)

['log_CKD_sc.joblib']