<a href="https://www.kaggle.com/code/sriusairam/w4-chronic-kidney-disease-smote?scriptVersionId=227031964" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 💥 Chronic Kidney Disease

🏆 Problem Statement: Use a dataset with 25 features to predict whether a patient has chronic kidney disease.

# Libraries 📖

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from sklearn.impute import SimpleImputer
import pandas as pd
from numpy import isnan
from sklearn.preprocessing import LabelEncoder
from numpy import nan
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# For Model Evaluation
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc 
from matplotlib import pyplot

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# Load data 📁

In [None]:
df_data=pd.read_csv("../input/ckdisease/kidney_disease.csv")


In [None]:
df_data

# Data analysis 📊

In [None]:
df_data.info()

In [None]:
df_data.head(5)

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;">📉 Observation
      <li><b>Observation One:</b> <b>Column names</b> are not user-friendly.</li>
      <li><b>Observation Two:</b> The following columns hold numeric values but are stored as text columns:
        <ul>
          <li>pcv (packed_cell_volume)</li>
          <li>wc (white_blood_cell_count)</li>
          <li>rc (red_blood_cell_count)</li>
        </ul>
      </li>
</div>

In [None]:
missing = df_data.isnull().sum()
missing[missing > 0].sort_values(ascending=False).head(20)

In [None]:
# dropping 'id' column
df_data.drop('id', axis = 1, inplace = True)

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"><b>📉 Observation :</b>
    <li><b>Observation Three:</b> Certain columns contain <b>missing data</b> that must be handled.</li>
</div>

In [None]:
print(f"dm :- {df_data['dm'].unique()}")
print(f"cad :- {df_data['cad'].unique()}")
print(f"classification :- {df_data['classification'].unique()}")

<div style="border-radius: 10px; border: #0ea5e9 solid; padding: 15px; background-color: #ffffff00; font-size: 100%; text-align: left;"><b>📉 Observation :</b>
    <li><b>Observation Four:</b> There are typos in the dm (diabetes_mellitus), cad (coronary_artery_disease), and classification (class) columns.</li>
    <li><b>Observation Five:</b> All columns with text values need to be converted to numeric.</li>
</div>

# Preprocessing ⚙️ 

<li><b>Resolving Observation One:</b> Allocate more user-friendly names to the columns</li>

In [None]:
# Notice the unfriendly column names
df_data.head(3)

In [None]:
df_data.columns = ['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell',
              'pus_cell_clumps', 'bacteria', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
              'potassium', 'haemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
              'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'peda_edema',
              'anemia', 'class']

In [None]:
# Friendly column names allocated
df_data.head(3)

<div style="border-radius: 10px; border: #ff001c solid; padding: 15px; margin-top: 15px; background-color: #ffffff00; font-size: 
            100%; text-align: left;">
    <b> 🚩 Column Names renamed successfully</b>
</div>

<li><b>Resolving Observation Two:</b> Converting text columns (packed_cell_volume, white_blood_cell_count and red_blood_cell_count) to numeric format</li>

In [None]:
# Notice these columns are of datatype Object
text_columns = ['packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count']

for column in text_columns:
    print(f"{column} -: {df_data[column].dtype}")

In [None]:
# Convert text column to numeric column
def convert_text_to_numeric_col(dataframe, feature):
    """Convert one column of ``dataframe`` to a numeric dtype, in place.

    Entries that cannot be parsed as numbers are coerced to NaN
    (``errors='coerce'``) instead of raising.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose column is rewritten.
    feature : str
        Name of the column to convert.

    Returns
    -------
    None
        The frame is modified in place.
    """
    # Read from the frame that was passed in, not from the notebook-level
    # ``df_data`` global, so the helper works on any DataFrame.
    dataframe[feature] = pd.to_numeric(dataframe[feature], errors='coerce')

for column in text_columns:
    convert_text_to_numeric_col(df_data, column)
    print(f"text_columns: {df_data[column].dtype}")

<div style="border-radius: 10px; border: #ff001c solid; padding: 15px; margin-top: 15px; background-color: #ffffff00; font-size: 
            100%; text-align: left;">
    <b> 🚩 Columns (packed_cell_volume, white_blood_cell_count and red_blood_cell_count) converted to numeric successfully</b>
</div>

<li><b>Resolving Observation Three:</b> Resolving missing data</li>

In [None]:
# Replacing missing values in all numeric columns with mean
'''def mean_value_imputation(dataframe, feature):
    mean_value=dataframe[feature].mean()
    dataframe[feature].fillna(value=mean_value, inplace=True)

# Replacing missing values in all categorical columns with highest frequency data
def impute_mode(dataframe, feature):
    mode = dataframe[feature].mode()[0]
    dataframe[feature] = dataframe[feature].fillna(mode)  '''

In [None]:
# Obtaining columns names of all numerical features
'''num_columns = [col for col in df_data.columns if df_data[col].dtype != 'object']

# Assigning random number to all missing data in numeric columns
for column_name in num_columns:
    mean_value_imputation(df_data,column_name)'''

In [None]:
# Obtaining columns names of all categorized features
'''cat_columns = [col for col in df_data.columns if df_data[col].dtype == 'object']
impute_mode(df_data,"blood_pressure")

# Assigning highest frequency to all missing data in categorical columns
for column_name in cat_columns:
    impute_mode(df_data,column_name)'''

In [None]:
missing = df_data.isnull().sum()
missing[missing > 0].sort_values(ascending=False).head(20)

<div style="border-radius: 10px; border: #ff001c solid; padding: 15px; margin-top: 15px; background-color: #ffffff00; font-size: 
            100%; text-align: left;">
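    <b> 🚩 Note: the mean/mode imputation cells above are disabled (wrapped in string literals), so any missing values remain at this point; they are filled later by the EM imputation step</b>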
</div>

<li><b>Resolving Observation Four:</b> Typo Errors in dm (diabetes_mellitus), cad (coronary_artery_disease), and classification (class) columns</li>

In [None]:
print(f"diabetes_mellitus :- {df_data['diabetes_mellitus'].unique()}")
print(f"coronary_artery_disease :- {df_data['coronary_artery_disease'].unique()}")
print(f"class :- {df_data['class'].unique()}")

In [None]:
df_data['diabetes_mellitus'] = df_data['diabetes_mellitus'].replace(to_replace = {' yes':'yes', '\tno':'no', '\tyes':'yes'})
df_data['coronary_artery_disease'] = df_data['coronary_artery_disease'].replace(to_replace = '\tno', value='no')
df_data['class'] = df_data['class'].replace(to_replace = {'ckd\t': 'ckd', 'notckd': 'not ckd'})

In [None]:
# Re-print the distinct values of the three cleaned columns to confirm that
# the whitespace/tab variants were replaced.
print(f"diabetes_mellitus :- {df_data['diabetes_mellitus'].unique()}")
print(f"coronary_artery_disease :- {df_data['coronary_artery_disease'].unique()}")
print(f"class :- {df_data['class'].unique()}")

<div style="border-radius: 10px; border: #ff001c solid; padding: 15px; margin-top: 15px; background-color: #ffffff00; font-size: 
            100%; text-align: left;">
    <b> 🚩 All typo errors for the three columns rectified</b>
</div>

<li><b>Resolving Observation Five:</b> Feature Encoding - all columns with text values need to be converted to numeric</li>

In [None]:
df_data['class'] = df_data['class'].map({'ckd': 1, 'not ckd': 0})
df_data['red_blood_cells'] = df_data['red_blood_cells'].map({'normal': 1, 'abnormal': 0})
df_data['pus_cell'] = df_data['pus_cell'].map({'normal': 1, 'abnormal': 0})
df_data['pus_cell_clumps'] = df_data['pus_cell_clumps'].map({'present': 1, 'notpresent': 0})
df_data['bacteria'] = df_data['bacteria'].map({'present': 1, 'notpresent': 0})
df_data['hypertension'] = df_data['hypertension'].map({'yes': 1, 'no': 0})
df_data['diabetes_mellitus'] = df_data['diabetes_mellitus'].map({'yes': 1, 'no': 0})
df_data['coronary_artery_disease'] = df_data['coronary_artery_disease'].map({'yes': 1, 'no': 0}) 
df_data['appetite'] = df_data['appetite'].map({'good': 1, 'poor': 0})
df_data['peda_edema'] = df_data['peda_edema'].map({'yes': 1, 'no': 0})
df_data['anemia'] = df_data['anemia'].map({'yes': 1, 'no': 0})

In [None]:
for column in text_columns:
    convert_text_to_numeric_col(df_data, column)

In [None]:
df_data.head()

In [None]:
df_data.to_csv('newmodifieddataset.csv')

In [None]:
df_data

In [None]:
df_data=pd.read_csv("/kaggle/input/kidney-modified-dataset/Kidney_dataset_Modified.csv").iloc[:, 1:]

# Apply EM technique

In [None]:
pip install impyute

In [None]:
import impyute as impy

In [None]:
imputeddata=impy.em(df_data.values, loops=1000)

In [None]:
df_data=pd.DataFrame(imputeddata) 

In [None]:
#df_data=df_data.fillna(0)
#df_data.to_csv("newwdata_fi10test17.csv")

In [None]:
df_data.columns = ['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell',
              'pus_cell_clumps', 'bacteria', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
              'potassium', 'haemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
              'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'peda_edema',
              'anemia', 'class']

In [None]:
df_data

In [None]:
df_data.info()

<div style="border-radius: 10px; border: #ff001c solid; padding: 15px; margin-top: 15px; background-color: #ffffff00; font-size: 
            100%; text-align: left;">
    <b> 🚩 All columns assigned to numeric</b>
</div>

# Modeling 🪄

## Splitting Dataset

In [None]:
# Define Class as Target Variable, and the rest as feature variable
X = df_data.drop("class", axis=1)     # everything except 'class' column
y = df_data['class']

# Define the train dataset as 70% and test dataset as 30%
# (random_state is fixed so the split is reproducible between runs)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state = 1)

# Confirm that the records returned for Train is about 70% and Test is about 30%
# (labels now name the objects whose shapes are actually printed)
print(f"'X_train' shape: {X_train.shape}")
print(f"'X_test' shape: {X_test.shape}")

# Apply SMOTE to balance the training samples

In [None]:
!pip install -U imbalanced-learn

In [None]:
print("Before OverSampling, counts of label '1': {}".format(sum(Y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(Y_train == 0)))

In [None]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X_train, Y_train = oversample.fit_resample(X_train, Y_train)

In [None]:
print('After OverSampling, the shape of train_X: {}'.format(X_train.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(Y_train.shape))

In [None]:
print("After OverSampling, counts of label '1': {}".format(sum(Y_train == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(Y_train == 0)))

In [None]:
!pip install wget 

import wget
wget.download('https://raw.githubusercontent.com/BorisMuzellec/MissingDataOT/master/utils.py')

import numpy as np
import pandas as pd
from utils import *
import torch
import seaborn as sns

import pandas as pd
import missingno as msno
har = df_data
#har1=har.iloc[:,:-1]
har1=har
#msno.heatmap(har1)
#msno.dendrogram(har1)
har1=har1.values
#print('Shape Test:\t{}\n'.format(har1.shape))
har1


har1=har1.astype(float)
har1


def produce_NA(har1, p_miss, mecha="MAR", opt="logistic", p_obs=None, q=None):
    """Return a copy of ``har1`` with entries replaced by NaN under a missingness scheme.

    Parameters
    ----------
    har1 : numpy.ndarray or torch.Tensor
        Data table. A NumPy array is cast to float32 and wrapped in a tensor.
    p_miss : float
        Missingness level. In the fallback branch each entry is masked
        independently when a uniform draw is below ``p_miss``; in the other
        branches it is forwarded to the mask helper.
    mecha : str
        Mechanism selector ("MAR", "MNAR"); any other combination with
        ``opt`` falls through to the uniform random mask.
    opt : str
        Variant selector ("logistic", "quantile", "selfmasked").
    p_obs : float, optional
        Forwarded to the MAR / MNAR helpers (presumably the share of
        variables kept fully observed — confirm against ``utils``).
    q : float, optional
        Forwarded to the MNAR quantile helper.

    Returns
    -------
    dict
        ``'har1_init'`` (input as float64 tensor), ``'har1_incomp'`` (float64
        tensor with NaN where masked) and ``'mask'`` (float64 0/1 tensor).
    """
    # Accept NumPy input by converting it to a float32 tensor first.
    if not torch.is_tensor(har1):
        har1 = torch.from_numpy(har1.astype(np.float32))

    scheme = (mecha, opt)
    if scheme == ("MAR", "logistic"):
        mask = MAR_mask(har1, p_miss, p_obs).double()
        print("mask values")
        print(mask)
    elif scheme == ("MNAR", "logistic"):
        mask = MNAR_mask_logistic(har1, p_miss, p_obs).double()
    elif scheme == ("MNAR", "quantile"):
        mask = MNAR_mask_quantiles(har1, p_miss, q, 1-p_obs).double()
    elif scheme == ("MNAR", "selfmasked"):
        mask = MNAR_self_mask_logistic(har1, p_miss).double()
    else:
        # Fallback: every cell is masked independently of the data.
        mask = (torch.rand(har1.shape) < p_miss).double()

    # Blank out the selected cells on a copy so the input stays intact.
    incomplete = har1.clone()
    incomplete[mask.bool()] = np.nan

    return {
        'har1_init': har1.double(),
        'har1_incomp': incomplete.double(),
        'mask': mask,
    }




orig_dataset=df_data.drop('class',axis=1)
orig_dataset
import pandas as pd
import missingno as msno
har = orig_dataset
#har1=har.iloc[:,:-1]
har1=har
#msno.heatmap(har1)
#msno.dendrogram(har1)
har1=har1.values
#print('Shape Test:\t{}\n'.format(har1.shape))
har1


har1=har1.astype(float)
har1

har1_miss_mar = produce_NA(har1=har1, p_miss=0.99, mecha="MAR", p_obs=0.5)
 


#print(har1_mar) 
print(har1_miss_mar['har1_init'])
har1_mar = har1_miss_mar['har1_incomp']
print(har1_mar)  
#har1_mar = pd.DataFrame.from_dict(har1_mar)
R_mcar = har1_miss_mar['mask']
print(R_mcar)
 
print("Percentage of generated missing values: ", (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100, " %")

print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

 
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
tdf.head()

tdf.shape
# Get null values and dataframe information  check for any missing data in data sets
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf=tdf.fillna(0)
tdf.to_csv("data_fil10.csv")

tdf



testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:] 
 
testdata.head(10)
#testdata=testdata.drop(['16'],axis=1)
testdata

print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata.isnull().sum(0)

testdata=testdata.fillna(0)
testdata
from sklearn.model_selection import train_test_split
#kfold =sklearn.model_selection.KFold(n_splits=10)

import pandas as pd
 
from sklearn.metrics import log_loss
#seed=7
#

# importing voting classifier
from sklearn.ensemble import VotingClassifier
 
    
#Edata10Mar = supp10_data2
# Features: the MAR-masked table reloaded from disk with NaN replaced by 0.
# Target: the class labels of the original frame (same row order).
Edata10Mar = testdata
X = Edata10Mar
y = df_data["class"]
feature_cols = list(X)

# Use experiment-specific names for this split. Rebinding X_train / X_test
# here would pair these MAR rows with the SMOTE-resampled Y_train (a different
# length) in the "Training Models" cells and break their fit() calls.
X_train_mar, X_test_mar, y_train_mar, y_test_mar = train_test_split(X, y, test_size = 0.33, random_state = None)
clf_dtc5 = DecisionTreeClassifier()
clf_dtc5.fit(X_train_mar, y_train_mar)

# Render the fitted tree to PNG. io.StringIO is the standard-library text
# buffer, so no `sklearn.externals.six` monkeypatching is required.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(clf_dtc5, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['0','1','2'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('ClassifierIoT1.png')
i=Image(graph.create_png())
i



y_pred_mar = clf_dtc5.predict(X_test_mar)
print(y_pred_mar)

# These metrics are not imported anywhere above this point in the notebook,
# so import them here before the report function uses them.
from sklearn.metrics import precision_score, recall_score, f1_score


def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Parameters
    ----------
    test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, same length as ``test``.
    """
    precision = precision_score(test,pred,average='macro')
    recall = recall_score(test,pred,average='macro')
    f1 = f1_score(test,pred,average='macro')
    print("\n2. Accuracy Score:", round(accuracy_score(test, pred)*100,2),"%")
    print("3. Precision:", round(precision*100,2),"%")
    print("4. Recall:",round(recall*100,2),"%" )
    print("5. F1 Score:",round(f1*100,2),"%" )
    print("6. clasification report:\n",classification_report(test, pred))
Model_Performance(y_test_mar, y_pred_mar)




## Training Models

In [None]:
# Random Forest
clf_rand_forest = RandomForestClassifier()
clf_rand_forest.fit(X_train, Y_train)

# SVM
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X_train, Y_train)

# KNN
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_knn.fit(X_train, Y_train)

# Decision Tree
clf_dtc = DecisionTreeClassifier()
clf_dtc.fit(X_train, Y_train)


## Model evaluation

In [None]:
# Printing of Model Evaluation Report
def print_std_model_evaulation_rpt(Y_test, Y_pred):
    """Print a standard evaluation report and plot the confusion matrix.

    Prints the classification report, the mean absolute error and the root
    mean squared error, then shows a confusion-matrix plot.

    Parameters
    ----------
    Y_test : array-like
        Ground-truth labels.
    Y_pred : array-like
        Predicted labels, same length as ``Y_test``.
    """
    print(classification_report(Y_test, Y_pred))
    print(f"mean_absolute_error :- {mean_absolute_error(Y_test,Y_pred)}")
    # Take the square root explicitly: the `squared=False` keyword was
    # deprecated and later removed from mean_squared_error. The label now
    # names the metric that is actually printed.
    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
    print(f"root_mean_squared_error :- {rmse}")
    # Derive the label set from the data being evaluated so the report works
    # for any classifier, not only the notebook-level random forest.
    labels = np.unique(np.concatenate([np.asarray(Y_test).ravel(), np.asarray(Y_pred).ravel()]))
    cm1 = confusion_matrix(Y_test, Y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm1, display_labels=labels)
    disp.plot()
    plt.show()
    


### Random Forest

#### Classification Report

In [None]:
# Random Forest
Y_pred = clf_rand_forest.predict(X_test)
rand_forest_acc = accuracy_score(Y_test, Y_pred)
print_std_model_evaulation_rpt(Y_test, Y_pred)


print(rand_forest_acc)



In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,f1_score
import sklearn.metrics as metrics
#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Parameters
    ----------
    test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, same length as ``test``.
    """
    # Macro-averaged scores are computed first, in the order they are printed.
    macro_scores = {
        "3. Precision:": precision_score(test, pred, average='macro'),
        "4. Recall:": recall_score(test, pred, average='macro'),
        "5. F1 Score:": f1_score(test, pred, average='macro'),
    }
    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in macro_scores.items():
        print(label, round(score * 100, 2), "%")
    print("6. clasification report:\n", classification_report(test, pred))
Model_Performance(Y_test, Y_pred)

### SVM

#### Classification Report

In [None]:
# Train and evaluate a linear-kernel SVM on a fresh split of df_data.
from sklearn.svm import SVC
X = df_data.drop("class", axis=1)     # everything except 'class' column
y = df_data['class']
# Define the train dataset as 80% and test dataset as 20% (test_size=0.2)
# NOTE(review): this rebinds X_train / X_test / Y_train / Y_test, so from here
# on the training data is no longer the SMOTE-resampled set, and the KNN and
# decision-tree cells below use this 80/20 split — confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state = 1)

clf_svm = svm.SVC(kernel='linear') # author's note: 0.97 (presumably accuracy — confirm)
# Alternative kernels tried by the author, with their recorded scores:
# clf_svm = SVC(gamma='auto') # 0.59
# clf_svm = SVC(kernel='poly') # 0.59
# clf_svm = SVC(kernel='rbf',gamma=0.01) # 0.59
# clf_svm.fit(X_test,Y_test)
clf_svm.fit(X_train, Y_train)

# Score on the held-out rows; svm_acc is read by the model-comparison chart.
Y_pred = clf_svm.predict(X_test)
svm_acc = accuracy_score(Y_test, Y_pred)
print_std_model_evaulation_rpt(Y_test, Y_pred)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,f1_score
import sklearn.metrics as metrics
#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Parameters
    ----------
    test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, same length as ``test``.
    """
    # Macro-averaged scores are computed first, in the order they are printed.
    macro_scores = {
        "3. Precision:": precision_score(test, pred, average='macro'),
        "4. Recall:": recall_score(test, pred, average='macro'),
        "5. F1 Score:": f1_score(test, pred, average='macro'),
    }
    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in macro_scores.items():
        print(label, round(score * 100, 2), "%")
    print("6. clasification report:\n", classification_report(test, pred))
Model_Performance(Y_test, Y_pred)

In [None]:
### KNN
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_knn.fit(X_train, Y_train)
Y_pred = clf_knn.predict(X_test)
knn_acc = accuracy_score(Y_test, Y_pred)
print_std_model_evaulation_rpt(Y_test, Y_pred)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,f1_score
import sklearn.metrics as metrics
#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Parameters
    ----------
    test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, same length as ``test``.
    """
    # Macro-averaged scores are computed first, in the order they are printed.
    macro_scores = {
        "3. Precision:": precision_score(test, pred, average='macro'),
        "4. Recall:": recall_score(test, pred, average='macro'),
        "5. F1 Score:": f1_score(test, pred, average='macro'),
    }
    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in macro_scores.items():
        print(label, round(score * 100, 2), "%")
    print("6. clasification report:\n", classification_report(test, pred))
Model_Performance(Y_test, Y_pred)

In [None]:
### Decision Tree
Y_pred = clf_dtc.predict(X_test)
dtc_acc = accuracy_score(Y_test, Y_pred)
print_std_model_evaulation_rpt(Y_test, Y_pred)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,f1_score
import sklearn.metrics as metrics
#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Parameters
    ----------
    test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, same length as ``test``.
    """
    # Macro-averaged scores are computed first, in the order they are printed.
    macro_scores = {
        "3. Precision:": precision_score(test, pred, average='macro'),
        "4. Recall:": recall_score(test, pred, average='macro'),
        "5. F1 Score:": f1_score(test, pred, average='macro'),
    }
    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in macro_scores.items():
        print(label, round(score * 100, 2), "%")
    print("6. clasification report:\n", classification_report(test, pred))
Model_Performance(Y_test, Y_pred)

In [None]:
models = pd.DataFrame({
    'Model' : [ 'Random Forest Classifier', 'SVM Classifier', 'KNN Classifier', 'Decision Tree Classifier'],
    'Score' : [rand_forest_acc, svm_acc,knn_acc, dtc_acc]
})


sorted_models = models.sort_values(by = 'Score', ascending = True)

fig = px.bar(data_frame = sorted_models, x = 'Score', y = 'Model',
       title = 'Models Comparison')

fig.show()

In [None]:
pip install astor


In [None]:
pip install skompiler


In [None]:
pip install pydotplus

In [None]:
X = df_data.drop("class", axis=1)     # everything except 'class' column
y = df_data['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = None)
clf_dtc.fit(X_train, y_train)
feature_cols = list(X) 
import six
import sys
sys.modules['sklearn.externals.six'] = six
#from six imporaccuracy_scoret StringIO
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(clf_dtc, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['0','1','2'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('ClassifierIoT1.png')
i=Image(graph.create_png())
i

In [None]:
y_pred = clf_dtc.predict(X_test)
print(y_pred)
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Parameters
    ----------
    test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, same length as ``test``.
    """
    # Macro-averaged scores are computed first, in the order they are printed.
    macro_scores = {
        "3. Precision:": precision_score(test, pred, average='macro'),
        "4. Recall:": recall_score(test, pred, average='macro'),
        "5. F1 Score:": f1_score(test, pred, average='macro'),
    }
    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in macro_scores.items():
        print(label, round(score * 100, 2), "%")
    print("6. clasification report:\n", classification_report(test, pred))
Model_Performance(y_test, y_pred)

# Create subspace classifiers


In [None]:
df_data1=pd.read_csv("/kaggle/input/kidney-modified-dataset/Kidney_dataset_Modified.csv").iloc[:, 1:]

In [None]:
df_data1

In [None]:
df_data1=df_data1.fillna(0)

In [None]:
# Feature subset (plus the target) for the first subspace classifier.
# 'red_blood_cells' previously carried a trailing space; a name that does not
# match any column makes pd.DataFrame(..., columns=...) create an all-NaN
# column, which the later fillna(0) silently turns into an all-zero feature.
columns = ['age','blood_pressure','sugar','red_blood_cells',
'pus_cell','pus_cell_clumps','bacteria','blood_glucose_random','blood_urea','potassium','white_blood_cell_count','red_blood_cell_count','hypertension','coronary_artery_disease','peda_edema','class']
data2 = pd.DataFrame(df_data1, columns=columns)
data2

In [None]:
data2=data2.fillna(0)

In [None]:
X = data2.drop("class", axis=1)     # everything except 'class' column
y = data2['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = None)
clf_dtc2 = DecisionTreeClassifier()
clf_dtc2.fit(X_train, y_train)
feature_cols = list(X) 
import six
import sys
sys.modules['sklearn.externals.six'] = six
#from six imporaccuracy_scoret StringIO
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(clf_dtc2, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['0','1','2'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('ClassifierIoT1.png')
i=Image(graph.create_png())
i

In [None]:
y_pred = clf_dtc2.predict(X_test)
print(y_pred)
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Parameters
    ----------
    test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, same length as ``test``.
    """
    # Macro-averaged scores are computed first, in the order they are printed.
    macro_scores = {
        "3. Precision:": precision_score(test, pred, average='macro'),
        "4. Recall:": recall_score(test, pred, average='macro'),
        "5. F1 Score:": f1_score(test, pred, average='macro'),
    }
    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in macro_scores.items():
        print(label, round(score * 100, 2), "%")
    print("6. clasification report:\n", classification_report(test, pred))
Model_Performance(y_test, y_pred)

# Modified Dataset -2

In [None]:
columns = ['specific_gravity','albumin','serum_creatinine','sodium',
'haemoglobin','packed_cell_volume','diabetes_mellitus','appetite','anemia','age','blood_pressure','red_blood_cells','pus_cell','pus_cell_clumps','bacteria','coronary_artery_disease','class']
data3 = pd.DataFrame(df_data1, columns=columns)
data3


In [None]:
data3=data3.fillna(0)

X = data3.drop("class", axis=1)     # everything except 'class' column
y = data3['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = None)
clf_dtc3 = DecisionTreeClassifier()
clf_dtc3.fit(X_train, y_train)
feature_cols = list(X) 
import six
import sys
sys.modules['sklearn.externals.six'] = six
#from six imporaccuracy_scoret StringIO
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(clf_dtc3, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['0','1','2'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('ClassifierIoT1.png')
i=Image(graph.create_png())
i








In [None]:
y_pred = clf_dtc3.predict(X_test)
print(y_pred)
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Parameters
    ----------
    test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, same length as ``test``.
    """
    # Macro-averaged scores are computed first, in the order they are printed.
    macro_scores = {
        "3. Precision:": precision_score(test, pred, average='macro'),
        "4. Recall:": recall_score(test, pred, average='macro'),
        "5. F1 Score:": f1_score(test, pred, average='macro'),
    }
    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in macro_scores.items():
        print(label, round(score * 100, 2), "%")
    print("6. clasification report:\n", classification_report(test, pred))
Model_Performance(y_test, y_pred)


# Modified Dataset -3


In [None]:
# Feature subset (plus the target) for the third subspace classifier.
# Names with trailing spaces ('pus_cell_clumps ', 'coronary_artery_disease  ')
# matched no column and became all-NaN features; they and the repeated names
# are collapsed so each real column appears exactly once.
columns = ['specific_gravity','appetite','anemia','age',
'blood_pressure','red_blood_cells','pus_cell','pus_cell_clumps','bacteria','coronary_artery_disease','class']
data4 = pd.DataFrame(df_data1, columns=columns)
data4


In [None]:
data4=data4.fillna(0)

X = data4.drop("class", axis=1)     # everything except 'class' column
y = data4['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = None)
clf_dtc4 = DecisionTreeClassifier()
clf_dtc4.fit(X_train, y_train)
feature_cols = list(X) 
import six
import sys
sys.modules['sklearn.externals.six'] = six
#from six imporaccuracy_scoret StringIO
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(clf_dtc4, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['0','1','2'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('ClassifierIoT3.png')
i=Image(graph.create_png())
i

In [None]:
y_pred = clf_dtc4.predict(X_test)
print(y_pred)
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Parameters
    ----------
    test : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, same length as ``test``.
    """
    # Macro-averaged scores are computed first, in the order they are printed.
    macro_scores = {
        "3. Precision:": precision_score(test, pred, average='macro'),
        "4. Recall:": recall_score(test, pred, average='macro'),
        "5. F1 Score:": f1_score(test, pred, average='macro'),
    }
    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in macro_scores.items():
        print(label, round(score * 100, 2), "%")
    print("6. clasification report:\n", classification_report(test, pred))
Model_Performance(y_test, y_pred)

# Modified Dataset -4

In [None]:
# Feature subset (plus the target) for the fourth subspace classifier.
# 'pus_cell_clumps' previously carried a trailing space, which matched no
# column and produced an all-NaN (later all-zero) feature.
columns = ['appetite','anemia','pus_cell_clumps','bacteria','coronary_artery_disease','class']
data5 = pd.DataFrame(df_data1, columns=columns)
data5

In [None]:
data5=data5.fillna(0)

X = data5.drop("class", axis=1)     # everything except 'class' column
y = data5['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = None)
clf_dtc5 = DecisionTreeClassifier()
clf_dtc5.fit(X_train, y_train)
feature_cols = list(X) 
import six
import sys
sys.modules['sklearn.externals.six'] = six
#from six imporaccuracy_scoret StringIO
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(clf_dtc5, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['0','1','2'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('ClassifierIoT1.png')
i=Image(graph.create_png())
i

In [None]:
y_pred = clf_dtc5.predict(X_test)
print(y_pred)
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. Scores are printed as percentages rounded to two decimals.
    """
    def to_percent(score):
        return round(score*100,2)

    # average='macro' is requested for all three scores.
    macro_scores = [
        scorer(test, pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", to_percent(accuracy_score(test, pred)),"%")
    captions = ("3. Precision:", "4. Recall:", "5. F1 Score:")
    for caption, score in zip(captions, macro_scores):
        print(caption, to_percent(score),"%")
    print("6. clasification report:\n",classification_report(test, pred))
Model_Performance(y_test, y_pred)

# Save the models


In [None]:
import pickle
from pathlib import Path

# Load the persisted model when the file exists; otherwise persist the
# in-memory model instead of failing with FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook.
model_file = Path('clf_dtc2')
if model_file.exists():
    with model_file.open('rb') as handle:   # handle is closed on exit
        clf_dtc2 = pickle.load(handle)
else:
    with model_file.open('wb') as handle:
        pickle.dump(clf_dtc2, handle)



In [None]:
import pickle
from pathlib import Path

# Load the persisted model when the file exists; otherwise persist the
# in-memory model instead of failing with FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook.
model_file = Path('clf_dtc3')
if model_file.exists():
    with model_file.open('rb') as handle:   # handle is closed on exit
        clf_dtc3 = pickle.load(handle)
else:
    with model_file.open('wb') as handle:
        pickle.dump(clf_dtc3, handle)

In [None]:
import pickle
from pathlib import Path

# Load the persisted model when the file exists; otherwise persist the
# in-memory model instead of failing with FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook.
model_file = Path('clf_dtc4')
if model_file.exists():
    with model_file.open('rb') as handle:   # handle is closed on exit
        clf_dtc4 = pickle.load(handle)
else:
    with model_file.open('wb') as handle:
        pickle.dump(clf_dtc4, handle)

In [None]:
import pickle
from pathlib import Path

# Load the persisted model when the file exists; otherwise persist the
# in-memory model instead of failing with FileNotFoundError.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook.
model_file = Path('clf_dtc5')
if model_file.exists():
    with model_file.open('rb') as handle:   # handle is closed on exit
        clf_dtc5 = pickle.load(handle)
else:
    with model_file.open('wb') as handle:
        pickle.dump(clf_dtc5, handle)

# Now form an Ensemble Classifier

In [None]:
df_data1

In [None]:
orig_dataset=df_data1.drop('class',axis=1)
orig_dataset

# Generate different levels of missing values

In [None]:
!pip install wget 

import wget
wget.download('https://raw.githubusercontent.com/BorisMuzellec/MissingDataOT/master/utils.py')

import numpy as np
import pandas as pd
from utils import *
import torch
import seaborn as sns

In [None]:
import pandas as pd
import missingno as msno

# Feature matrix (target already dropped) as a float NumPy array; this is the
# input later handed to produce_NA.
har = orig_dataset
har1 = har.values.astype(float)
har1


In [None]:
def produce_NA(har1, p_miss, mecha="MAR", opt="logistic", p_obs=None, q=None):
    """Insert missing values (NaN) into a data matrix.

    Parameters
    ----------
    har1 : numpy.ndarray or torch.Tensor
        Complete data. A NumPy array is cast to float32 and wrapped in a
        tensor; a tensor is used as given.
    p_miss : float
        Missing-value proportion forwarded to the mask generator.
    mecha : str
        "MAR" or "MNAR". Any (mecha, opt) pair not listed below falls back to
        an independent random mask ``torch.rand(shape) < p_miss``.
    opt : str
        "logistic" (MAR or MNAR), "quantile" or "selfmasked" (MNAR only).
    p_obs : float, optional
        Forwarded to the MAR / MNAR-logistic generators; the quantile
        generator receives ``1 - p_obs``, so it is required there.
    q : float, optional
        Forwarded to the MNAR-quantile generator.

    Returns
    -------
    dict
        ``'har1_init'`` (input as float64 tensor), ``'har1_incomp'`` (same
        with NaN where the mask is set, float64) and ``'mask'`` (float64
        tensor of 0/1). Tensors are returned for NumPy input as well.

    Raises
    ------
    TypeError
        If ``p_obs`` is None for ``mecha="MNAR", opt="quantile"``.
    """
    if not torch.is_tensor(har1):
        har1 = torch.from_numpy(har1.astype(np.float32))

    if mecha == "MAR" and opt == "logistic":
        mask = MAR_mask(har1, p_miss, p_obs).double()
        print("mask values")
        print(mask)
    elif mecha == "MNAR" and opt == "logistic":
        mask = MNAR_mask_logistic(har1, p_miss, p_obs).double()
    elif mecha == "MNAR" and opt == "quantile":
        if p_obs is None:
            # Fail with a readable message instead of the bare
            # "unsupported operand" error raised by `1 - None`.
            raise TypeError(
                "p_obs is required when mecha='MNAR' and opt='quantile'"
            )
        mask = MNAR_mask_quantiles(har1, p_miss, q, 1-p_obs).double()
    elif mecha == "MNAR" and opt == "selfmasked":
        mask = MNAR_self_mask_logistic(har1, p_miss).double()
    else:
        mask = (torch.rand(har1.shape) < p_miss).double()

    # Work on a copy so the caller's tensor is never modified.
    har1_nas = har1.clone()
    har1_nas[mask.bool()] = np.nan

    return {'har1_init': har1.double(), 'har1_incomp': har1_nas.double(), 'mask': mask}

In [None]:
# Request 10% missing values (MAR) and echo the original tensor, the
# NaN-injected tensor and the 0/1 mask returned by produce_NA.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.10, mecha="MAR", p_obs=0.5)
print(har1_miss_mar['har1_init'])
har1_mar, R_mcar = har1_miss_mar['har1_incomp'], har1_miss_mar['mask']
print(har1_mar)
print(R_mcar)

masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")
print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

# Persist the incomplete matrix and read it back as a DataFrame; the leading
# column written by to_csv (the row index) is sliced off on reload.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil10.csv")

# Second read of the same file; after zero-imputation this is the model input.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)
testdata


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

# Zero-imputed features paired with the original target column.
Edata10Mar = testdata
X = Edata10Mar
y = df_data1["class"]
feature_cols = list(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, y)

# NOTE(review): per the scikit-learn docs VotingClassifier.fit trains clones
# of these estimators on X_train, so previously fitted state is not reused.
estimators = [
    ('c1', clf_dtc2),
    ('c2', clf_dtc3),
    ('c3', clf_dtc4),
    ('c4', clf_dtc5),
]

# Hard-voting alternative (disabled):
#   voting = VotingClassifier(estimators=estimators, voting='hard')
#   vot_hard = voting.fit(X_train, Y_train)
#   y_pred = vot_hard.predict(X_test); print(accuracy_score(Y_test, y_pred))

# Soft voting: fit, predict on the hold-out split, report plain accuracy.
vot_soft = VotingClassifier(estimators = estimators, voting ='soft')
vot_soft.fit(X_train, Y_train)
y_pred = vot_soft.predict(X_test)
print(y_pred)
sscore = accuracy_score(Y_test, y_pred)
print(sscore)

from sklearn.metrics import classification_report

#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. Scores are printed as percentages rounded to two decimals.
    """
    def to_percent(score):
        return round(score*100,2)

    # average='macro' is requested for all three scores.
    macro_scores = [
        scorer(test, pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", to_percent(accuracy_score(test, pred)),"%")
    captions = ("3. Precision:", "4. Recall:", "5. F1 Score:")
    for caption, score in zip(captions, macro_scores):
        print(caption, to_percent(score),"%")
    print("6. clasification report:\n",classification_report(test, pred))
Model_Performance(Y_test, y_pred)

# **for 20% missing values**


In [None]:
# Request 20% missing values (MAR) and echo the original tensor, the
# NaN-injected tensor and the 0/1 mask returned by produce_NA.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.20, mecha="MAR", p_obs=0.5)
print(har1_miss_mar['har1_init'])
har1_mar, R_mcar = har1_miss_mar['har1_incomp'], har1_miss_mar['mask']
print(har1_mar)
print(R_mcar)

masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")
print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

# Persist the incomplete matrix and read it back as a DataFrame; the leading
# column written by to_csv (the row index) is sliced off on reload.
# The file keeps its '10mar' name although this run uses 20%.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil10.csv")

# Second read of the same file; after zero-imputation this is the model input.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)

from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

# Zero-imputed features paired with the original target column.
Edata10Mar = testdata
X = Edata10Mar
y = df_data1["class"]
feature_cols = list(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, y)

# NOTE(review): per the scikit-learn docs VotingClassifier.fit trains clones
# of these estimators on X_train, so previously fitted state is not reused.
estimators = [
    ('c1', clf_dtc2),
    ('c2', clf_dtc3),
    ('c3', clf_dtc4),
    ('c4', clf_dtc5),
]

# Hard-voting alternative (disabled):
#   voting = VotingClassifier(estimators=estimators, voting='hard')
#   vot_hard = voting.fit(X_train, Y_train)
#   y_pred = vot_hard.predict(X_test); print(accuracy_score(Y_test, y_pred))

# Soft voting: fit, predict on the hold-out split, report plain accuracy.
vot_soft = VotingClassifier(estimators = estimators, voting ='soft')
vot_soft.fit(X_train, Y_train)
y_pred = vot_soft.predict(X_test)
print(y_pred)
sscore = accuracy_score(Y_test, y_pred)
print(sscore)

from sklearn.metrics import classification_report

#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. Scores are printed as percentages rounded to two decimals.
    """
    def to_percent(score):
        return round(score*100,2)

    # average='macro' is requested for all three scores.
    macro_scores = [
        scorer(test, pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", to_percent(accuracy_score(test, pred)),"%")
    captions = ("3. Precision:", "4. Recall:", "5. F1 Score:")
    for caption, score in zip(captions, macro_scores):
        print(caption, to_percent(score),"%")
    print("6. clasification report:\n",classification_report(test, pred))
Model_Performance(Y_test, y_pred)





# For 30% missing values

In [None]:
# Request 30% missing values (MAR) and echo the original tensor, the
# NaN-injected tensor and the 0/1 mask returned by produce_NA.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.30, mecha="MAR", p_obs=0.5)
print(har1_miss_mar['har1_init'])
har1_mar, R_mcar = har1_miss_mar['har1_incomp'], har1_miss_mar['mask']
print(har1_mar)
print(R_mcar)

masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")
print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

# Persist the incomplete matrix and read it back as a DataFrame; the leading
# column written by to_csv (the row index) is sliced off on reload.
# The file keeps its '10mar' name although this run uses 30%.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil10.csv")

# Second read of the same file; after zero-imputation this is the model input.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)

from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

# Zero-imputed features paired with the original target column.
Edata10Mar = testdata
X = Edata10Mar
y = df_data1["class"]
feature_cols = list(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, y)

# NOTE(review): per the scikit-learn docs VotingClassifier.fit trains clones
# of these estimators on X_train, so previously fitted state is not reused.
estimators = [
    ('c1', clf_dtc2),
    ('c2', clf_dtc3),
    ('c3', clf_dtc4),
    ('c4', clf_dtc5),
]

# Hard-voting alternative (disabled):
#   voting = VotingClassifier(estimators=estimators, voting='hard')
#   vot_hard = voting.fit(X_train, Y_train)
#   y_pred = vot_hard.predict(X_test); print(accuracy_score(Y_test, y_pred))

# Soft voting: fit, predict on the hold-out split, report plain accuracy.
vot_soft = VotingClassifier(estimators = estimators, voting ='soft')
vot_soft.fit(X_train, Y_train)
y_pred = vot_soft.predict(X_test)
print(y_pred)
sscore = accuracy_score(Y_test, y_pred)
print(sscore)

from sklearn.metrics import classification_report

#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. Scores are printed as percentages rounded to two decimals.
    """
    def to_percent(score):
        return round(score*100,2)

    # average='macro' is requested for all three scores.
    macro_scores = [
        scorer(test, pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", to_percent(accuracy_score(test, pred)),"%")
    captions = ("3. Precision:", "4. Recall:", "5. F1 Score:")
    for caption, score in zip(captions, macro_scores):
        print(caption, to_percent(score),"%")
    print("6. clasification report:\n",classification_report(test, pred))
Model_Performance(Y_test, y_pred)





# For 40% missing values

In [None]:
# Request 40% missing values (MAR) and echo the original tensor, the
# NaN-injected tensor and the 0/1 mask returned by produce_NA.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.40, mecha="MAR", p_obs=0.5)
print(har1_miss_mar['har1_init'])
har1_mar, R_mcar = har1_miss_mar['har1_incomp'], har1_miss_mar['mask']
print(har1_mar)
print(R_mcar)

masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")
print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

# Persist the incomplete matrix and read it back as a DataFrame; the leading
# column written by to_csv (the row index) is sliced off on reload.
# The file keeps its '10mar' name although this run uses 40%.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil10.csv")

# Second read of the same file; after zero-imputation this is the model input.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)

from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

# Zero-imputed features paired with the original target column.
Edata10Mar = testdata
X = Edata10Mar
y = df_data1["class"]
feature_cols = list(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, y)

# NOTE(review): per the scikit-learn docs VotingClassifier.fit trains clones
# of these estimators on X_train, so previously fitted state is not reused.
estimators = [
    ('c1', clf_dtc2),
    ('c2', clf_dtc3),
    ('c3', clf_dtc4),
    ('c4', clf_dtc5),
]

# Hard-voting alternative (disabled):
#   voting = VotingClassifier(estimators=estimators, voting='hard')
#   vot_hard = voting.fit(X_train, Y_train)
#   y_pred = vot_hard.predict(X_test); print(accuracy_score(Y_test, y_pred))

# Soft voting: fit, predict on the hold-out split, report plain accuracy.
vot_soft = VotingClassifier(estimators = estimators, voting ='soft')
vot_soft.fit(X_train, Y_train)
y_pred = vot_soft.predict(X_test)
print(y_pred)
sscore = accuracy_score(Y_test, y_pred)
print(sscore)

from sklearn.metrics import classification_report

#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. Scores are printed as percentages rounded to two decimals.
    """
    def to_percent(score):
        return round(score*100,2)

    # average='macro' is requested for all three scores.
    macro_scores = [
        scorer(test, pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", to_percent(accuracy_score(test, pred)),"%")
    captions = ("3. Precision:", "4. Recall:", "5. F1 Score:")
    for caption, score in zip(captions, macro_scores):
        print(caption, to_percent(score),"%")
    print("6. clasification report:\n",classification_report(test, pred))
Model_Performance(Y_test, y_pred)





# For 50% missing values

In [None]:
# Request 50% missing values (MAR) and echo the original tensor, the
# NaN-injected tensor and the 0/1 mask returned by produce_NA.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.50, mecha="MAR", p_obs=0.5)
print(har1_miss_mar['har1_init'])
har1_mar, R_mcar = har1_miss_mar['har1_incomp'], har1_miss_mar['mask']
print(har1_mar)
print(R_mcar)

masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")
print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

# Persist the incomplete matrix and read it back as a DataFrame; the leading
# column written by to_csv (the row index) is sliced off on reload.
# The file keeps its '10mar' name although this run uses 50%.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil10.csv")

# Second read of the same file; after zero-imputation this is the model input.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)

from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

# Zero-imputed features paired with the original target column.
Edata10Mar = testdata
X = Edata10Mar
y = df_data1["class"]
feature_cols = list(X)
X_train, X_test, Y_train, Y_test = train_test_split(X, y)

# NOTE(review): per the scikit-learn docs VotingClassifier.fit trains clones
# of these estimators on X_train, so previously fitted state is not reused.
estimators = [
    ('c1', clf_dtc2),
    ('c2', clf_dtc3),
    ('c3', clf_dtc4),
    ('c4', clf_dtc5),
]

# Hard-voting alternative (disabled):
#   voting = VotingClassifier(estimators=estimators, voting='hard')
#   vot_hard = voting.fit(X_train, Y_train)
#   y_pred = vot_hard.predict(X_test); print(accuracy_score(Y_test, y_pred))

# Soft voting: fit, predict on the hold-out split, report plain accuracy.
vot_soft = VotingClassifier(estimators = estimators, voting ='soft')
vot_soft.fit(X_train, Y_train)
y_pred = vot_soft.predict(X_test)
print(y_pred)
sscore = accuracy_score(Y_test, y_pred)
print(sscore)

from sklearn.metrics import classification_report

#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. Scores are printed as percentages rounded to two decimals.
    """
    def to_percent(score):
        return round(score*100,2)

    # average='macro' is requested for all three scores.
    macro_scores = [
        scorer(test, pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", to_percent(accuracy_score(test, pred)),"%")
    captions = ("3. Precision:", "4. Recall:", "5. F1 Score:")
    for caption, score in zip(captions, macro_scores):
        print(caption, to_percent(score),"%")
    print("6. clasification report:\n",classification_report(test, pred))
Model_Performance(Y_test, y_pred)





# Evaluate the DT performance on 10 to 50% of Missing Values 


In [None]:
# Feature matrix (target column removed) as a float NumPy array.
orig_dataset=df_data1.drop('class',axis=1)
import pandas as pd
import missingno as msno
har = orig_dataset
har1 = har.values.astype(float)

# Request 50% missing values (MAR) and echo the original tensor, the
# NaN-injected tensor and the 0/1 mask returned by produce_NA.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.50, mecha="MAR", p_obs=0.5)
print(har1_miss_mar['har1_init'])
har1_mar, R_mcar = har1_miss_mar['har1_incomp'], har1_miss_mar['mask']
print(har1_mar)
print(R_mcar)

masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")
print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

# Persist the incomplete matrix and read it back as a DataFrame; the leading
# column written by to_csv (the row index) is sliced off on reload.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil10.csv")

# Second read of the same file; after zero-imputation this is the model input.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)

from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

# Zero-imputed features paired with the original target column.
Edata10Mar = testdata
X = Edata10Mar
y = df_data1["class"]
feature_cols = list(X)


# Fit a decision tree on a 67/33 split of the zero-imputed data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = None)
clf_dtc5 = DecisionTreeClassifier()
clf_dtc5.fit(X_train, y_train)
feature_cols = list(X)

# io.StringIO is the object six.StringIO aliases on Python 3, so the
# sys.modules patch for the removed sklearn.externals.six is not needed.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
# class_names follows the fitted classes_ so the labels in the picture always
# correspond to the classes the tree actually learned.
export_graphviz(clf_dtc5, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,
                class_names=[str(label) for label in clf_dtc5.classes_])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('ClassifierIoT1.png')
i=Image(graph.create_png())
i

# Predictions on the hold-out split, scored by Model_Performance below.
y_pred = clf_dtc5.predict(X_test)
print(y_pred)
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. Scores are printed as percentages rounded to two decimals.
    """
    def to_percent(score):
        return round(score*100,2)

    # average='macro' is requested for all three scores.
    macro_scores = [
        scorer(test, pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", to_percent(accuracy_score(test, pred)),"%")
    captions = ("3. Precision:", "4. Recall:", "5. F1 Score:")
    for caption, score in zip(captions, macro_scores):
        print(caption, to_percent(score),"%")
    print("6. clasification report:\n",classification_report(test, pred))
Model_Performance(y_test, y_pred)

# Other Classifiers - Performance

# 1. SVC

In [None]:
# Feature matrix (target column removed) as a float NumPy array.
orig_dataset=df_data1.drop('class',axis=1)
import pandas as pd
import missingno as msno
har = orig_dataset
har1 = har.values.astype(float)

# Request 50% missing values (MAR) and echo the original tensor, the
# NaN-injected tensor and the 0/1 mask returned by produce_NA.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.50, mecha="MAR", p_obs=0.5)
print(har1_miss_mar['har1_init'])
har1_mar, R_mcar = har1_miss_mar['har1_incomp'], har1_miss_mar['mask']
print(har1_mar)
print(R_mcar)

masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")
print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

# Persist the incomplete matrix and read it back as a DataFrame; the leading
# column written by to_csv (the row index) is sliced off on reload.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil10.csv")

# Second read of the same file; after zero-imputation this is the model input.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)

from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

# Zero-imputed features paired with the original target column.
Edata10Mar = testdata
X = Edata10Mar
y = df_data1["class"]
feature_cols = list(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Linear-kernel SVC; the variable keeps the clf_dtc5 name used by other cells.
clf_dtc5 = svm.SVC(kernel='linear')
clf_dtc5.fit(X_train, y_train)

feature_cols = list(X)

# Predictions on the hold-out split, scored by Model_Performance below.
y_pred = clf_dtc5.predict(X_test)
print(y_pred)
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. Scores are printed as percentages rounded to two decimals.
    """
    def to_percent(score):
        return round(score*100,2)

    # average='macro' is requested for all three scores.
    macro_scores = [
        scorer(test, pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", to_percent(accuracy_score(test, pred)),"%")
    captions = ("3. Precision:", "4. Recall:", "5. F1 Score:")
    for caption, score in zip(captions, macro_scores):
        print(caption, to_percent(score),"%")
    print("6. clasification report:\n",classification_report(test, pred))
Model_Performance(y_test, y_pred)

# KNN Classifier

In [None]:
# Feature matrix (target column removed) as a float NumPy array.
orig_dataset=df_data1.drop('class',axis=1)
import pandas as pd
import missingno as msno
har = orig_dataset
har1 = har.values.astype(float)

# Request 50% missing values (MAR) and echo the original tensor, the
# NaN-injected tensor and the 0/1 mask returned by produce_NA.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.50, mecha="MAR", p_obs=0.5)
print(har1_miss_mar['har1_init'])
har1_mar, R_mcar = har1_miss_mar['har1_incomp'], har1_miss_mar['mask']
print(har1_mar)
print(R_mcar)

masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")
print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

# Persist the incomplete matrix and read it back as a DataFrame; the leading
# column written by to_csv (the row index) is sliced off on reload.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil10.csv")

# Second read of the same file; after zero-imputation this is the model input.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)

from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

# Zero-imputed features paired with the original target column.
Edata10Mar = testdata
X = Edata10Mar
y = df_data1["class"]
feature_cols = list(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# 5-nearest-neighbours; the variable keeps the clf_dtc5 name used by other cells.
clf_dtc5 = KNeighborsClassifier(n_neighbors=5)
clf_dtc5.fit(X_train, y_train)

feature_cols = list(X)

# Predictions on the hold-out split, scored by Model_Performance below.
y_pred = clf_dtc5.predict(X_test)
print(y_pred)
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. Scores are printed as percentages rounded to two decimals.
    """
    def to_percent(score):
        return round(score*100,2)

    # average='macro' is requested for all three scores.
    macro_scores = [
        scorer(test, pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", to_percent(accuracy_score(test, pred)),"%")
    captions = ("3. Precision:", "4. Recall:", "5. F1 Score:")
    for caption, score in zip(captions, macro_scores):
        print(caption, to_percent(score),"%")
    print("6. clasification report:\n",classification_report(test, pred))
Model_Performance(y_test, y_pred)

# Random Forest 

In [None]:
# Feature matrix (target column removed) as a float NumPy array.
orig_dataset=df_data1.drop('class',axis=1)
import pandas as pd
import missingno as msno
har = orig_dataset
har1 = har.values.astype(float)

# Request 10% missing values (MAR) and echo the original tensor, the
# NaN-injected tensor and the 0/1 mask returned by produce_NA.
# NOTE(review): the SVC and KNN cells use p_miss=0.50 - confirm 0.10 is intended.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.10, mecha="MAR", p_obs=0.5)
print(har1_miss_mar['har1_init'])
har1_mar, R_mcar = har1_miss_mar['har1_incomp'], har1_miss_mar['mask']
print(har1_mar)
print(R_mcar)

masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")
print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

# Persist the incomplete matrix and read it back as a DataFrame; the leading
# column written by to_csv (the row index) is sliced off on reload.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil10.csv")

# Second read of the same file; after zero-imputation this is the model input.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)

from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.ensemble import VotingClassifier

# Zero-imputed features paired with the original target column.
Edata10Mar = testdata
X = Edata10Mar
y = df_data1["class"]
feature_cols = list(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Random forest with default settings; the variable keeps the clf_dtc5 name
# used by other cells.
clf_dtc5 = RandomForestClassifier()
clf_dtc5.fit(X_train, y_train)

feature_cols = list(X)

# Predictions on the hold-out split, scored by Model_Performance below.
y_pred = clf_dtc5.predict(X_test)
print(y_pred)
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. Scores are printed as percentages rounded to two decimals.
    """
    def to_percent(score):
        return round(score*100,2)

    # average='macro' is requested for all three scores.
    macro_scores = [
        scorer(test, pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", to_percent(accuracy_score(test, pred)),"%")
    captions = ("3. Precision:", "4. Recall:", "5. F1 Score:")
    for caption, score in zip(captions, macro_scores):
        print(caption, to_percent(score),"%")
    print("6. clasification report:\n",classification_report(test, pred))
Model_Performance(y_test, y_pred)

# Create a Second Balanced Dataset for exploring the same approach


In [None]:
# Load the second dataset (first CSV column dropped) and zero-impute it.
df_datab2=pd.read_csv("/kaggle/input/b2-dataset/B2-newmodifieddataset.csv").iloc[:, 1:]

df_datab2=df_datab2.fillna(0)

df_datab2.info()


# Fit a decision tree on a 67/33 split.
X = df_datab2.drop("class", axis=1)     # everything except 'class' column
y = df_datab2['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = None)
clf_dtcb21 = DecisionTreeClassifier()
clf_dtcb21.fit(X_train, y_train)
feature_cols = list(X)

# io.StringIO is the object six.StringIO aliases on Python 3, so the
# sys.modules patch for the removed sklearn.externals.six is not needed.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
# class_names follows the fitted classes_ so the labels in the picture always
# correspond to the classes the tree actually learned.
export_graphviz(clf_dtcb21, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,
                class_names=[str(label) for label in clf_dtcb21.classes_])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('ClassifierIoTb21.png')
i=Image(graph.create_png())
i









In [None]:
# Score the full-feature B2 tree on its held-out split.
y_pred = clf_dtcb21.predict(X_test)
print(y_pred)


def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    test: ground-truth labels; pred: predicted labels. Nothing is returned.
    Scores are shown as percentages rounded to two decimals. Numbering starts
    at 2 because item 1 (the confusion matrix) is not printed.
    """
    macro_precision = precision_score(test, pred, average='macro')
    macro_recall = recall_score(test, pred, average='macro')
    macro_f1 = f1_score(test, pred, average='macro')

    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in (
        ("3. Precision:", macro_precision),
        ("4. Recall:", macro_recall),
        ("5. F1 Score:", macro_f1),
    ):
        print(label, round(score * 100, 2), "%")

    report = classification_report(test, pred)
    print("6. clasification report:\n", report)


Model_Performance(y_test, y_pred)

# Create a modified Dataset 

In [None]:
# First feature subspace of B2: 19 selected attributes plus the target.
columns = [
    'age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
    ' red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
    'blood_urea', ' serum_creatinine ', 'potassium ',
    'white_blood_cell_count', 'red_blood_cell_count ', 'hypertension',
    'diabetes_mellitus ', 'coronary_artery_disease', ' appetite ', 'anemia',
    'class',
]
datab21 = pd.DataFrame(df_datab2, columns=columns).fillna(0)

# Every column except 'class' is a feature; 'class' is the target.
y = datab21['class']
X = datab21.drop(columns="class")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=None
)

# Decision tree restricted to this subspace (no fixed random_state).
clf_dtcb22 = DecisionTreeClassifier().fit(X_train, y_train)
feature_cols = list(X.columns)

# Alias `sklearn.externals.six` to the standalone `six` package so that the
# legacy StringIO import below resolves.
import six
import sys
sys.modules['sklearn.externals.six'] = six
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Export the tree as DOT text, render it to PNG, and show it in the cell.
dot_data = StringIO()
export_graphviz(
    clf_dtcb22,
    out_file=dot_data,
    feature_names=feature_cols,
    class_names=['0', '1', '2'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('ClassifierIoTb22.png')
i = Image(graph.create_png())
i

In [None]:

# Score the first subspace tree on its held-out split.
y_pred = clf_dtcb22.predict(X_test)
print(y_pred)


def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    test: ground-truth labels; pred: predicted labels. Nothing is returned.
    Scores are shown as percentages rounded to two decimals. Numbering starts
    at 2 because item 1 (the confusion matrix) is not printed.
    """
    macro_precision = precision_score(test, pred, average='macro')
    macro_recall = recall_score(test, pred, average='macro')
    macro_f1 = f1_score(test, pred, average='macro')

    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in (
        ("3. Precision:", macro_precision),
        ("4. Recall:", macro_recall),
        ("5. F1 Score:", macro_f1),
    ):
        print(label, round(score * 100, 2), "%")

    report = classification_report(test, pred)
    print("6. clasification report:\n", report)


Model_Performance(y_test, y_pred)

# Modified Dataset 

In [None]:
# Second feature subspace of B2: 13 selected attributes plus the target.
columns = [
    'sugar', ' red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
    'blood_urea', ' serum_creatinine ', 'potassium ',
    'red_blood_cell_count ', 'diabetes_mellitus ',
    'coronary_artery_disease', ' appetite ', 'anemia',
    'class',
]
datab22 = pd.DataFrame(df_datab2, columns=columns).fillna(0)

# Every column except 'class' is a feature; 'class' is the target.
y = datab22['class']
X = datab22.drop(columns="class")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=None
)

# Decision tree restricted to this subspace (no fixed random_state).
clf_dtcb23 = DecisionTreeClassifier().fit(X_train, y_train)
feature_cols = list(X.columns)

# Alias `sklearn.externals.six` to the standalone `six` package so that the
# legacy StringIO import below resolves.
import six
import sys
sys.modules['sklearn.externals.six'] = six
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Export the tree as DOT text, render it to PNG, and show it in the cell.
dot_data = StringIO()
export_graphviz(
    clf_dtcb23,
    out_file=dot_data,
    feature_names=feature_cols,
    class_names=['0', '1', '2'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('ClassifierIoTb23.png')
i = Image(graph.create_png())
i









In [None]:
# Score the second subspace tree on its held-out split.
y_pred = clf_dtcb23.predict(X_test)
print(y_pred)


def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    test: ground-truth labels; pred: predicted labels. Nothing is returned.
    Scores are shown as percentages rounded to two decimals. Numbering starts
    at 2 because item 1 (the confusion matrix) is not printed.
    """
    macro_precision = precision_score(test, pred, average='macro')
    macro_recall = recall_score(test, pred, average='macro')
    macro_f1 = f1_score(test, pred, average='macro')

    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in (
        ("3. Precision:", macro_precision),
        ("4. Recall:", macro_recall),
        ("5. F1 Score:", macro_f1),
    ):
        print(label, round(score * 100, 2), "%")

    report = classification_report(test, pred)
    print("6. clasification report:\n", report)


Model_Performance(y_test, y_pred)

# Modified Dataset 


In [None]:
# Third feature subspace of B2: 8 selected attributes plus the target.
columns = [' red_blood_cells','bacteria',' serum_creatinine ',
           'potassium ','red_blood_cell_count ','diabetes_mellitus ',
           'coronary_artery_disease',' appetite ','class']
datab23 = pd.DataFrame(df_datab2, columns=columns)
datab23=datab23.fillna(0)

X = datab23.drop("class", axis=1)     # everything except 'class' column
y = datab23['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = None)

# Decision tree restricted to this subspace (no fixed random_state).
clf_dtcb24 = DecisionTreeClassifier()
clf_dtcb24.fit(X_train, y_train)
feature_cols = list(X)

# Alias `sklearn.externals.six` to the standalone `six` package so that the
# legacy StringIO import below resolves.
import six
import sys
sys.modules['sklearn.externals.six'] = six
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Export the tree as DOT text, render it to PNG, and show it in the cell.
dot_data = StringIO()
export_graphviz(clf_dtcb24, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['0','1','2'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Write to this tree's own file; the name previously repeated
# 'ClassifierIoTb23.png' and overwrote the image of clf_dtcb23.
graph.write_png('ClassifierIoTb24.png')
i=Image(graph.create_png())
i

In [None]:
# Score the third subspace tree on its held-out split.
y_pred = clf_dtcb24.predict(X_test)
print(y_pred)


def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    test: ground-truth labels; pred: predicted labels. Nothing is returned.
    Scores are shown as percentages rounded to two decimals. Numbering starts
    at 2 because item 1 (the confusion matrix) is not printed.
    """
    macro_precision = precision_score(test, pred, average='macro')
    macro_recall = recall_score(test, pred, average='macro')
    macro_f1 = f1_score(test, pred, average='macro')

    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in (
        ("3. Precision:", macro_precision),
        ("4. Recall:", macro_recall),
        ("5. F1 Score:", macro_f1),
    ):
        print(label, round(score * 100, 2), "%")

    report = classification_report(test, pred)
    print("6. clasification report:\n", report)


Model_Performance(y_test, y_pred)


# Modified Dataset


In [None]:
# Fourth feature subspace of B2: 6 selected attributes plus the target.
# NOTE(review): some names are spelled differently from the earlier column
# lists ('red_blood_cells' vs ' red_blood_cells', 'potassium' vs
# 'potassium ', 'appetite ' vs ' appetite ') — confirm against the CSV header.
columns = ['red_blood_cells',' serum_creatinine ','potassium','red_blood_cell_count ','diabetes_mellitus ', 'appetite ','class']

# pd.DataFrame(frame, columns=...) creates an all-NaN column for every name
# that is absent from `frame`; after fillna(0) such a column is constant 0.
# Report those names so a misspelling does not go unnoticed.
missing_cols = [name for name in columns if name not in df_datab2.columns]
if missing_cols:
    print("Warning: columns not found in df_datab2 (filled with 0):", missing_cols)

datab24 = pd.DataFrame(df_datab2, columns=columns)
datab24=datab24.fillna(0)

X = datab24.drop("class", axis=1)     # everything except 'class' column
y = datab24['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = None)

# Decision tree restricted to this subspace (no fixed random_state).
clf_dtcb25 = DecisionTreeClassifier()
clf_dtcb25.fit(X_train, y_train)
feature_cols = list(X)

# Alias `sklearn.externals.six` to the standalone `six` package so that the
# legacy StringIO import below resolves.
import six
import sys
sys.modules['sklearn.externals.six'] = six
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Export the tree as DOT text, render it to PNG, and show it in the cell.
dot_data = StringIO()
export_graphviz(clf_dtcb25, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['0','1','2'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Write to this tree's own file; the name previously repeated
# 'ClassifierIoTb23.png' and overwrote the image of clf_dtcb23.
graph.write_png('ClassifierIoTb25.png')
i=Image(graph.create_png())
i


In [None]:
# Score the fourth subspace tree on its held-out split.
y_pred = clf_dtcb25.predict(X_test)
print(y_pred)


def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    test: ground-truth labels; pred: predicted labels. Nothing is returned.
    Scores are shown as percentages rounded to two decimals. Numbering starts
    at 2 because item 1 (the confusion matrix) is not printed.
    """
    macro_precision = precision_score(test, pred, average='macro')
    macro_recall = recall_score(test, pred, average='macro')
    macro_f1 = f1_score(test, pred, average='macro')

    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in (
        ("3. Precision:", macro_precision),
        ("4. Recall:", macro_recall),
        ("5. F1 Score:", macro_f1),
    ):
        print(label, round(score * 100, 2), "%")

    report = classification_report(test, pred)
    print("6. clasification report:\n", report)


Model_Performance(y_test, y_pred)

# Modified Dataset 

In [None]:
# Fifth feature subspace of B2: 4 selected attributes plus the target.
# NOTE(review): 'serum_creatinine ' and 'appetite ' are spelled without the
# leading space used in the earlier column lists (' serum_creatinine ',
# ' appetite ') — confirm against the CSV header.
columns = ['serum_creatinine ','red_blood_cell_count ','diabetes_mellitus ','appetite ','class']

# pd.DataFrame(frame, columns=...) creates an all-NaN column for every name
# that is absent from `frame`; after fillna(0) such a column is constant 0.
# Report those names so a misspelling does not go unnoticed.
missing_cols = [name for name in columns if name not in df_datab2.columns]
if missing_cols:
    print("Warning: columns not found in df_datab2 (filled with 0):", missing_cols)

datab25 = pd.DataFrame(df_datab2, columns=columns)
datab25=datab25.fillna(0)

X = datab25.drop("class", axis=1)     # everything except 'class' column
y = datab25['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = None)

# Decision tree restricted to this subspace (no fixed random_state).
clf_dtcb26 = DecisionTreeClassifier()
clf_dtcb26.fit(X_train, y_train)
feature_cols = list(X)

# Alias `sklearn.externals.six` to the standalone `six` package so that the
# legacy StringIO import below resolves.
import six
import sys
sys.modules['sklearn.externals.six'] = six
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Export the tree as DOT text, render it to PNG, and show it in the cell.
dot_data = StringIO()
export_graphviz(clf_dtcb26, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['0','1','2'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('ClassifierIoTb26.png')
i=Image(graph.create_png())
i


As no attributes are used in the construction of this decision tree, we neglect it.

# Save B2 Subspace classifiers 

In [None]:
import pickle

# Save the fitted tree to disk, then reload it so later cells use the
# round-tripped object. The `with` blocks close both file handles, which the
# inline open(...) calls left to the garbage collector.
with open('clf_dtcb21', 'wb') as model_file:
    pickle.dump(clf_dtcb21, model_file)
with open('clf_dtcb21', 'rb') as model_file:
    # The file was written just above; never unpickle files from other sources.
    clf_dtcb21 = pickle.load(model_file)

In [None]:
import pickle

# Save the fitted tree to disk, then reload it so later cells use the
# round-tripped object. The `with` blocks close both file handles, which the
# inline open(...) calls left to the garbage collector.
with open('clf_dtcb22', 'wb') as model_file:
    pickle.dump(clf_dtcb22, model_file)
with open('clf_dtcb22', 'rb') as model_file:
    # The file was written just above; never unpickle files from other sources.
    clf_dtcb22 = pickle.load(model_file)

In [None]:
import pickle

# Save the fitted tree to disk, then reload it so later cells use the
# round-tripped object. The `with` blocks close both file handles, which the
# inline open(...) calls left to the garbage collector.
with open('clf_dtcb23', 'wb') as model_file:
    pickle.dump(clf_dtcb23, model_file)
with open('clf_dtcb23', 'rb') as model_file:
    # The file was written just above; never unpickle files from other sources.
    clf_dtcb23 = pickle.load(model_file)

In [None]:
import pickle

# Save the fitted tree to disk, then reload it so later cells use the
# round-tripped object. The `with` blocks close both file handles, which the
# inline open(...) calls left to the garbage collector.
with open('clf_dtcb24', 'wb') as model_file:
    pickle.dump(clf_dtcb24, model_file)
with open('clf_dtcb24', 'rb') as model_file:
    # The file was written just above; never unpickle files from other sources.
    clf_dtcb24 = pickle.load(model_file)

In [None]:
import pickle

# Save the fitted tree to disk, then reload it so later cells use the
# round-tripped object. The `with` blocks close both file handles, which the
# inline open(...) calls left to the garbage collector.
with open('clf_dtcb25', 'wb') as model_file:
    pickle.dump(clf_dtcb25, model_file)
with open('clf_dtcb25', 'rb') as model_file:
    # The file was written just above; never unpickle files from other sources.
    clf_dtcb25 = pickle.load(model_file)

# Now create an ensemble classifier for the B2 dataset and test it with different percentages of missing values

In [None]:
import pandas as pd
import missingno as msno

# Reload the B2 dataset, dropping the CSV's first column and replacing
# missing entries with 0.
df_datab2 = (
    pd.read_csv("/kaggle/input/b2-dataset/B2-newmodifieddataset.csv")
    .iloc[:, 1:]
    .fillna(0)
)
df_datab2.info()

# `har` keeps the frame; `har1` is its float ndarray form, the input for the
# missing-value generator. No column is removed, so 'class' is included.
har = df_datab2
har1 = har.values.astype(float)


def produce_NA(har1, p_miss, mecha="MAR", opt="logistic", p_obs=None, q=None):
    """Mask entries of ``har1`` and return the original, masked data and mask.

    Parameters
    ----------
    har1 : numpy array or torch tensor. Arrays are cast to float32 and wrapped
        in a tensor; tensors are used as given.
    p_miss : passed to the selected mask generator. In the fallback branch an
        entry is masked when a uniform draw from ``torch.rand`` is below it.
    mecha, opt : choose the generator — ("MAR", "logistic"),
        ("MNAR", "logistic"), ("MNAR", "quantile") or ("MNAR", "selfmasked").
        Any other combination uses the uniform fallback.
    p_obs, q : passed through to the generators that take them.

    Returns
    -------
    dict with 'har1_init' (input as float64), 'har1_incomp' (input with NaN
    at masked positions, float64) and 'mask' (float64 mask).
    """
    if not torch.is_tensor(har1):
        har1 = torch.from_numpy(har1.astype(np.float32))

    scheme = (mecha, opt)
    if scheme == ("MAR", "logistic"):
        mask = MAR_mask(har1, p_miss, p_obs).double()
        print("mask values")
        print(mask)
    elif scheme == ("MNAR", "logistic"):
        mask = MNAR_mask_logistic(har1, p_miss, p_obs).double()
    elif scheme == ("MNAR", "quantile"):
        mask = MNAR_mask_quantiles(har1, p_miss, q, 1-p_obs).double()
    elif scheme == ("MNAR", "selfmasked"):
        mask = MNAR_self_mask_logistic(har1, p_miss).double()
    else:
        mask = (torch.rand(har1.shape) < p_miss).double()

    # Work on a copy so the returned 'har1_init' stays free of NaNs.
    har1_nas = har1.clone()
    har1_nas[mask.bool()] = np.nan

    return {'har1_init': har1.double(), 'har1_incomp': har1_nas.double(), 'mask': mask}



# Inject missing values into the full matrix with the MAR mechanism
# (p_miss=0.10, p_obs=0.5) and pull out the returned tensors.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.10, mecha="MAR", p_obs=0.5)
har1_mar = har1_miss_mar['har1_incomp']   # data with NaN at masked cells
R_mcar = har1_miss_mar['mask']            # mask tensor (MAR, despite the name)

print(har1_miss_mar['har1_init'])
print(har1_mar)
print(R_mcar)

# Masked cells as a percentage of all cells.
masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")

print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

 
# Write the masked matrix to disk as CSV.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

# First read-back: count the missing cells, then save a zero-filled copy.
tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil100.csv")

# Second read-back of the same file: this zero-filled frame is the one the
# ensemble is trained and tested on further down.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)



from sklearn.model_selection import train_test_split
#kfold =sklearn.model_selection.KFold(n_splits=10)

import pandas as pd
 
from sklearn.metrics import log_loss
#seed=7
#

# importing voting classifier
from sklearn.ensemble import VotingClassifier
 
    
#Edata10Mar = supp10_data2
# Feature table: the masked, zero-filled matrix read back from CSV.
Edata10Mar = testdata

# That matrix was built from every column of df_datab2, so it still holds a
# (partly masked) copy of the target. Drop it by position before training,
# otherwise the label leaks into the features. Position is used because the
# frame was rebuilt from a bare array and no longer has a 'class' column name.
class_pos = df_datab2.columns.get_loc("class")
X = Edata10Mar.drop(columns=Edata10Mar.columns[class_pos])

# The target comes from the clean frame, not from the masked copy.
y = df_datab2["class"]
feature_cols = list(X)
# split data
X_train, X_test, Y_train, Y_test = train_test_split(X, y)





# Ensemble members: the five B2 decision trees, labelled c1..c5.
# NOTE(review): VotingClassifier.fit presumably refits clones of these
# estimators on X_train, so the earlier per-subspace fits would not be
# reused — confirm against the scikit-learn documentation.
estimators = [
    ('c1', clf_dtcb21),
    ('c2', clf_dtcb22),
    ('c3', clf_dtcb23),
    ('c4', clf_dtcb24),
    ('c5', clf_dtcb25),
]

# Soft voting over the members, scored by plain accuracy on the held-out part.
vot_soft = VotingClassifier(estimators=estimators, voting='soft')
vot_soft.fit(X_train, Y_train)
y_pred = vot_soft.predict(X_test)
print(y_pred)

sscore = accuracy_score(Y_test, y_pred)
print(sscore)

from sklearn.metrics import classification_report

#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    test: ground-truth labels; pred: predicted labels. Nothing is returned.
    Scores are shown as percentages rounded to two decimals. Numbering starts
    at 2 because item 1 (the confusion matrix) is not printed.
    """
    macro_precision = precision_score(test, pred, average='macro')
    macro_recall = recall_score(test, pred, average='macro')
    macro_f1 = f1_score(test, pred, average='macro')

    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in (
        ("3. Precision:", macro_precision),
        ("4. Recall:", macro_recall),
        ("5. F1 Score:", macro_f1),
    ):
        print(label, round(score * 100, 2), "%")

    report = classification_report(test, pred)
    print("6. clasification report:\n", report)


Model_Performance(Y_test, y_pred)

In [None]:
import pandas as pd
import missingno as msno

# Reload the B2 dataset, dropping the CSV's first column and replacing
# missing entries with 0.
df_datab2 = (
    pd.read_csv("/kaggle/input/b2-dataset/B2-newmodifieddataset.csv")
    .iloc[:, 1:]
    .fillna(0)
)
df_datab2.info()

# `har` keeps the frame; `har1` is its float ndarray form, the input for the
# missing-value generator. No column is removed, so 'class' is included.
har = df_datab2
har1 = har.values.astype(float)


def produce_NA(har1, p_miss, mecha="MAR", opt="logistic", p_obs=None, q=None):
    """Mask entries of ``har1`` and return the original, masked data and mask.

    Parameters
    ----------
    har1 : numpy array or torch tensor. Arrays are cast to float32 and wrapped
        in a tensor; tensors are used as given.
    p_miss : passed to the selected mask generator. In the fallback branch an
        entry is masked when a uniform draw from ``torch.rand`` is below it.
    mecha, opt : choose the generator — ("MAR", "logistic"),
        ("MNAR", "logistic"), ("MNAR", "quantile") or ("MNAR", "selfmasked").
        Any other combination uses the uniform fallback.
    p_obs, q : passed through to the generators that take them.

    Returns
    -------
    dict with 'har1_init' (input as float64), 'har1_incomp' (input with NaN
    at masked positions, float64) and 'mask' (float64 mask).
    """
    if not torch.is_tensor(har1):
        har1 = torch.from_numpy(har1.astype(np.float32))

    scheme = (mecha, opt)
    if scheme == ("MAR", "logistic"):
        mask = MAR_mask(har1, p_miss, p_obs).double()
        print("mask values")
        print(mask)
    elif scheme == ("MNAR", "logistic"):
        mask = MNAR_mask_logistic(har1, p_miss, p_obs).double()
    elif scheme == ("MNAR", "quantile"):
        mask = MNAR_mask_quantiles(har1, p_miss, q, 1-p_obs).double()
    elif scheme == ("MNAR", "selfmasked"):
        mask = MNAR_self_mask_logistic(har1, p_miss).double()
    else:
        mask = (torch.rand(har1.shape) < p_miss).double()

    # Work on a copy so the returned 'har1_init' stays free of NaNs.
    har1_nas = har1.clone()
    har1_nas[mask.bool()] = np.nan

    return {'har1_init': har1.double(), 'har1_incomp': har1_nas.double(), 'mask': mask}



# Inject missing values into the full matrix with the MAR mechanism
# (p_miss=0.20, p_obs=0.5) and pull out the returned tensors.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.20, mecha="MAR", p_obs=0.5)
har1_mar = har1_miss_mar['har1_incomp']   # data with NaN at masked cells
R_mcar = har1_miss_mar['mask']            # mask tensor (MAR, despite the name)

print(har1_miss_mar['har1_init'])
print(har1_mar)
print(R_mcar)

# Masked cells as a percentage of all cells.
masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")

print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

 
# Write the masked matrix to disk as CSV.
# NOTE(review): the file keeps the '10mar' name although this cell injects
# missing values with p_miss=0.20.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

# First read-back: count the missing cells, then save a zero-filled copy.
tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil100.csv")

# Second read-back of the same file: this zero-filled frame is the one the
# ensemble is trained and tested on further down.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)



from sklearn.model_selection import train_test_split
#kfold =sklearn.model_selection.KFold(n_splits=10)

import pandas as pd
 
from sklearn.metrics import log_loss
#seed=7
#

# importing voting classifier
from sklearn.ensemble import VotingClassifier
 
    
#Edata10Mar = supp10_data2
# Feature table: the masked, zero-filled matrix read back from CSV.
Edata10Mar = testdata

# That matrix was built from every column of df_datab2, so it still holds a
# (partly masked) copy of the target. Drop it by position before training,
# otherwise the label leaks into the features. Position is used because the
# frame was rebuilt from a bare array and no longer has a 'class' column name.
class_pos = df_datab2.columns.get_loc("class")
X = Edata10Mar.drop(columns=Edata10Mar.columns[class_pos])

# The target comes from the clean frame, not from the masked copy.
y = df_datab2["class"]
feature_cols = list(X)
# split data
X_train, X_test, Y_train, Y_test = train_test_split(X, y)





# Ensemble members: the five B2 decision trees, labelled c1..c5.
# NOTE(review): VotingClassifier.fit presumably refits clones of these
# estimators on X_train, so the earlier per-subspace fits would not be
# reused — confirm against the scikit-learn documentation.
estimators = [
    ('c1', clf_dtcb21),
    ('c2', clf_dtcb22),
    ('c3', clf_dtcb23),
    ('c4', clf_dtcb24),
    ('c5', clf_dtcb25),
]

# Soft voting over the members, scored by plain accuracy on the held-out part.
vot_soft = VotingClassifier(estimators=estimators, voting='soft')
vot_soft.fit(X_train, Y_train)
y_pred = vot_soft.predict(X_test)
print(y_pred)

sscore = accuracy_score(Y_test, y_pred)
print(sscore)

from sklearn.metrics import classification_report

#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    test: ground-truth labels; pred: predicted labels. Nothing is returned.
    Scores are shown as percentages rounded to two decimals. Numbering starts
    at 2 because item 1 (the confusion matrix) is not printed.
    """
    macro_precision = precision_score(test, pred, average='macro')
    macro_recall = recall_score(test, pred, average='macro')
    macro_f1 = f1_score(test, pred, average='macro')

    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in (
        ("3. Precision:", macro_precision),
        ("4. Recall:", macro_recall),
        ("5. F1 Score:", macro_f1),
    ):
        print(label, round(score * 100, 2), "%")

    report = classification_report(test, pred)
    print("6. clasification report:\n", report)


Model_Performance(Y_test, y_pred)

In [None]:
import pandas as pd
import missingno as msno

# Reload the B2 dataset, dropping the CSV's first column and replacing
# missing entries with 0.
df_datab2 = (
    pd.read_csv("/kaggle/input/b2-dataset/B2-newmodifieddataset.csv")
    .iloc[:, 1:]
    .fillna(0)
)
df_datab2.info()

# `har` keeps the frame; `har1` is its float ndarray form, the input for the
# missing-value generator. No column is removed, so 'class' is included.
har = df_datab2
har1 = har.values.astype(float)


def produce_NA(har1, p_miss, mecha="MAR", opt="logistic", p_obs=None, q=None):
    """Mask entries of ``har1`` and return the original, masked data and mask.

    Parameters
    ----------
    har1 : numpy array or torch tensor. Arrays are cast to float32 and wrapped
        in a tensor; tensors are used as given.
    p_miss : passed to the selected mask generator. In the fallback branch an
        entry is masked when a uniform draw from ``torch.rand`` is below it.
    mecha, opt : choose the generator — ("MAR", "logistic"),
        ("MNAR", "logistic"), ("MNAR", "quantile") or ("MNAR", "selfmasked").
        Any other combination uses the uniform fallback.
    p_obs, q : passed through to the generators that take them.

    Returns
    -------
    dict with 'har1_init' (input as float64), 'har1_incomp' (input with NaN
    at masked positions, float64) and 'mask' (float64 mask).
    """
    if not torch.is_tensor(har1):
        har1 = torch.from_numpy(har1.astype(np.float32))

    scheme = (mecha, opt)
    if scheme == ("MAR", "logistic"):
        mask = MAR_mask(har1, p_miss, p_obs).double()
        print("mask values")
        print(mask)
    elif scheme == ("MNAR", "logistic"):
        mask = MNAR_mask_logistic(har1, p_miss, p_obs).double()
    elif scheme == ("MNAR", "quantile"):
        mask = MNAR_mask_quantiles(har1, p_miss, q, 1-p_obs).double()
    elif scheme == ("MNAR", "selfmasked"):
        mask = MNAR_self_mask_logistic(har1, p_miss).double()
    else:
        mask = (torch.rand(har1.shape) < p_miss).double()

    # Work on a copy so the returned 'har1_init' stays free of NaNs.
    har1_nas = har1.clone()
    har1_nas[mask.bool()] = np.nan

    return {'har1_init': har1.double(), 'har1_incomp': har1_nas.double(), 'mask': mask}



# Inject missing values into the full matrix with the MAR mechanism
# (p_miss=0.30, p_obs=0.5) and pull out the returned tensors.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.30, mecha="MAR", p_obs=0.5)
har1_mar = har1_miss_mar['har1_incomp']   # data with NaN at masked cells
R_mcar = har1_miss_mar['mask']            # mask tensor (MAR, despite the name)

print(har1_miss_mar['har1_init'])
print(har1_mar)
print(R_mcar)

# Masked cells as a percentage of all cells.
masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")

print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

 
# Write the masked matrix to disk as CSV.
# NOTE(review): the file keeps the '10mar' name although this cell injects
# missing values with p_miss=0.30.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

# First read-back: count the missing cells, then save a zero-filled copy.
tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil100.csv")

# Second read-back of the same file: this zero-filled frame is the one the
# ensemble is trained and tested on further down.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)



from sklearn.model_selection import train_test_split
#kfold =sklearn.model_selection.KFold(n_splits=10)

import pandas as pd
 
from sklearn.metrics import log_loss
#seed=7
#

# importing voting classifier
from sklearn.ensemble import VotingClassifier
 
    
#Edata10Mar = supp10_data2
# Feature table: the masked, zero-filled matrix read back from CSV.
Edata10Mar = testdata

# That matrix was built from every column of df_datab2, so it still holds a
# (partly masked) copy of the target. Drop it by position before training,
# otherwise the label leaks into the features. Position is used because the
# frame was rebuilt from a bare array and no longer has a 'class' column name.
class_pos = df_datab2.columns.get_loc("class")
X = Edata10Mar.drop(columns=Edata10Mar.columns[class_pos])

# The target comes from the clean frame, not from the masked copy.
y = df_datab2["class"]
feature_cols = list(X)
# split data
X_train, X_test, Y_train, Y_test = train_test_split(X, y)





# Ensemble members: the five B2 decision trees, labelled c1..c5.
# NOTE(review): VotingClassifier.fit presumably refits clones of these
# estimators on X_train, so the earlier per-subspace fits would not be
# reused — confirm against the scikit-learn documentation.
estimators = [
    ('c1', clf_dtcb21),
    ('c2', clf_dtcb22),
    ('c3', clf_dtcb23),
    ('c4', clf_dtcb24),
    ('c5', clf_dtcb25),
]

# Soft voting over the members, scored by plain accuracy on the held-out part.
vot_soft = VotingClassifier(estimators=estimators, voting='soft')
vot_soft.fit(X_train, Y_train)
y_pred = vot_soft.predict(X_test)
print(y_pred)

sscore = accuracy_score(Y_test, y_pred)
print(sscore)

from sklearn.metrics import classification_report

#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    test: ground-truth labels; pred: predicted labels. Nothing is returned.
    Scores are shown as percentages rounded to two decimals. Numbering starts
    at 2 because item 1 (the confusion matrix) is not printed.
    """
    macro_precision = precision_score(test, pred, average='macro')
    macro_recall = recall_score(test, pred, average='macro')
    macro_f1 = f1_score(test, pred, average='macro')

    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in (
        ("3. Precision:", macro_precision),
        ("4. Recall:", macro_recall),
        ("5. F1 Score:", macro_f1),
    ):
        print(label, round(score * 100, 2), "%")

    report = classification_report(test, pred)
    print("6. clasification report:\n", report)


Model_Performance(Y_test, y_pred)

In [None]:
import pandas as pd
import missingno as msno

# Reload the B2 dataset, dropping the CSV's first column and replacing
# missing entries with 0.
df_datab2 = (
    pd.read_csv("/kaggle/input/b2-dataset/B2-newmodifieddataset.csv")
    .iloc[:, 1:]
    .fillna(0)
)
df_datab2.info()

# `har` keeps the frame; `har1` is its float ndarray form, the input for the
# missing-value generator. No column is removed, so 'class' is included.
har = df_datab2
har1 = har.values.astype(float)


def produce_NA(har1, p_miss, mecha="MAR", opt="logistic", p_obs=None, q=None):
    """Mask entries of ``har1`` and return the original, masked data and mask.

    Parameters
    ----------
    har1 : numpy array or torch tensor. Arrays are cast to float32 and wrapped
        in a tensor; tensors are used as given.
    p_miss : passed to the selected mask generator. In the fallback branch an
        entry is masked when a uniform draw from ``torch.rand`` is below it.
    mecha, opt : choose the generator — ("MAR", "logistic"),
        ("MNAR", "logistic"), ("MNAR", "quantile") or ("MNAR", "selfmasked").
        Any other combination uses the uniform fallback.
    p_obs, q : passed through to the generators that take them.

    Returns
    -------
    dict with 'har1_init' (input as float64), 'har1_incomp' (input with NaN
    at masked positions, float64) and 'mask' (float64 mask).
    """
    if not torch.is_tensor(har1):
        har1 = torch.from_numpy(har1.astype(np.float32))

    scheme = (mecha, opt)
    if scheme == ("MAR", "logistic"):
        mask = MAR_mask(har1, p_miss, p_obs).double()
        print("mask values")
        print(mask)
    elif scheme == ("MNAR", "logistic"):
        mask = MNAR_mask_logistic(har1, p_miss, p_obs).double()
    elif scheme == ("MNAR", "quantile"):
        mask = MNAR_mask_quantiles(har1, p_miss, q, 1-p_obs).double()
    elif scheme == ("MNAR", "selfmasked"):
        mask = MNAR_self_mask_logistic(har1, p_miss).double()
    else:
        mask = (torch.rand(har1.shape) < p_miss).double()

    # Work on a copy so the returned 'har1_init' stays free of NaNs.
    har1_nas = har1.clone()
    har1_nas[mask.bool()] = np.nan

    return {'har1_init': har1.double(), 'har1_incomp': har1_nas.double(), 'mask': mask}



# Inject missing values into the full matrix with the MAR mechanism
# (p_miss=0.40, p_obs=0.5) and pull out the returned tensors.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.40, mecha="MAR", p_obs=0.5)
har1_mar = har1_miss_mar['har1_incomp']   # data with NaN at masked cells
R_mcar = har1_miss_mar['mask']            # mask tensor (MAR, despite the name)

print(har1_miss_mar['har1_init'])
print(har1_mar)
print(R_mcar)

# Masked cells as a percentage of all cells.
masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")

print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

 
# Write the masked matrix to disk as CSV.
# NOTE(review): the file keeps the '10mar' name although this cell injects
# missing values with p_miss=0.40.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

# First read-back: count the missing cells, then save a zero-filled copy.
tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil100.csv")

# Second read-back of the same file: this zero-filled frame is the one the
# ensemble is trained and tested on further down.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)



from sklearn.model_selection import train_test_split
#kfold =sklearn.model_selection.KFold(n_splits=10)

import pandas as pd
 
from sklearn.metrics import log_loss
#seed=7
#

# importing voting classifier
from sklearn.ensemble import VotingClassifier
 
    
#Edata10Mar = supp10_data2
# Feature table: the masked, zero-filled matrix read back from CSV.
Edata10Mar = testdata

# That matrix was built from every column of df_datab2, so it still holds a
# (partly masked) copy of the target. Drop it by position before training,
# otherwise the label leaks into the features. Position is used because the
# frame was rebuilt from a bare array and no longer has a 'class' column name.
class_pos = df_datab2.columns.get_loc("class")
X = Edata10Mar.drop(columns=Edata10Mar.columns[class_pos])

# The target comes from the clean frame, not from the masked copy.
y = df_datab2["class"]
feature_cols = list(X)
# split data
X_train, X_test, Y_train, Y_test = train_test_split(X, y)





# Ensemble members: the five B2 decision trees, labelled c1..c5.
# NOTE(review): VotingClassifier.fit presumably refits clones of these
# estimators on X_train, so the earlier per-subspace fits would not be
# reused — confirm against the scikit-learn documentation.
estimators = [
    ('c1', clf_dtcb21),
    ('c2', clf_dtcb22),
    ('c3', clf_dtcb23),
    ('c4', clf_dtcb24),
    ('c5', clf_dtcb25),
]

# Soft voting over the members, scored by plain accuracy on the held-out part.
vot_soft = VotingClassifier(estimators=estimators, voting='soft')
vot_soft.fit(X_train, Y_train)
y_pred = vot_soft.predict(X_test)
print(y_pred)

sscore = accuracy_score(Y_test, y_pred)
print(sscore)

from sklearn.metrics import classification_report

#print(classification_report(Y_test, y_pred))
def Model_Performance(test,pred):
    """Print accuracy, macro precision/recall/F1 and the classification report.

    test: ground-truth labels; pred: predicted labels. Nothing is returned.
    Scores are shown as percentages rounded to two decimals. Numbering starts
    at 2 because item 1 (the confusion matrix) is not printed.
    """
    macro_precision = precision_score(test, pred, average='macro')
    macro_recall = recall_score(test, pred, average='macro')
    macro_f1 = f1_score(test, pred, average='macro')

    accuracy_pct = round(accuracy_score(test, pred) * 100, 2)
    print("\n2. Accuracy Score:", accuracy_pct, "%")
    for label, score in (
        ("3. Precision:", macro_precision),
        ("4. Recall:", macro_recall),
        ("5. F1 Score:", macro_f1),
    ):
        print(label, round(score * 100, 2), "%")

    report = classification_report(test, pred)
    print("6. clasification report:\n", report)


Model_Performance(Y_test, y_pred)

In [None]:
import pandas as pd
import missingno as msno

# Reload the B2 dataset, dropping the CSV's first column and replacing
# missing entries with 0.
df_datab2 = (
    pd.read_csv("/kaggle/input/b2-dataset/B2-newmodifieddataset.csv")
    .iloc[:, 1:]
    .fillna(0)
)
df_datab2.info()

# `har` keeps the frame; `har1` is its float ndarray form, the input for the
# missing-value generator. No column is removed, so 'class' is included.
har = df_datab2
har1 = har.values.astype(float)


def produce_NA(har1, p_miss, mecha="MAR", opt="logistic", p_obs=None, q=None):
    """Mask entries of ``har1`` and return the original, masked data and mask.

    Parameters
    ----------
    har1 : numpy array or torch tensor. Arrays are cast to float32 and wrapped
        in a tensor; tensors are used as given.
    p_miss : passed to the selected mask generator. In the fallback branch an
        entry is masked when a uniform draw from ``torch.rand`` is below it.
    mecha, opt : choose the generator — ("MAR", "logistic"),
        ("MNAR", "logistic"), ("MNAR", "quantile") or ("MNAR", "selfmasked").
        Any other combination uses the uniform fallback.
    p_obs, q : passed through to the generators that take them.

    Returns
    -------
    dict with 'har1_init' (input as float64), 'har1_incomp' (input with NaN
    at masked positions, float64) and 'mask' (float64 mask).
    """
    if not torch.is_tensor(har1):
        har1 = torch.from_numpy(har1.astype(np.float32))

    scheme = (mecha, opt)
    if scheme == ("MAR", "logistic"):
        mask = MAR_mask(har1, p_miss, p_obs).double()
        print("mask values")
        print(mask)
    elif scheme == ("MNAR", "logistic"):
        mask = MNAR_mask_logistic(har1, p_miss, p_obs).double()
    elif scheme == ("MNAR", "quantile"):
        mask = MNAR_mask_quantiles(har1, p_miss, q, 1-p_obs).double()
    elif scheme == ("MNAR", "selfmasked"):
        mask = MNAR_self_mask_logistic(har1, p_miss).double()
    else:
        mask = (torch.rand(har1.shape) < p_miss).double()

    # Work on a copy so the returned 'har1_init' stays free of NaNs.
    har1_nas = har1.clone()
    har1_nas[mask.bool()] = np.nan

    return {'har1_init': har1.double(), 'har1_incomp': har1_nas.double(), 'mask': mask}



# Inject missing values into the full matrix with the MAR mechanism
# (p_miss=0.50, p_obs=0.5) and pull out the returned tensors.
har1_miss_mar = produce_NA(har1=har1, p_miss=0.50, mecha="MAR", p_obs=0.5)
har1_mar = har1_miss_mar['har1_incomp']   # data with NaN at masked cells
R_mcar = har1_miss_mar['mask']            # mask tensor (MAR, despite the name)

print(har1_miss_mar['har1_init'])
print(har1_mar)
print(R_mcar)

# Masked cells as a percentage of all cells.
masked_share = (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100
print("Percentage of generated missing values: ", masked_share, " %")

print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

 
# Write the masked matrix to disk as CSV.
# NOTE(review): the file keeps the '10mar' name although this cell injects
# missing values with p_miss=0.50.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
x_df.to_csv('10mar_test.csv')

from warnings import filterwarnings
filterwarnings("ignore")

# First read-back: count the missing cells, then save a zero-filled copy.
tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
tdf = tdf.fillna(0)
tdf.to_csv("data_fil100.csv")

# Second read-back of the same file: this zero-filled frame is the one the
# ensemble is trained and tested on further down.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:]
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata = testdata.fillna(0)



from sklearn.model_selection import train_test_split
#kfold =sklearn.model_selection.KFold(n_splits=10)

import pandas as pd
 
from sklearn.metrics import log_loss
#seed=7
#

# importing voting classifier
from sklearn.ensemble import VotingClassifier
 
    
#Edata10Mar = supp10_data2
# Feature table: the masked, zero-filled matrix read back from CSV.
Edata10Mar = testdata

# That matrix was built from every column of df_datab2, so it still holds a
# (partly masked) copy of the target. Drop it by position before training,
# otherwise the label leaks into the features. Position is used because the
# frame was rebuilt from a bare array and no longer has a 'class' column name.
class_pos = df_datab2.columns.get_loc("class")
X = Edata10Mar.drop(columns=Edata10Mar.columns[class_pos])

# The target comes from the clean frame, not from the masked copy.
y = df_datab2["class"]
feature_cols = list(X)
# split data
X_train, X_test, Y_train, Y_test = train_test_split(X, y)





# Ensemble members: the five B2 decision trees, labelled c1..c5.
# NOTE(review): VotingClassifier.fit presumably refits clones of these
# estimators on X_train, so the earlier per-subspace fits would not be
# reused — confirm against the scikit-learn documentation.
estimators = [
    ('c1', clf_dtcb21),
    ('c2', clf_dtcb22),
    ('c3', clf_dtcb23),
    ('c4', clf_dtcb24),
    ('c5', clf_dtcb25),
]

# Soft voting over the members, scored by plain accuracy on the held-out part.
vot_soft = VotingClassifier(estimators=estimators, voting='soft')
vot_soft.fit(X_train, Y_train)
y_pred = vot_soft.predict(X_test)
print(y_pred)

sscore = accuracy_score(Y_test, y_pred)
print(sscore)

from sklearn.metrics import classification_report

#print(classification_report(Y_test, y_pred))
def Model_Performance(test, pred):
    """Print a short evaluation summary for a set of predictions.

    Reports accuracy and macro-averaged precision, recall and F1 (each as a
    percentage rounded to two decimals), followed by the classification
    report.

    Args:
        test: True labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. All results go to standard output.
    """
    def _pct(score):
        return round(score * 100, 2)

    # All macro scores are computed up front, before the first line is printed.
    macro_scores = (
        ("3. Precision:", precision_score(test, pred, average='macro')),
        ("4. Recall:", recall_score(test, pred, average='macro')),
        ("5. F1 Score:", f1_score(test, pred, average='macro')),
    )
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", _pct(accuracy_score(test, pred)), "%")
    for heading, score in macro_scores:
        print(heading, _pct(score), "%")
    print("6. clasification report:\n", classification_report(test, pred))


Model_Performance(Y_test, y_pred)

# Now let's combine the ECs built on both balanced datasets into an EC of ECs and test it with different percentages of missing values


In [None]:
import pandas as pd
import missingno as msno

# Reload the preprocessed dataset. .iloc[:, 1:] drops the first CSV column
# (presumably a saved index -- confirm against the file), and any remaining
# missing value is replaced by 0.
df_datab2 = pd.read_csv("/kaggle/input/actual-after-preprocesing-datast/actual-newmodifieddataset.csv").iloc[:, 1:]

df_datab2 = df_datab2.fillna(0)

df_datab2.info()


# Keep the label out of the matrix that is perturbed below: that matrix later
# becomes X while y is df_datab2["class"], so leaving "class" in would hand the
# target to the classifiers as a feature.
har = df_datab2.drop('class', axis=1)
har1 = har.values
har1


# produce_NA works on floating-point data.
har1 = har1.astype(float)
har1


def produce_NA(har1, p_miss, mecha="MAR", opt="logistic", p_obs=None, q=None):
    """Return ``har1`` together with a copy in which masked entries are NaN.

    Args:
        har1: Data as a torch tensor, or an array that ``torch.from_numpy``
            accepts (it is cast to float32 before conversion).
        p_miss: Passed to the selected mask generator. In the fallback branch
            an entry is masked when a uniform random draw is below it.
        mecha: Together with ``opt`` selects the mask generator:
            ("MAR", "logistic") -> MAR_mask,
            ("MNAR", "logistic") -> MNAR_mask_logistic,
            ("MNAR", "quantile") -> MNAR_mask_quantiles,
            ("MNAR", "selfmasked") -> MNAR_self_mask_logistic.
            Every other combination uses the uniform random fallback.
        opt: See ``mecha``.
        p_obs: Forwarded to MAR_mask and MNAR_mask_logistic; MNAR_mask_quantiles
            receives ``1 - p_obs``.
        q: Forwarded to MNAR_mask_quantiles.

    Returns:
        dict with three tensors:
            'har1_init'   -- the input converted to float64,
            'har1_incomp' -- float64 copy with NaN where the mask is true,
            'mask'        -- the mask itself, as float64.
    """
    if torch.is_tensor(har1):
        data = har1
    else:
        data = torch.from_numpy(har1.astype(np.float32))

    scheme = (mecha, opt)
    if scheme == ("MAR", "logistic"):
        mask = MAR_mask(data, p_miss, p_obs).double()
        # This branch also echoes the generated mask.
        print("mask values")
        print(mask)
    elif scheme == ("MNAR", "logistic"):
        mask = MNAR_mask_logistic(data, p_miss, p_obs).double()
    elif scheme == ("MNAR", "quantile"):
        mask = MNAR_mask_quantiles(data, p_miss, q, 1-p_obs).double()
    elif scheme == ("MNAR", "selfmasked"):
        mask = MNAR_self_mask_logistic(data, p_miss).double()
    else:
        # Fallback: every entry is masked independently of the data.
        mask = (torch.rand(data.shape) < p_miss).double()

    # Work on a clone so the caller's tensor is left untouched.
    incomplete = data.clone()
    incomplete[mask.bool()] = np.nan

    return {
        'har1_init': data.double(),
        'har1_incomp': incomplete.double(),
        'mask': mask,
    }



# Inject missing values into har1 through produce_NA with mecha="MAR"
# (p_miss=0.40, p_obs=0.5). The returned dict holds the float64 input
# ('har1_init'), the copy with NaN at masked positions ('har1_incomp') and
# the mask itself ('mask').
har1_miss_mar = produce_NA(har1=har1, p_miss=0.40, mecha="MAR", p_obs=0.5)
 


#print(har1_mar) 
print(har1_miss_mar['har1_init'])
# Incomplete version of the data; it is exported to CSV further down.
har1_mar = har1_miss_mar['har1_incomp']
print(har1_mar)  
#har1_mar = pd.DataFrame.from_dict(har1_mar)
# NOTE(review): the name says "mcar", but this is the mask returned by the MAR call above.
R_mcar = har1_miss_mar['mask']
print(R_mcar)
 

# Sum of the mask divided by its number of elements, shown as a percentage
# (assumes the mask holds 1 for a missing cell and 0 otherwise -- TODO confirm).
print("Percentage of generated missing values: ", (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100, " %")

print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

 
# Round-trip the incomplete tensor through a CSV file so that the following
# steps work on a pandas DataFrame.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
# to_csv is called without index=False, so the row index is stored as the first column.
# NOTE(review): the file is named "10mar" although this cell generated p_miss=0.40.
x_df.to_csv('10mar_test.csv')

# Installs a global "ignore" filter: no warnings are shown from here on.
from warnings import filterwarnings
filterwarnings("ignore")

# Reload the file; .iloc[:, 1:] discards the first column (the index written above).
tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
tdf.head()

tdf.shape
# Count the missing cells and print the frame summary before they are filled.
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
# Replace every missing cell with 0 and save the filled table.
tdf=tdf.fillna(0)
tdf.to_csv("data_fil100.csv")

tdf



# Second, independent load of the same CSV; this copy is the one used as model input below.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:] 
 
testdata.head(10)
#testdata=testdata.drop(['16'],axis=1)
testdata

# Total number of missing cells, then (as a bare expression) the per-column counts.
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata.isnull().sum(0)

# Same zero fill as applied to tdf above.
testdata=testdata.fillna(0)
testdata



from sklearn.model_selection import train_test_split
#kfold =sklearn.model_selection.KFold(n_splits=10)

import pandas as pd
 
from sklearn.metrics import log_loss
#seed=7
#

# importing voting classifier
from sklearn.ensemble import VotingClassifier
 
    
#Edata10Mar = supp10_data2
# Features: the zero-filled table reloaded from CSV.
# Target: the "class" column of df_datab2.
Edata10Mar = testdata
Edata10Mar

X = Edata10Mar
y = df_datab2["class"]
X
feature_cols = list(X.columns)
y

# Random hold-out split with train_test_split's default proportions; no seed
# is passed, so the partition differs from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(X, y)

# Nine ensemble members: the four clf_dtc* classifiers followed by the five
# clf_dtcb2* classifiers, each paired with the label used by the voter.
# NOTE(review): scikit-learn documents that VotingClassifier.fit trains clones
# of the supplied estimators, so any earlier fitting of these objects is
# presumably not reused -- confirm that this is intended.
estimators = [
    ('c1', clf_dtc2),
    ('c2', clf_dtc3),
    ('c3', clf_dtc4),
    ('c4', clf_dtc5),
    ('c5', clf_dtcb21),
    ('c6', clf_dtcb22),
    ('c7', clf_dtcb23),
    ('c8', clf_dtcb24),
    ('c9', clf_dtcb25),
]

# Disabled hard-voting variant, kept as an inert string literal.
'''voting = VotingClassifier(estimators=estimators,voting='hard')
vot_hard=voting.fit(X_train, Y_train)

y_pred=vot_hard.predict(X_test)
print(y_pred)
score=accuracy_score(Y_test,y_pred)
#print("hard voting score %d" % score)
print(score)'''

# Soft voting: fit on the training part, predict the held-out part.
vot_soft = VotingClassifier(estimators=estimators, voting='soft')
vot_soft.fit(X_train, Y_train)
y_pred = vot_soft.predict(X_test)
print(y_pred)

# Plain accuracy on the held-out rows.
sscore = accuracy_score(Y_test, y_pred)
print(sscore)

from sklearn.metrics import classification_report

#print(classification_report(Y_test, y_pred))
def Model_Performance(test, pred):
    """Print a short evaluation summary for a set of predictions.

    Reports accuracy and macro-averaged precision, recall and F1 (each as a
    percentage rounded to two decimals), followed by the classification
    report.

    Args:
        test: True labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. All results go to standard output.
    """
    def _pct(score):
        return round(score * 100, 2)

    # All macro scores are computed up front, before the first line is printed.
    macro_scores = (
        ("3. Precision:", precision_score(test, pred, average='macro')),
        ("4. Recall:", recall_score(test, pred, average='macro')),
        ("5. F1 Score:", f1_score(test, pred, average='macro')),
    )
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", _pct(accuracy_score(test, pred)), "%")
    for heading, score in macro_scores:
        print(heading, _pct(score), "%")
    print("6. clasification report:\n", classification_report(test, pred))


Model_Performance(Y_test, y_pred)

# Now apply other classifiers on the actual data 

In [None]:
import pandas as pd
import missingno as msno

# Reload the preprocessed dataset: drop the first CSV column and replace any
# remaining missing value with 0.
df_datab2 = (
    pd.read_csv("/kaggle/input/actual-after-preprocesing-datast/actual-newmodifieddataset.csv")
    .iloc[:, 1:]
    .fillna(0)
)

df_datab2.info()

# Feature table: every column except the "class" label.
orig_dataset = df_datab2.drop(columns='class')
orig_dataset
import pandas as pd
import missingno as msno
har = orig_dataset
# Plain array of the feature values.
har1 = har.values
har1


# Cast to float before missing values are generated.
har1 = har1.astype(float)
har1

# Inject missing values into har1 through produce_NA with mecha="MAR"
# (p_miss=0.10, p_obs=0.5). The returned dict holds the float64 input
# ('har1_init'), the copy with NaN at masked positions ('har1_incomp') and
# the mask itself ('mask').
har1_miss_mar = produce_NA(har1=har1, p_miss=0.10, mecha="MAR", p_obs=0.5)
 


#print(har1_mar) 
print(har1_miss_mar['har1_init'])
# Incomplete version of the data; it is exported to CSV further down.
har1_mar = har1_miss_mar['har1_incomp']
print(har1_mar)  
#har1_mar = pd.DataFrame.from_dict(har1_mar)
# NOTE(review): the name says "mcar", but this is the mask returned by the MAR call above.
R_mcar = har1_miss_mar['mask']
print(R_mcar)
 

# Sum of the mask divided by its number of elements, shown as a percentage
# (assumes the mask holds 1 for a missing cell and 0 otherwise -- TODO confirm).
print("Percentage of generated missing values: ", (R_mcar.sum()).numpy()/np.prod(R_mcar.size())*100, " %")

print(har1_mar)
print('Shape Test:\t{}\n'.format(har1_mar.shape))

import torch
import pandas as pd
import numpy as np

 
# Round-trip the incomplete tensor through a CSV file so that the following
# steps work on a pandas DataFrame.
x_np = har1_mar.numpy()
x_df = pd.DataFrame(x_np)
# to_csv is called without index=False, so the row index is stored as the first column.
x_df.to_csv('10mar_test.csv')

# Installs a global "ignore" filter: no warnings are shown from here on.
from warnings import filterwarnings
filterwarnings("ignore")

# Reload the file; .iloc[:, 1:] discards the first column (the index written above).
tdf = pd.read_csv("./10mar_test.csv").iloc[:, 1:]
tdf.head()

tdf.shape
# Count the missing cells and print the frame summary before they are filled.
print('Null Values In DataFrame: {}\n'.format(tdf.isna().sum().sum()))
tdf.info()
# Replace every missing cell with 0 and save the filled table.
tdf=tdf.fillna(0)
tdf.to_csv("data_fil10.csv")

tdf



# Second, independent load of the same CSV; this copy is the one used as model input below.
testdata = pd.read_csv('./10mar_test.csv').iloc[:, 1:] 
 
testdata.head(10)
#testdata=testdata.drop(['16'],axis=1)
testdata

# Total number of missing cells, then (as a bare expression) the per-column counts.
print('Null Values In DataFrame: {}\n'.format(testdata.isna().sum().sum()))
testdata.isnull().sum(0)

# Same zero fill as applied to tdf above.
testdata=testdata.fillna(0)
testdata
from sklearn.model_selection import train_test_split
#kfold =sklearn.model_selection.KFold(n_splits=10)

import pandas as pd
 
from sklearn.metrics import log_loss
#seed=7
#

# importing voting classifier
from sklearn.ensemble import VotingClassifier
 
    
#Edata10Mar = supp10_data2
# Features: the zero-filled table reloaded from CSV.
# Target: the "class" column of df_datab2.
Edata10Mar = testdata
Edata10Mar
X = Edata10Mar
y = df_datab2["class"]
X
feature_cols = list(X.columns)
y

# Hold out 33% of the rows; random_state=None leaves the shuffle unseeded,
# so the partition differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=None,
)

# Single decision tree with default settings, fitted on the training part.
# NOTE(review): this rebinds clf_dtc5, a name the nine-member ensemble earlier
# in the file also lists as one of its estimators.
clf_dtc5 = DecisionTreeClassifier().fit(X_train, y_train)

# Predict the held-out rows.
y_pred = clf_dtc5.predict(X_test)
print(y_pred)
def Model_Performance(test, pred):
    """Print a short evaluation summary for a set of predictions.

    Reports accuracy and macro-averaged precision, recall and F1 (each as a
    percentage rounded to two decimals), followed by the classification
    report.

    Args:
        test: True labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. All results go to standard output.
    """
    def _pct(score):
        return round(score * 100, 2)

    # All macro scores are computed up front, before the first line is printed.
    macro_scores = (
        ("3. Precision:", precision_score(test, pred, average='macro')),
        ("4. Recall:", recall_score(test, pred, average='macro')),
        ("5. F1 Score:", f1_score(test, pred, average='macro')),
    )
    # Numbering starts at 2: item 1 (the confusion matrix) is not printed.
    print("\n2. Accuracy Score:", _pct(accuracy_score(test, pred)), "%")
    for heading, score in macro_scores:
        print(heading, _pct(score), "%")
    print("6. clasification report:\n", classification_report(test, pred))


Model_Performance(y_test, y_pred)