In [102]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

In [157]:
def preprocess(path):
  """Read a CSV file and normalise its missing-value markers.

  Steps, in order:
    1. Load the file at `path` with `pd.read_csv`.
    2. Drop every column named 'index' or whose name starts with 'Unnamed:'.
    3. Replace the literal cell values '?' and 'unknown' with NaN.
    4. If the frame has a 'Smokes' column, cast every object-dtype column
       to float.

  Parameters
  ----------
  path : path or buffer accepted by `pd.read_csv`.

  Returns
  -------
  pandas.DataFrame
      The cleaned frame.
  """
  df = pd.read_csv(path)

  junk_columns = [name for name in df.columns
                  if name.startswith('Unnamed:') or name == 'index']
  df.drop(junk_columns, axis=1, inplace=True)

  for marker in ('?', 'unknown'):
    df = df.replace(marker, np.nan)

  # The float cast is gated on the presence of a 'Smokes' column; frames
  # without it keep their object columns untouched.
  if 'Smokes' in list(df.columns):
    for name in df.columns:
      if df[name].dtype == object:
        df[name] = df[name].astype(float)
  return df

In [158]:
def handleEmptyValues(df,choice):
  """Drop rows that contain too many missing values (in place).

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to clean. It is modified in place and also returned.
  choice : str
      Dataset selector. '1' drops rows with 14 or more nulls, '3' drops rows
      with 5 or more nulls; any other value leaves the frame unchanged.

  Returns
  -------
  pandas.DataFrame
      The same `df` object, after the rows were removed.
  """
  # Minimum number of nulls in a row for that row to be dropped, per dataset.
  null_limits = {'1': 14, '3': 5}

  print('Shape before Null removal =',df.shape)
  limit = null_limits.get(choice)
  if limit is not None:
    null_counts = df.isnull().sum(axis=1)
    # Select rows by index *label*. `DataFrame.drop` is label-based, so using
    # positional row numbers would remove the wrong rows (or raise KeyError)
    # whenever the index is not 0..n-1, e.g. after earlier rows were dropped.
    df.drop(index=null_counts.index[null_counts >= limit], inplace=True)
  print('Shape after Null removal =',df.shape)
  return df

In [225]:
def _fill_nulls_with_train_mode(train, test, columns):
  """Fill nulls in `columns` of both frames with the most frequent train value."""
  for col in columns:
    mode_value = train[col].mode()[0]
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: under pandas Copy-on-Write the in-place call
    # works on a temporary object and leaves the frame unchanged.
    test[col] = test[col].fillna(mode_value)
    train[col] = train[col].fillna(mode_value)


def _min_max_scale(train, test):
  """Min-max scale all columns with a scaler fitted on `train`; returns new frames."""
  columnsList = list(train.columns)
  scaler = MinMaxScaler()
  train_scaled = pd.DataFrame(scaler.fit_transform(train), columns=columnsList)
  test_scaled = pd.DataFrame(scaler.transform(test), columns=columnsList)
  return train_scaled, test_scaled


def _transform_banking(train, test):
  """Impute, encode and scale the banking frames; the encoding is chosen at a prompt."""
  numeric = ['age', 'duration', 'campaign','pdays','previous','cons_price_idx','cons_conf_idx','nr_employed']
  categorical = ['job', 'marital', 'education','default','housing','loan','contact','month','day_of_week','poutcome']

  # Only these six categorical columns are imputed.
  _fill_nulls_with_train_mode(
      train, test, ['job', 'marital', 'education', 'housing', 'loan', 'default'])

  print('Press 1 for One Hot Encoding of Categorical Features')
  print('Press 2 for Simple Conversion of Categorical Features to Numerical Data')
  ch=int(input())

  if ch==1:
    # sparse_threshold=0 forces a dense array. With the default threshold the
    # column transformer may return a scipy sparse matrix, which the
    # DataFrame construction below is not written for.
    encoder = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), categorical),
        remainder='passthrough',
        sparse_threshold=0)
    train_encoded = encoder.fit_transform(train)
    feature_names = encoder.get_feature_names_out()
    train = pd.DataFrame(train_encoded, columns=feature_names)
    test = pd.DataFrame(encoder.transform(test), columns=feature_names)
    return _min_max_scale(train, test)

  scaler = make_column_transformer((MinMaxScaler(), numeric),remainder='passthrough')
  # The output is the scaled numeric columns followed by the passed-through
  # remainder. The names below assume the remainder is exactly the
  # categorical columns in this order followed by 'y' -- TODO confirm
  # against the input frame's column order.
  columnsList = numeric + categorical + ['y']
  train = pd.DataFrame(scaler.fit_transform(train), columns=columnsList)
  test = pd.DataFrame(scaler.transform(test), columns=columnsList)
  for col in categorical:
    # Codes are assigned in order of first appearance in the training data.
    codes = {key: code for code, key in enumerate(list(train[col].unique()))}
    train[col] = [codes[item] for item in train[col]]
    # A category that never occurs in `train` has no code; map it to -1
    # instead of failing with KeyError.
    test[col] = [codes.get(item, -1) for item in test[col]]
  return train,test


def _transform_numeric(train, test):
  """Impute nulls with train statistics (chosen at a prompt) and min-max scale."""
  print('Press 1 to Fill Null by Median')
  print('Press 2 to Fill Null by Mean')
  ch=int(input())
  # Statistics come from `train` only and are applied to both frames.
  fill_values = train.median() if ch==1 else train.mean()
  test.fillna(fill_values,inplace=True)
  train.fillna(fill_values,inplace=True)

  has_label = 'fetal_health' in train.columns
  if has_label:
    # Keep the class labels as they are before scaling. Recovering them
    # afterwards from the scaled values (0 -> 1, 0.5 -> 2, 1 -> 3) is only
    # correct when the training labels span exactly 1..3.
    train_labels = train['fetal_health'].to_numpy(dtype=float, copy=True)
    test_labels = test['fetal_health'].to_numpy(dtype=float, copy=True)

  train, test = _min_max_scale(train, test)

  if has_label:
    train['fetal_health'] = train_labels
    test['fetal_health'] = test_labels
  return train,test


def transform(train, test, choice):
  """Impute missing values and scale/encode the train and test frames.

  Every statistic (modes, medians/means, scaler ranges, category codes) is
  derived from `train` and then applied to both frames.

  For choice '3' the function:
    * fills nulls in 'job', 'marital', 'education', 'housing', 'loan' and
      'default' with the mode of the training column;
    * prompts on stdin: 1 one-hot encodes the categorical columns and
      min-max scales every resulting column; any other integer min-max
      scales the numeric columns and replaces each category by an integer
      code (a test category not seen in training gets -1).

  For any other choice the function prompts on stdin (1 = median, any other
  integer = mean), fills nulls with that training statistic, and min-max
  scales all columns. If a 'fetal_health' column is present it is returned
  with its unscaled values.

  Parameters
  ----------
  train, test : pandas.DataFrame
      Split frames. Null filling is applied to these objects in place.
  choice : str
      Dataset selector; '3' selects the categorical pipeline.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      New transformed train and test frames with a fresh default index.

  Raises
  ------
  ValueError
      If the value typed at the prompt is not an integer.
  """
  if choice == '3':
    return _transform_banking(train, test)
  return _transform_numeric(train, test)

In [160]:
def removeColumns(df,choice):
  """Drop a fixed, dataset-specific set of columns from `df` (in place).

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to prune. It is modified in place and also returned.
  choice : str
      '1' and '2' each select their own column list; every other value
      selects the third list ('emp_var_rate', 'euribor3m').

  Returns
  -------
  pandas.DataFrame
      The same `df` object with the selected columns removed.
  """
  # NOTE(review): 'STDs:AIDS' is listed twice here, exactly as in the
  # original list; possibly a different column was intended -- confirm.
  columns_for_1 = ['STDs','STDs (number)','STDs: Time since first diagnosis','STDs: Time since last diagnosis','Dx:HPV','STDs:AIDS','STDs:AIDS','STDs:cervical condylomatosis','STDs:vulvo-perineal condylomatosis']
  columns_for_2 = ['histogram_mean','histogram_median']
  columns_otherwise = ['emp_var_rate','euribor3m']

  print('Shape before column removal =',df.shape)
  unwanted = (columns_for_1 if choice=='1'
              else columns_for_2 if choice=='2'
              else columns_otherwise)
  df.drop(unwanted, axis=1, inplace=True)
  print('Shape after column removal =',df.shape)
  return df

In [161]:
def convertFormat(df,choice):
  """Return `df` unchanged together with the names of its object-dtype columns.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to inspect; it is not modified.
  choice : str
      Accepted for a uniform call signature; not used here.

  Returns
  -------
  (pandas.DataFrame, list)
      The input frame and the list of object-dtype column names, in column order.
  """
  object_columns = [name for name in df.columns if df[name].dtype == object]
  return df, object_columns

In [162]:
def outlierRemoval(df,choice):
  """Drop rows matching fixed, dataset-specific outlier rules (in place).

  Each rule is a boolean row filter. Rules are applied one after another,
  and each one is evaluated on the frame left by the previous rule.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to filter. It is modified in place and also returned.
  choice : str
      '1' and '2' each select their own rule set; every other value selects
      the third rule set (columns 'campaign', 'y', 'age', 'duration',
      'euribor3m').

  Returns
  -------
  pandas.DataFrame
      The same `df` object with the matching rows removed.
  """
  rules_for_1 = [
      lambda d: (d['Age']> 55),
  ]
  rules_for_2 = [
      lambda d: (d['baseline value']<120) & (d['fetal_health']==3),
      lambda d: (d['fetal_movement']>0.1) & (d['fetal_health']==2),
      lambda d: (d['mean_value_of_short_term_variability']>5),
      lambda d: (d['mean_value_of_long_term_variability']>25) & (d['fetal_health']==2),
      lambda d: (d['mean_value_of_long_term_variability']>30),
      lambda d: (d['histogram_max']>220),
      lambda d: (d['histogram_number_of_zeroes']>6),
  ]
  rules_otherwise = [
      lambda d: (d['campaign']> 22) & (d['y']==1 ),
      lambda d: d['age']> 95,
      lambda d: d['duration']>= 2500,
      lambda d: (d['euribor3m']>3.0) & (d['y']==1) & (d['euribor3m']<4),
      lambda d: (d['campaign']> 35),
  ]

  print('Shape before outlier removal =',df.shape)
  if choice=='1':
    active_rules = rules_for_1
  elif choice=='2':
    active_rules = rules_for_2
  else:
    active_rules = rules_otherwise

  for is_outlier in active_rules:
    # Drop by the labels of the matching rows, one rule at a time.
    df.drop(df.loc[is_outlier(df)].index,inplace=True)
  print('Shape after outlier removal =',df.shape)
  return df

In [163]:
def trainTestSplit(df, frac=0.8, random_state=86):
  """Split `df` into a random training sample and the remaining test rows.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to split; it is not modified.
  frac : float, default 0.8
      Fraction of rows sampled into the training frame.
  random_state : int, default 86
      Seed passed to `DataFrame.sample`, making the split reproducible.

  Returns
  -------
  (pandas.DataFrame, pandas.DataFrame)
      `train` (rows in sampled order) and `test` (all rows whose index label
      was not sampled, in their original order).

  Notes
  -----
  The test frame is built by dropping the sampled index labels, so the split
  is only a clean partition when the index labels of `df` are unique.
  """
  train = df.sample(frac=frac, random_state=random_state)
  test = df.drop(index=train.index)
  return train, test

In [206]:
from sklearn.metrics.pairwise import DataConversionWarning
def run():
  """Interactively run the whole preprocessing pipeline for one dataset.

  Prompts on stdin for a dataset number, loads 'dataset<number>.csv' from the
  current working directory, and then applies, in order: preprocess,
  convertFormat, outlierRemoval, removeColumns, handleEmptyValues,
  trainTestSplit and transform. `transform` asks further questions on stdin.

  Returns
  -------
  (data, train, test)
      `data` is the frame after null handling (before splitting);
      `train` and `test` are the frames returned by `transform`.
  """
  menu = (
      'Press 1 for Cancer Detection Dataset',
      'Press 2 for Fetal Health Detection Dataset',
      'Press 3 for Banking Dataset',
  )
  for entry in menu:
    print(entry)
  datasetChoice = input()

  data = preprocess(f'dataset{datasetChoice}.csv')
  # The list of categorical column names is computed but not used further.
  data, _categorical = convertFormat(data, datasetChoice)
  for stage in (outlierRemoval, removeColumns, handleEmptyValues):
    data = stage(data, datasetChoice)

  train, test = trainTestSplit(data)
  train, test = transform(train, test, datasetChoice)
  return data, train, test
  

In [226]:
# Run the interactive pipeline: `data` is the cleaned, unsplit frame and
# `train`/`test` are the transformed splits. The answers are typed at the prompts.
data,train,test=run()

Press 1 for Cancer Detection Dataset
Press 2 for Fetal Health Detection Dataset
Press 3 for Banking Dataset
3
Shape before outlier removal = (41188, 21)
Shape after outlier removal = (41150, 21)
Shape before column removal = (41150, 21)
Shape after column removal = (41150, 19)
Shape before Null removal = (41150, 19)
Shape after Null removal = (41148, 19)
Press 1 for One Hot Encoding of Categorical Features
Press 2 for Simple Conversion of Categorical Features to Numerical Data
1


In [227]:
# Display the cleaned frame returned by run() (before splitting and transforming).
data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,cons_price_idx,cons_conf_idx,nr_employed,y
0,44,blue-collar,married,basic.4y,,yes,no,cellular,aug,thu,210,1,999,0,nonexistent,93.444,-36.1,5228.1,0
1,53,technician,married,,no,no,no,cellular,nov,fri,138,1,999,0,nonexistent,93.200,-42.0,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,339,3,6,2,success,94.055,-39.8,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,185,2,999,0,nonexistent,93.075,-47.1,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,137,1,3,1,success,92.201,-31.4,5076.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,59,retired,married,high.school,,no,yes,telephone,jun,thu,222,1,999,0,nonexistent,94.465,-41.8,5228.1,0
41184,31,housemaid,married,basic.4y,,no,no,telephone,may,thu,196,2,999,0,nonexistent,93.994,-36.4,5191.0,0
41185,42,admin.,single,university.degree,,yes,yes,telephone,may,wed,62,3,999,0,nonexistent,93.994,-36.4,5191.0,0
41186,48,technician,married,professional.course,no,no,yes,telephone,oct,tue,200,2,999,0,nonexistent,92.431,-26.9,5017.5,0


In [228]:
# Display the transformed training split returned by run().
train

Unnamed: 0,onehotencoder__job_admin.,onehotencoder__job_blue-collar,onehotencoder__job_entrepreneur,onehotencoder__job_housemaid,onehotencoder__job_management,onehotencoder__job_retired,onehotencoder__job_self-employed,onehotencoder__job_services,onehotencoder__job_student,onehotencoder__job_technician,...,onehotencoder__poutcome_success,remainder__age,remainder__duration,remainder__campaign,remainder__pdays,remainder__previous,remainder__cons_price_idx,remainder__cons_conf_idx,remainder__nr_employed,remainder__y
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.389610,0.038616,0.029412,1.0,0.000000,0.389322,0.368201,0.877883,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.220779,0.523733,0.058824,1.0,0.000000,0.269680,0.192469,0.512287,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.246753,0.472245,0.000000,1.0,0.000000,0.340608,0.154812,0.512287,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.545455,0.055511,0.029412,1.0,0.000000,0.484412,0.615063,1.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.220779,0.106999,0.000000,1.0,0.000000,0.250195,0.033473,0.512287,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32913,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.545455,0.007643,0.000000,1.0,0.000000,0.622369,0.435146,0.877883,0.0
32914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.246753,0.122285,0.088235,1.0,0.000000,0.484412,0.615063,1.000000,0.0
32915,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.181818,0.008850,0.176471,1.0,0.142857,0.269680,0.192469,0.512287,0.0
32916,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.376623,0.037007,0.000000,1.0,0.000000,0.269680,0.192469,0.512287,0.0


In [210]:
# Display the transformed test split.
# NOTE(review): this cell's execution count (210) is lower than that of the
# run() cell (226), so the stored output comes from an earlier run -- re-run
# the notebook top to bottom before relying on it.
test

Unnamed: 0,age,duration,campaign,pdays,previous,cons_price_idx,cons_conf_idx,nr_employed,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,y
0,0.350649,0.084473,0.0,1.0,0.0,0.484412,0.615063,1.0,1,1,4,0,1,1,0,3,2,0,0
1,0.168831,0.027353,0.205882,1.0,0.0,0.669135,0.338912,1.0,3,0,4,0,1,1,0,5,0,0,0
2,0.285714,0.07683,0.0,1.0,0.0,0.26968,0.192469,0.512287,1,0,3,0,1,1,0,1,3,0,0
3,0.493506,0.149638,0.058824,1.0,0.0,0.484412,0.615063,1.0,3,1,0,0,0,1,0,3,4,0,1
4,0.116883,0.4107,0.0,1.0,0.0,0.882307,0.376569,1.0,0,1,5,0,0,0,1,6,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8225,0.181818,0.021319,0.058824,1.0,0.142857,0.174591,0.866109,0.203781,0,2,5,0,0,1,1,0,3,1,0
8226,0.337662,0.362832,0.029412,1.0,0.0,0.340608,0.154812,0.512287,7,1,5,0,1,1,0,2,1,0,0
8227,0.090909,0.047466,0.088235,1.0,0.142857,0.26968,0.192469,0.512287,0,1,5,0,1,1,0,1,2,1,0
8228,0.545455,0.0893,0.0,1.0,0.0,0.882307,0.376569,1.0,6,1,5,0,0,0,1,6,2,0,0


In [147]:
# Display the transformed test split.
# NOTE(review): this cell's execution count (147) predates the run() cell
# (226), and the stored output shows fetal-health columns, so it comes from
# an earlier run on a different dataset -- re-run before relying on it.
test

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_variance,histogram_tendency,fetal_health
0,0.444444,0.315789,0.848233,0.266667,0.333333,0.0,0.2,0.120000,0.437500,0.0,0.269625,0.654088,0.155963,0.571429,0.388889,0.0,0.653543,0.299213,0.5,0.0
1,0.444444,0.263158,0.975052,0.333333,0.266667,0.0,0.2,0.226667,0.312500,0.0,0.266212,0.685535,0.137615,0.604396,0.333333,0.2,0.574803,0.106299,0.5,0.0
2,0.407407,0.263158,0.883576,0.200000,0.200000,0.0,0.4,0.186667,0.312500,0.0,0.228669,0.867925,0.064220,0.835165,0.500000,0.0,0.543307,0.133858,0.5,0.0
3,0.148148,0.263158,0.000000,0.333333,0.200000,0.0,0.0,0.160000,0.625000,0.0,0.552901,0.823899,0.018349,0.703297,0.444444,0.0,0.448819,0.074803,0.5,0.0
4,0.296296,0.000000,0.037422,0.200000,0.333333,0.0,0.0,0.133333,0.875000,0.0,0.518771,0.924528,0.000000,0.857143,0.388889,0.0,0.464567,0.094488,0.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,0.500000,0.000000,0.000000,0.466667,0.066667,0.0,0.6,0.640000,0.625000,0.0,0.000000,0.641509,0.091743,0.472527,0.333333,0.0,0.259843,0.165354,0.5,1.0
412,0.500000,0.000000,0.000000,0.666667,0.200000,0.0,0.4,0.640000,0.583333,0.0,0.000000,0.635220,0.073394,0.439560,0.277778,0.0,0.511811,0.354331,0.5,1.0
413,0.500000,0.000000,0.000000,0.666667,0.200000,0.0,0.4,0.653333,0.541667,0.0,0.000000,0.641509,0.064220,0.439560,0.333333,0.0,0.511811,0.374016,0.5,1.0
414,0.407407,0.000000,0.004158,0.466667,0.333333,0.0,0.4,0.706667,0.500000,0.0,0.000000,0.792453,0.119266,0.769231,0.333333,0.0,0.338583,0.110236,0.5,1.0
