In [91]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [65]:
def preprocess(path):
  """Load a dataset CSV and apply the clean-up shared by all datasets.

  Steps, in order:
    1. Read ``path`` with ``pd.read_csv``.
    2. Drop every column named ``'index'`` or whose name starts with
       ``'Unnamed:'``.
    3. Replace the placeholder strings ``'?'`` and ``'unknown'`` with NaN.
    4. Only when the frame has a ``'Smokes'`` column: cast every
       object-dtype column to float.

  Parameters:
    path: location of the CSV file, passed straight to ``pd.read_csv``.

  Returns:
    The cleaned DataFrame.

  Raises:
    Whatever ``astype(float)`` raises when an object column of a frame
    containing ``'Smokes'`` still holds a value that is not float-parsable.
  """
  frame = pd.read_csv(path)

  # Bookkeeping columns left behind by earlier exports of the CSV.
  leftovers = [name for name in frame.columns
               if name.startswith('Unnamed:') or name == 'index']
  frame = frame.drop(columns=leftovers)

  # Both placeholder strings stand for a missing value.
  frame = frame.replace('?', np.nan).replace('unknown', np.nan)

  # The float cast is gated on the presence of a 'Smokes' column; frames
  # without it keep their object columns untouched.
  if 'Smokes' in frame.columns:
    for name in frame.columns:
      if frame[name].dtype == object:
        frame[name] = frame[name].astype(float)
  return frame

In [90]:
def processCategorical(df,choice):
  """Fill missing values in the categorical columns of dataset 3.

  When ``choice`` is ``'3'``, each of the columns ``job``, ``marital``,
  ``education``, ``housing``, ``loan`` and ``default`` has its missing
  entries replaced by that column's most frequent value. For any other
  ``choice`` the frame is returned untouched.

  Parameters:
    df: DataFrame to impute; its columns are reassigned, so the caller's
      object is updated.
    choice: dataset selector string; only ``'3'`` triggers imputation.

  Returns:
    The same ``df`` object.

  Raises:
    KeyError: if ``choice`` is ``'3'`` and one of the six columns is absent.
  """
  if choice != '3':
    return df

  for col in ('job', 'marital', 'education', 'housing', 'loan', 'default'):
    modes = df[col].mode()
    if modes.empty:
      # Every value in the column is missing, so there is nothing to
      # impute with; leave the column as it is instead of failing.
      continue
    # Assign the filled column back to the frame. Calling
    # fillna(inplace=True) on the df[col] selection is chained assignment,
    # which is not guaranteed to write through to df.
    df[col] = df[col].fillna(modes.iloc[0])
  return df

In [57]:
def handleEmptyValues(df):
  """Report the rows of ``df`` that have 14 or more missing values.

  Counts the null cells of every row and, for each row whose count is at
  least 14, prints the row's 0-based position (not its index label)
  followed by the marker string. The frame is not modified and nothing is
  returned.
  """
  missing_per_row = df.isnull().sum(axis=1)
  for position, missing in enumerate(missing_per_row):
    if missing >= 14:
      print(position, 'he;;p')

In [106]:
def normalize(df):
  """Print the column dtypes of ``df`` and return it standardised.

  A fresh ``StandardScaler`` is fitted on ``df`` and the value of its
  ``fit_transform`` call is returned as-is.
  NOTE(review): with scikit-learn's default output settings that value is
  presumably a NumPy array, i.e. column labels are lost -- confirm before
  feeding the result to code that expects a DataFrame.
  """
  print(df.dtypes)
  scaler = StandardScaler()
  scaled = scaler.fit_transform(df)
  return scaled

In [59]:
def removeColumns(df,choice):
  """Drop a fixed, dataset-specific set of columns from ``df`` in place.

  The shape of the frame is printed before and after the removal.

  Parameters:
    df: DataFrame to prune; it is modified in place.
    choice: dataset selector. ``'1'`` and ``'2'`` pick their own column
      lists; any other value falls back to the list used for dataset 3.

  Returns:
    The same ``df`` object, without the dropped columns.

  Raises:
    KeyError: if one of the selected columns is not present in ``df``.
  """
  columns_by_choice = {
    # 'STDs:AIDS' used to be listed twice here; each label is now named once.
    '1': [
      'STDs',
      'STDs (number)',
      'STDs: Time since first diagnosis',
      'STDs: Time since last diagnosis',
      'Dx:HPV',
      'STDs:AIDS',
      'STDs:cervical condylomatosis',
      'STDs:vulvo-perineal condylomatosis',
    ],
    '2': ['histogram_mean', 'histogram_median'],
  }
  # Every choice other than '1' and '2' is treated as dataset 3.
  fallback_columns = ['emp_var_rate', 'euribor3m']

  print('Shape before column removal =',df.shape)
  df.drop(columns=columns_by_choice.get(choice, fallback_columns), inplace=True)
  print('Shape after column removal =',df.shape)
  return df

In [60]:
def convertFormat(df,choice):
  """Collect the names of the object-dtype columns of ``df``.

  ``choice`` is accepted for signature compatibility but is not used.

  Returns:
    A tuple ``(df, names)`` where ``df`` is the unmodified input frame and
    ``names`` lists, in column order, every column whose dtype is object.
  """
  object_columns = [name for name in df.columns if df[name].dtype == object]
  return df, object_columns

In [61]:
def outlierRemoval(df,choice):
  """Drop hand-picked outlier rows from ``df`` in place.

  Each dataset has an ordered list of rules; a rule maps the current frame
  to a boolean mask, and the rows it flags are dropped before the next rule
  is evaluated. The shape is printed before and after.

  Parameters:
    df: DataFrame to filter; rows are dropped in place by index label.
    choice: dataset selector. ``'1'`` and ``'2'`` have their own rules; any
      other value uses the rules written for dataset 3.

  Returns:
    The same ``df`` object with the flagged rows removed.
  """
  print('Shape before outlier removal =',df.shape)

  if choice=='1':
    rules = [
      lambda d: d['Age'] > 55,
    ]
  elif choice=='2':
    rules = [
      lambda d: (d['baseline value'] < 120) & (d['fetal_health'] == 3),
      lambda d: (d['fetal_movement'] > 0.1) & (d['fetal_health'] == 2),
      lambda d: d['mean_value_of_short_term_variability'] > 5,
      lambda d: (d['mean_value_of_long_term_variability'] > 25) & (d['fetal_health'] == 2),
      lambda d: d['mean_value_of_long_term_variability'] > 30,
      lambda d: d['histogram_max'] > 220,
      lambda d: d['histogram_number_of_zeroes'] > 6,
    ]
  else:
    rules = [
      lambda d: (d['campaign'] > 22) & (d['y'] == 1),
      lambda d: d['age'] > 95,
      lambda d: d['duration'] >= 2500,
      lambda d: (d['euribor3m'] > 3.0) & (d['y'] == 1) & (d['euribor3m'] < 4),
      lambda d: d['campaign'] > 35,
    ]

  # Apply one rule at a time so each mask is computed on the rows that
  # survived the previous rules.
  for rule in rules:
    flagged = df.loc[rule(df)].index
    df.drop(flagged, inplace=True)

  print('Shape after outlier removal =',df.shape)
  return df

In [62]:
def trainTestSplit(df, frac=0.8, random_state=86):
  """Split ``df`` into a random training part and the remaining test part.

  Parameters:
    df: DataFrame to split; it is not modified.
    frac: fraction of rows sampled into the training set (default 0.8).
    random_state: seed passed to ``DataFrame.sample`` so the split is
      reproducible (default 86).

  Returns:
    A tuple ``(train, test)``. ``train`` holds the sampled rows in sampled
    order; ``test`` holds every row whose index label was not sampled, in
    the original order.

  Note:
    The test part is built by dropping the sampled index labels, so the
    two parts only partition ``df`` when its index labels are unique.
  """
  train = df.sample(frac=frac, random_state=random_state)
  test = df.drop(train.index)
  return train, test

In [115]:
from sklearn.metrics.pairwise import DataConversionWarning
def run(datasetChoice=None):
  """Run the preprocessing pipeline for one dataset and return the frame.

  Parameters:
    datasetChoice: '1' (cancer detection), '2' (fetal health) or '3'
      (banking). When omitted, the menu is printed and the choice is read
      from standard input, as before. Passing it explicitly lets the
      notebook be re-run without manual input; non-string values are
      converted with ``str``.

  Returns:
    The DataFrame after, in order: ``preprocess``, ``convertFormat``,
    ``outlierRemoval``, ``removeColumns``, ``handleEmptyValues`` (which
    only reports) and ``processCategorical``.

  The CSV is read from ``'dataset' + datasetChoice + '.csv'`` relative to
  the current working directory.
  """
  if datasetChoice is None:
    print('Press 1 for Cancer Detection Dataset')
    print('Press 2 for Fetal Health Detection Dataset')
    print('Press 3 for Banking Dataset')
    datasetChoice = input()
  # The helper functions compare the choice against the strings '1'/'2'/'3'.
  datasetChoice = str(datasetChoice)

  path = 'dataset'+datasetChoice+'.csv'
  # Alternative location: path='./../data/dataset'+datasetChoice+'.csv'
  data = preprocess(path)
  data, categoricalFeatures = convertFormat(data,datasetChoice)
  # Outliers are removed before columns are dropped because the dataset-3
  # rules read 'euribor3m', which removeColumns deletes.
  data = outlierRemoval(data,datasetChoice)
  data = removeColumns(data,datasetChoice)
  handleEmptyValues(data)
  data = processCategorical(data,datasetChoice)
  # Normalisation is currently disabled: data = normalize(data)
  # NOTE(review): the split below is computed but not returned or stored;
  # only the full frame is handed back to the caller.
  train,test=trainTestSplit(data)
  return data
  

In [116]:
# Run the pipeline: run() prompts on stdin for the dataset number and returns the processed DataFrame.
df=run()

Press 1 for Cancer Detection Dataset
Press 2 for Fetal Health Detection Dataset
Press 3 for Banking Dataset
2
Shape before outlier removal = (2126, 22)
Shape after outlier removal = (2080, 22)
Shape before column removal = (2080, 22)
Shape after column removal = (2080, 20)


In [99]:
# Display the DataFrame bound to `df` as this cell's output.
df

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_variance,histogram_tendency,fetal_health
0,132,0.006,0.000,0.006,0.003,0.0,0.000,17,2.1,0,10.4,130,68,198,6,1,141,12,0,1
1,133,0.003,0.000,0.008,0.003,0.0,0.000,16,2.1,0,13.4,130,68,198,5,1,141,13,0,1
2,134,0.003,0.000,0.008,0.003,0.0,0.000,16,2.4,0,23.0,117,53,170,11,0,137,13,1,1
3,132,0.007,0.000,0.008,0.000,0.0,0.000,16,2.4,0,19.9,117,53,170,9,0,137,11,1,1
4,131,0.005,0.072,0.008,0.003,0.0,0.000,28,1.4,0,12.9,66,88,154,5,0,135,7,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,128,0.000,0.002,0.007,0.005,0.0,0.002,65,2.6,0,0.0,129,63,192,6,0,103,28,0,3
2122,128,0.000,0.002,0.007,0.005,0.0,0.002,66,2.7,0,0.0,129,63,192,4,0,105,21,0,3
2123,128,0.000,0.002,0.007,0.006,0.0,0.002,65,2.5,0,0.0,129,63,192,6,0,104,28,0,3
2124,128,0.000,0.003,0.007,0.006,0.0,0.002,65,2.5,0,0.0,129,63,192,5,0,114,25,0,3


In [102]:
# Read dataset2.csv directly with pandas, bypassing preprocess()
# (presumably to compare the raw file with the processed frame -- confirm).
d1=pd.read_csv('dataset2.csv')

In [103]:
# Display the DataFrame bound to `d1` as this cell's output.
d1

Unnamed: 0.1,Unnamed: 0,index,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,0,0,132,0.006,0.000,0.006,0.003,0.0,0.000,17,...,68,198,6,1,141,136,140,12,0,1
1,1,1,133,0.003,0.000,0.008,0.003,0.0,0.000,16,...,68,198,5,1,141,135,138,13,0,1
2,2,2,134,0.003,0.000,0.008,0.003,0.0,0.000,16,...,53,170,11,0,137,134,137,13,1,1
3,3,3,132,0.007,0.000,0.008,0.000,0.0,0.000,16,...,53,170,9,0,137,136,138,11,1,1
4,4,4,131,0.005,0.072,0.008,0.003,0.0,0.000,28,...,88,154,5,0,135,134,137,7,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,2121,671,128,0.000,0.002,0.007,0.005,0.0,0.002,65,...,63,192,6,0,103,92,114,28,0,3
2122,2122,672,128,0.000,0.002,0.007,0.005,0.0,0.002,66,...,63,192,4,0,105,89,112,21,0,3
2123,2123,673,128,0.000,0.002,0.007,0.006,0.0,0.002,65,...,63,192,6,0,104,94,115,28,0,3
2124,2124,674,128,0.000,0.003,0.007,0.006,0.0,0.002,65,...,63,192,5,0,114,97,116,25,0,3
