In [1]:
import pandas as pd

# Loading datasets

In [2]:
dataset1 = pd.read_csv('Dataset/Dataset1.csv')
dataset1_unknown = pd.read_csv('Dataset/Dataset1_Unknown.csv')

dataset2 = pd.read_csv('Dataset/Dataset2.csv')
dataset2_unknown = pd.read_csv('Dataset/Dataset2_Unknown.csv')

dataset3 = pd.read_csv('Dataset/Dataset3.csv')
dataset3_unknown = pd.read_csv('Dataset/Dataset3_Unknown.csv')

# Dataset 1

## Concating dataset1 and dataset1_unknown

In [3]:
dataset = pd.concat([dataset1, dataset1_unknown])

In [4]:
dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [5]:
print(f"Maximum capital gain {dataset['capital-gain'].max()}")
print(f"Maximum capital loss {dataset['capital-loss'].max()}")
print(f"Maximum hours per week {dataset['hours-per-week'].max()}")
print(f"Maximum fnlwgt {dataset['fnlwgt'].max()}")

Maximum capital gain 99999
Maximum capital loss 4356
Maximum hours per week 99
Maximum fnlwgt 1484705


## Converting numerical data to categorical data
### Age
- age <= 19: **Teenager**
- 20 <= age < 50: **Adult**
- 50 <= age < 65: **Middle Age**
- 65 <= age: **Elderly**

### Capital Gain
- capital gain < 33,333: **Low**
- 33,333 <= capital gain < 66,666: **Medium**
- 66,666 <= capital gain: **High**

### Capital Loss
- capital loss < 1,452: **Low**
- 1,452 <= capital loss < 2,904: **Medium**
- 2,904 <= capital loss: **High**

### Final Weight
- fnlwgt < 494,901: **Low**
- 494,901 <= fnlwgt < 989,802: **Medium**
- 989,802 <= fnlwgt: **High**

### Hours Per Week
- hours per week < 33: **Short**
- 33 <= hours per week < 66: **Medium**
- 66 <= hours per week: **Large**

In [6]:
age = dataset['age'].copy()
age.loc[dataset['age'] <= 19] = "Teenager"
age.loc[(20 <= dataset['age']) & (dataset['age'] < 50)] = "Adult"
age.loc[(50 <= dataset['age']) & (dataset['age'] < 65)] = "Middle Age"
age.loc[dataset['age'] >= 65] = "Elderly"
dataset['age'] = age

capital_gain = dataset['capital-gain'].copy()
capital_gain.loc[dataset['capital-gain'] < 33333] = "Low"
capital_gain.loc[(33333 <= dataset['capital-gain']) & (dataset['capital-gain'] < 66666)] = "Medium"
capital_gain.loc[dataset['capital-gain'] >= 66666] = "High"
dataset['capital-gain'] = capital_gain

capital_loss = dataset['capital-loss'].copy()
capital_loss.loc[dataset['capital-loss'] < 1452] = "Low"
capital_loss.loc[(1452 <= dataset['capital-loss']) & (dataset['capital-loss'] < 2904)] = "Medium"
capital_loss.loc[dataset['capital-loss'] >= 2904] = "High"
dataset['capital-loss'] = capital_loss

final_weight = dataset['fnlwgt'].copy()
final_weight.loc[dataset['fnlwgt'] < 494901] = "Low"
final_weight.loc[(494901 <= dataset['fnlwgt']) & (dataset['fnlwgt'] < 989802)] = "Medium"
final_weight.loc[dataset['fnlwgt'] >= 989802] = "High"
dataset['fnlwgt'] = final_weight

hours_per_week = dataset['hours-per-week'].copy()
hours_per_week.loc[dataset['hours-per-week'] < 33] = "Short"
hours_per_week.loc[(33 <= dataset['hours-per-week']) & (dataset['hours-per-week'] < 66)] = "Medium"
hours_per_week.loc[dataset['hours-per-week'] >= 66] = "Large"
dataset['hours-per-week'] = hours_per_week

## Encoding categorical data using one-hot encoder

In [7]:
dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,Adult,State-gov,Low,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,Low,Low,Medium,United-States,<=50K
1,Middle Age,Self-emp-not-inc,Low,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,Low,Low,Short,United-States,<=50K
2,Middle Age,Private,Low,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,Low,Low,Medium,United-States,<=50K
3,Adult,Private,Low,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Low,Low,Medium,Cuba,<=50K
4,Adult,Private,Low,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,Low,Low,Medium,United-States,<=50K


In [8]:
categorial_features = [column for column in dataset if column != 'income']

dataset = pd.get_dummies(dataset, columns=categorial_features)

dataset = dataset[[column for column in dataset if column != 'income'] + ['income']]  # move income to the last column
dataset['income'] = pd.factorize(dataset['income'])[0]  # encode <=50k as 0 and > 50k as 1

In [9]:
dataset1 = dataset.head(dataset1.shape[0]).copy()
dataset1_unknown = dataset.tail(dataset1_unknown.shape[0]).copy()
dataset1_unknown.drop('income', axis=1, inplace=True)

In [10]:
dataset1_unknown.head()

Unnamed: 0,age_Adult,age_Elderly,age_Middle Age,age_Teenager,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
dataset1.to_csv('DatasetModified/dataset1.csv', index=False)
dataset1_unknown.to_csv('DatasetModified/dataset1_unknown.csv', index=False)

# Dataset2

In [12]:
dataset = pd.concat([dataset2, dataset2_unknown])

In [13]:
dataset.head()

Unnamed: 0,poisonous,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
3,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
4,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g


In [14]:
categorial_features = [column for column in dataset if column != 'poisonous']

dataset = pd.get_dummies(dataset, columns=categorial_features)

dataset = dataset[[column for column in dataset
                     if column != 'poisonous'] + ['poisonous']]  # move poisonous to the last column
dataset['poisonous'] = pd.factorize(dataset['poisonous'])[0]  # convert poisonous values to classes

In [15]:
dataset2 = dataset.head(dataset2.shape[0]).copy()
dataset2_unknown = dataset.tail(dataset2_unknown.shape[0]).copy()
dataset2_unknown.drop('poisonous', axis=1, inplace=True)

In [16]:
dataset2.to_csv('DatasetModified/dataset2.csv', index=False)
dataset2_unknown.to_csv('DatasetModified/dataset2_unknown.csv', index=False)

# Dataset 3

In [17]:
dataset = pd.concat([dataset3, dataset3_unknown])

In [18]:
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
0,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1.0
1,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1.0
2,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1.0
3,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1.0
4,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1.0


In [19]:
print(f"Minimum trestbps {dataset['trestbps'].min()}")
print(f"Maximum trestbps {dataset['trestbps'].max()}")
print('-----------------')
print(f"Minimum cholestrol {dataset['chol'].min()}")
print(f"Maximum cholestrol {dataset['chol'].max()}")
print('-----------------')
print(f"Minimum thalach {dataset['thalach'].min()}")
print(f"Maximum thalach {dataset['thalach'].max()}")
print('-----------------')
print(f"Maximum oldpeak {dataset['oldpeak'].max()}")

Minimum trestbps 94
Maximum trestbps 200
-----------------
Minimum cholestrol 126
Maximum cholestrol 564
-----------------
Minimum thalach 71
Maximum thalach 202
-----------------
Maximum oldpeak 6.2


## Converting numerical data to categorical data
### Age
- age <= 19: **Teenager**
- 20 <= age < 50: **Adult**
- 50 <= age < 65: **Middle Age**
- 65 <= age: **Elderly**

### trestbps
- trestbps <= 129: **Low**
- 129 <= trestbps < 164: **Medium**
- 164 <= trestbps: **High**

### cholestrol
- cholestrol < 272: **Low**
- 272 <= cholestrol < 418: **Medium**
- 418 <= cholestrol: **High**

### thalach
- thalach < 114: **Low**
- 114 <= thalach < 157: **Medium**
- 157 <= thalach: **High**

### oldpeak
- oldpeak < 2: **Low**
- 2 <= oldpeak < 4: **Medium**
- 4 <= oldpeak: **High**

In [20]:
age = dataset['age'].copy()
age.loc[dataset['age'] <= 19] = "Teenager"
age.loc[(20 <= dataset['age']) & (dataset['age'] < 50)] = "Adult"
age.loc[(50 <= dataset['age']) & (dataset['age'] < 65)] = "Middle Age"
age.loc[dataset['age'] >= 65] = "Elderly"
dataset['age'] = age

trestbps = dataset['trestbps'].copy()
trestbps.loc[dataset['trestbps'] < 129] = "Low"
trestbps.loc[(129 <= dataset['trestbps']) & (dataset['trestbps'] < 164)] = "Medium"
trestbps.loc[dataset['trestbps'] >= 164] = "High"
dataset['trestbps'] = trestbps

chol = dataset['chol'].copy()
chol.loc[dataset['chol'] < 272] = "Low"
chol.loc[(272 <= dataset['chol']) & (dataset['chol'] < 418)] = "Medium"
chol.loc[dataset['chol'] >= 418] = "High"
dataset['chol'] = chol

thalach = dataset['thalach'].copy()
thalach.loc[dataset['thalach'] < 114] = "Low"
thalach.loc[(114 <= dataset['thalach']) & (dataset['thalach'] < 157)] = "Medium"
thalach.loc[dataset['thalach'] >= 157] = "High"
dataset['thalach'] = thalach

oldpeak = dataset['oldpeak'].copy()
oldpeak.loc[dataset['oldpeak'] < 33] = "Low"
oldpeak.loc[(33 <= dataset['oldpeak']) & (dataset['oldpeak'] < 66)] = "Medium"
oldpeak.loc[dataset['oldpeak'] >= 66] = "High"
dataset['oldpeak'] = oldpeak

In [21]:
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
0,Adult,1,2,Medium,Low,0,1,High,0,Low,0,0,2,1.0
1,Middle Age,1,1,Low,Low,0,1,High,0,Low,2,0,2,1.0
2,Middle Age,0,0,Low,Medium,0,1,High,1,Low,2,0,2,1.0
3,Middle Age,1,0,Medium,Low,0,1,Medium,0,Low,1,0,1,1.0
4,Middle Age,0,1,Medium,Medium,0,0,Medium,0,Low,1,0,2,1.0


## Encoding categorical data using one-hot encoder

In [22]:
categorial_features = [column for column in dataset if column != 'disease']

dataset = pd.get_dummies(dataset, columns=categorial_features)

dataset = dataset[[column for column in dataset if column != 'disease'] + ['disease']]  # move disease to the last column

In [23]:
dataset3 = dataset.head(dataset3.shape[0]).copy()
dataset3_unknown = dataset.tail(dataset3_unknown.shape[0]).copy()
dataset3_unknown.drop('disease', axis=1, inplace=True)

In [24]:
dataset3.to_csv('DatasetModified/dataset3.csv', index=False)
dataset3_unknown.to_csv('DatasetModified/dataset3_unknown.csv', index=False)