## Preprocessing Procedure for the Adult Dataset

In [117]:
#Importing necessary packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


### Dataframe Creation

In [118]:
# Load every raw line of the adult data file into memory
with open('dataset/adult.data', 'r') as adult_data:
    row_data_array = adult_data.readlines()

# Keep a separate list of the raw rows; these are broken up into columns next
data_rows = list(row_data_array)

# Cut each raw row at its commas so that every field lands in its own column
dataset_array = [data_row.split(',') for data_row in data_rows]

In [119]:
# Wrap the list of split rows in a pandas DataFrame for the later processing steps
adult_dataset = pd.DataFrame(data=dataset_array)

adult_dataset.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K\n
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K\n
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K\n
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K\n
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K\n


In [120]:
# Replace the default integer column labels with the attribute names listed in
# the adult.names data file so the dataset is easier to read
adult_dataset.columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per_week',
    'native-country',
    'income-per-year',
]

# Preview the first rows now that the columns are named
adult_dataset.head(n=5)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per_week,native-country,income-per-year
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K\n
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K\n
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K\n
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K\n
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K\n


In [121]:
# Persist the named-but-not-yet-cleaned dataframe as a CSV file for future reference.
# The row index is written as well (to_csv defaults to index=True).
adult_dataset.to_csv(path_or_buf='dataset/adult_dataset.csv')

### Data Cleaning

##### __Null values__

In [122]:
# General information about the dataset: index range, column names,
# non-null count and dtype of each column, and memory usage
adult_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32562 entries, 0 to 32561
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32562 non-null  object
 1   workclass        32561 non-null  object
 2   fnlwgt           32561 non-null  object
 3   education        32561 non-null  object
 4   education-num    32561 non-null  object
 5   marital-status   32561 non-null  object
 6   occupation       32561 non-null  object
 7   relationship     32561 non-null  object
 8   race             32561 non-null  object
 9   sex              32561 non-null  object
 10  capital-gain     32561 non-null  object
 11  capital-loss     32561 non-null  object
 12  hours-per_week   32561 non-null  object
 13  native-country   32561 non-null  object
 14  income-per-year  32561 non-null  object
dtypes: object(15)
memory usage: 3.7+ MB


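From the summary above, `age` has 32562 non-null entries while every other column has 32561, so exactly one row is missing all of its other fields (that row is examined in the Structural Errors section below). Apart from that single row there are no null values, so there is __no need to fill__ any __null values__ in this dataset. We can also observe that every column currently has the `object` data type.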

##### __Duplicate values__

In [123]:
## Show every row that exactly repeats an earlier row of the dataset.
print(adult_dataset.loc[adult_dataset.duplicated(keep='first')])


      age          workclass   fnlwgt      education education-num  \
4881   25            Private   308144      Bachelors            13   
5104   90            Private    52386   Some-college            10   
9171   21            Private   250051   Some-college            10   
11631  20            Private   107658   Some-college            10   
13084  25            Private   195994        1st-4th             2   
15059  21            Private   243368      Preschool             1   
17040  46            Private   173243        HS-grad             9   
18555  30            Private   144593        HS-grad             9   
18698  19            Private    97261        HS-grad             9   
21318  19            Private   138153   Some-college            10   
21490  19            Private   146679   Some-college            10   
21875  49            Private    31267        7th-8th             4   
22300  25            Private   195994        1st-4th             2   
22367  44           

There are some ___duplicated rows___ in the dataset. So let's get rid of them.

In [124]:
# Keep only the first occurrence of each repeated row.
adult_dataset = adult_dataset.drop_duplicates(keep='first')

# List the duplicates once more to confirm that none are left in the dataset.
print(adult_dataset.loc[adult_dataset.duplicated(keep='first')])


Empty DataFrame
Columns: [age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per_week, native-country, income-per-year]
Index: []


Now the dataset has __no duplicate values__.

##### __Structural Errors__

In [125]:
# Count the distinct values found in each column of the dataset.
print(adult_dataset.nunique(axis=0))

age                   74
workclass              9
fnlwgt             21648
education             16
education-num         16
marital-status         7
occupation            15
relationship           6
race                   5
sex                    2
capital-gain         119
capital-loss          92
hours-per_week        94
native-country        42
income-per-year        2
dtype: int64


In [126]:
# For every column, print its name followed by the distinct values it holds
for column in adult_dataset.columns:
    print(column, adult_dataset[column].unique(), '\n', sep='\n')

age
['39' '50' '38' '53' '28' '37' '49' '52' '31' '42' '30' '23' '32' '40'
 '34' '25' '43' '54' '35' '59' '56' '19' '20' '45' '22' '48' '21' '24'
 '57' '44' '41' '29' '18' '47' '46' '36' '79' '27' '67' '33' '76' '17'
 '55' '61' '70' '64' '71' '68' '66' '51' '58' '26' '60' '90' '75' '65'
 '77' '62' '63' '80' '72' '74' '69' '73' '81' '78' '88' '82' '83' '84'
 '85' '86' '87' '\n']


workclass
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked' None]


fnlwgt
[' 77516' ' 83311' ' 215646' ... ' 84661' ' 257302' None]


education
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th' None]


education-num
[' 13' ' 9' ' 7' ' 14' ' 5' ' 10' ' 12' ' 11' ' 4' ' 16' ' 15' ' 3' ' 6'
 ' 2' ' 1' ' 8' None]


marital-status
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent

You can see that the `age` column contains a newline character (`'\n'`), which is not a valid age. Also, every other column ends with a `None` value, which seems suspicious. Let's check it.

In [127]:
# Inspect the final row of the dataset
adult_dataset.tail(n=1)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per_week,native-country,income-per-year
32561,\n,,,,,,,,,,,,,,


In [128]:
# Dropping the blank last row (a row, not a column). Its 'age' field holds only
# a newline and every other field is None.
# The row is located by its content instead of the hard-coded label 32561, so
# the cell can be re-run safely: drop(index=32561) raises KeyError once the row
# is gone, whereas dropping an empty set of labels is a no-op.
is_blank_row = adult_dataset['age'].str.strip() == ''
adult_dataset = adult_dataset.drop(index=adult_dataset.loc[is_blank_row].index)

adult_dataset.tail(1)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per_week,native-country,income-per-year
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K\n


There are question marks (`' ?'`) among the unique values of the `workclass`, `occupation`, and `native-country` columns, which do not carry any real meaning. So let's get rid of them by replacing them with missing values.

In [129]:
# Swap every ' ?' placeholder entry for a proper missing-value marker (pd.NA),
# modifying the dataframe in place
adult_dataset.replace(to_replace=' ?', value=pd.NA, inplace=True)

In [130]:
# Print the distinct values of every column again to confirm that the
# unwanted entries are gone
for column in adult_dataset.columns:
    print(column, adult_dataset[column].unique(), '\n', sep='\n')

age
['39' '50' '38' '53' '28' '37' '49' '52' '31' '42' '30' '23' '32' '40'
 '34' '25' '43' '54' '35' '59' '56' '19' '20' '45' '22' '48' '21' '24'
 '57' '44' '41' '29' '18' '47' '46' '36' '79' '27' '67' '33' '76' '17'
 '55' '61' '70' '64' '71' '68' '66' '51' '58' '26' '60' '90' '75' '65'
 '77' '62' '63' '80' '72' '74' '69' '73' '81' '78' '88' '82' '83' '84'
 '85' '86' '87']


workclass
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 <NA> ' Self-emp-inc' ' Without-pay' ' Never-worked']


fnlwgt
[' 77516' ' 83311' ' 215646' ... ' 34066' ' 84661' ' 257302']


education
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']


education-num
[' 13' ' 9' ' 7' ' 14' ' 5' ' 10' ' 12' ' 11' ' 4' ' 16' ' 15' ' 3' ' 6'
 ' 2' ' 1' ' 8']


marital-status
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' '

A newline character (`\n`) is appended to every value in the `income-per-year` column, so let's remove it.

In [133]:
# Clean the income-per-year column in one pass: first strip the '\n'
# characters from both ends, then strip the surrounding spaces
adult_dataset['income-per-year'] = (
    adult_dataset['income-per-year']
    .str.strip('\n')
    .str.strip(' ')
)

adult_dataset.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per_week,native-country,income-per-year
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
