### Imports

In [15]:
import pandas as pd 
import numpy as np

### Reading dataset

In [16]:
# Load the raw dataset from adult.csv (a relative path, so it is resolved
# against the current working directory).
df = pd.read_csv(filepath_or_buffer='adult.csv')

### Dropping duplicates

In [17]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each. Done in place so `df` stays the same object.
df.drop_duplicates(subset=None, keep='first', inplace=True)

### Handling missing values (dropping rows and imputing)

In [18]:
# '?' cells are converted to NaN so pandas' missing-value tools can see them.
df.replace(to_replace='?', value=np.nan, inplace=True)

# Rows without an occupation are discarded outright.
df.dropna(subset=['occupation'], inplace=True)

# Missing native-country values are imputed with the most frequent country.
native_country_mode = df['native-country'].mode().iloc[0]
df['native-country'] = df['native-country'].fillna(native_country_mode)

### Fixing skewness of numeric data

In [19]:
# Columns that are passed through the power transform below.
numeric_columns = [
    'age',
    'fnlwgt',
    'educational-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

In [20]:
from scipy.stats import yeojohnson
from sklearn.preprocessing import PowerTransformer

# Fit a Yeo-Johnson power transform on the numeric columns to reduce skew.
pt = PowerTransformer(method='yeo-johnson')
numeric_transformed = pt.fit_transform(df[numeric_columns])

# The transformer returns a bare array, so the new frame gets a fresh
# 0..n-1 index instead of `df`'s index; anything combined with it by index
# later must be re-indexed the same way.
df_numeric_fixed_skew = pd.DataFrame(data=numeric_transformed, columns=numeric_columns)

### Feature engineering

In [21]:
# Collapse individual countries into a handful of coarse regions to cut the
# cardinality of `native-country` before it is encoded. Any value that is not
# listed in a group below is left unchanged.
native_country_groups = {
    # America
    'US': ['United-States'],

    # Europe
    'Europe': ['Greece', 'Holand-Netherlands', 'Poland', 'England', 'Yugoslavia',
               'Germany', 'Italy', 'Ireland', 'Hungary', 'France', 'Scotland',
               'Portugal'],

    # Asia
    # 'South' was missing from every group, so it would have leaked through as
    # its own category. NOTE(review): in the Adult data this truncated label is
    # commonly read as South Korea -- confirm 'Asia' is the intended bucket.
    'Asia': ['Vietnam', 'China', 'Taiwan', 'India', 'Philippines', 'Japan',
             'Hong', 'Cambodia', 'Laos', 'Thailand', 'South'],

    # Others
    'Others': ['Mexico', 'Trinadad&Tobago', 'Canada', 'Puerto-Rico',
               'Honduras', 'Cuba', 'Peru', 'Nicaragua', 'Dominican-Republic',
               'Haiti', 'El-Salvador', 'Columbia', 'Guatemala', 'Jamaica', 'Ecuador',
               'Outlying-US(Guam-USVI-etc)', 'Iran'],
}

# Applied one group at a time, in the order written above. No region label
# appears in any country list, so the passes cannot re-map each other's output.
for region_label, member_countries in native_country_groups.items():
    df["native-country"] = df["native-country"].replace(member_countries, region_label)


In [22]:
# Merge the detailed marital statuses into two broad labels. Statuses that are
# not listed here pass through unchanged.
marital_status_groups = {
    'Single': ['Divorced', 'Separated', 'Widowed'],
    'Married': ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
}
for merged_status, raw_statuses in marital_status_groups.items():
    df["marital-status"] = df["marital-status"].replace(raw_statuses, merged_status)

In [23]:
# Reduce `relationship` to three labels: 'Separated', 'Married' and 'Single'.
df["relationship"] = (
    df["relationship"]
    .replace(['Not-in-family', 'Other-relative'], 'Separated')
    .replace(['Husband', 'Wife'], 'Married')
    .replace(['Unmarried', 'Own-child'], 'Single')
)

In [24]:
# Fold the listed race labels into a single 'Other' bucket; labels not listed
# here are kept as they are.
races_merged_into_other = ['Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other']
df["race"] = df["race"].replace(to_replace=races_merged_into_other, value='Other')

In [25]:
# Every work class listed below is collapsed into one label; classes not
# listed are left untouched.
# NOTE(review): the label is spelled 'govermental' (sic) and the group also
# contains self-employed, unpaid and never-worked classes. The literal is kept
# as-is because later code may compare against it -- confirm before renaming.
workclasses_to_merge = [
    'Self-emp-not-inc',
    'Local-gov',
    'State-gov',
    'Self-emp-inc',
    'Federal-gov',
    'Without-pay',
    'Never-worked',
]
df["workclass"] = df["workclass"].replace(to_replace=workclasses_to_merge, value='govermental')

In [26]:
# Bucket the detailed education levels into three coarse tiers. Levels that are
# not listed in any tier are left unchanged.
# NOTE(review): if the data contains a plain high-school-graduate level it is
# not covered here and will remain a separate category -- confirm intent.
education_tiers = {
    'high-school': ['Prof-school', 'Assoc-acdm', 'Assoc-voc'],
    'college': ['Some-college', 'Doctorate', 'Bachelors', 'Masters'],
    'pre-hs': ['7th-8th', '10th', '11th', '1st-4th', '5th-6th', '12th',
               '9th', 'Preschool'],
}
for tier_label, raw_levels in education_tiers.items():
    df["education"] = df["education"].replace(raw_levels, tier_label)


### Encoding non-numerical data

In [27]:
from sklearn.preprocessing import LabelEncoder

# Take every non-numeric column and renumber the rows 0..n-1 so the result can
# later be combined by index with the transformed numeric frame.
df_object = df.select_dtypes(exclude=[np.number]).reset_index(drop=True)
object_columns = list(df_object.columns)

# Integer-encode each column with its own encoder. Reusing a single encoder
# would overwrite its fitted classes on every iteration, leaving no way to
# decode (or consistently re-encode) any column but the last one.
# `label_encoders[col].inverse_transform(...)` recovers the original labels.
le = LabelEncoder()
label_encoders = {}
for col in object_columns:
    le = LabelEncoder()
    df_object[col] = le.fit_transform(df_object[col])
    label_encoders[col] = le
# As before, `le` ends up as the encoder fitted on the last column.

### Concatenating object and numeric data

In [28]:
# Stitch the transformed numeric columns and the encoded categorical columns
# back together row by row. Both frames carry a plain positional index, so an
# inner join on the index pairs up the same rows.
df_cleaned = df_numeric_fixed_skew.merge(
    df_object,
    how='inner',
    left_index=True,
    right_index=True,
)