# Data Transformations

### 1. Handling Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('adult_with_headers (1).csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# replacing ? with Nan
df = df.replace("?", pd.NA)

In [5]:
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [6]:
#seperating categorical and numerical columns
cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

#imputing median for numerical data
for col in num_cols:
  df[col].fillna(df[col].median(), inplace=True)

#imputing mode for categorical columns
for col in cat_cols:
  df[col].fillna(df[col].mode()[0], inplace=True)

df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [7]:
# Applying Scaling techniques to numerical columns

# Standard Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.030671,State-gov,-1.063611,Bachelors,1.134739,Never-married,Adm-clerical,Not-in-family,White,Male,0.148453,-0.21666,-0.035429,United-States,<=50K
1,0.837109,Self-emp-not-inc,-1.008707,Bachelors,1.134739,Married-civ-spouse,Exec-managerial,Husband,White,Male,-0.14592,-0.21666,-2.222153,United-States,<=50K
2,-0.042642,Private,0.245079,HS-grad,-0.42006,Divorced,Handlers-cleaners,Not-in-family,White,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
3,1.057047,Private,0.425801,11th,-1.197459,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,-0.14592,-0.21666,-0.035429,United-States,<=50K
4,-0.775768,Private,1.408176,Bachelors,1.134739,Married-civ-spouse,Prof-specialty,Wife,Black,Female,-0.14592,-0.21666,-0.035429,Cuba,<=50K


In [8]:
# Min Max Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.30137,State-gov,0.044302,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,White,Male,0.02174,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,0.048238,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,0.122449,United-States,<=50K
2,0.287671,Private,0.138113,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,0.397959,United-States,<=50K
3,0.493151,Private,0.151068,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,0.397959,United-States,<=50K
4,0.150685,Private,0.221488,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,0.397959,Cuba,<=50K


Standard Scaling is preferred when:


*   Data follows an approximately normal distribution
*   The algorithm is sensitive to feature variance

It is used because:


*   Improves numerical stability and convergence of models
*   Preserves outliers

Min-Max Scaling is preferred when:



*   A fixed range [0,1] is required
*   The algorithm is distance-based

*   Data does not contain many extreme outliers

It is used because:


*   Maintains original data distribution shape
*   Ensures all features lie within the same range











### 2. Encoding Techniques

In [11]:
# counting categorical columns
cat_counts = df[cat_cols].nunique()
cat_counts

Unnamed: 0,0
workclass,9
education,16
marital_status,7
occupation,15
relationship,6
race,5
sex,2
native_country,42
income,2


In [10]:
#selecting variables with less than 5 categories
cat_cols_5 = cat_counts[cat_counts < 5].index
cat_cols_5

Index(['sex', 'income'], dtype='object')

In [12]:
# applying one hot encoding technique
df_encoded = pd.get_dummies(df, columns=cat_cols_5, drop_first=True)
df_encoded.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_ Male,income_ >50K
0,0.30137,State-gov,0.044302,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,White,0.02174,0.0,0.397959,United-States,True,False
1,0.452055,Self-emp-not-inc,0.048238,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,0.0,0.0,0.122449,United-States,True,False
2,0.287671,Private,0.138113,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,0.0,0.0,0.397959,United-States,True,False
3,0.493151,Private,0.151068,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,Black,0.0,0.0,0.397959,United-States,True,False
4,0.150685,Private,0.221488,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,Black,0.0,0.0,0.397959,Cuba,False,False


In [13]:
#applying label encoding for categorical variables
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in cat_cols:
  df[col] = le.fit_transform(df[col].astype(str))
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.30137,7,0.044302,9,0.8,4,1,1,4,1,0.02174,0.0,0.397959,39,0
1,0.452055,6,0.048238,9,0.8,2,4,0,4,1,0.0,0.0,0.122449,39,0
2,0.287671,4,0.138113,11,0.533333,0,6,1,4,1,0.0,0.0,0.397959,39,0
3,0.493151,4,0.151068,1,0.4,2,6,0,2,1,0.0,0.0,0.397959,39,0
4,0.150685,4,0.221488,9,0.8,2,10,5,2,0,0.0,0.0,0.397959,5,0


### Data Exploration and Preprocessing

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32561 non-null  float64
 1   workclass       32561 non-null  int64  
 2   fnlwgt          32561 non-null  float64
 3   education       32561 non-null  int64  
 4   education_num   32561 non-null  float64
 5   marital_status  32561 non-null  int64  
 6   occupation      32561 non-null  int64  
 7   relationship    32561 non-null  int64  
 8   race            32561 non-null  int64  
 9   sex             32561 non-null  int64  
 10  capital_gain    32561 non-null  float64
 11  capital_loss    32561 non-null  float64
 12  hours_per_week  32561 non-null  float64
 13  native_country  32561 non-null  int64  
 14  income          32561 non-null  int64  
dtypes: float64(6), int64(9)
memory usage: 3.7 MB


In [15]:
df.describe()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,0.295639,3.868892,0.120545,10.29821,0.605379,2.611836,6.57274,1.446362,3.665858,0.669205,0.010777,0.020042,0.402423,36.718866,0.24081
std,0.186855,1.45596,0.071685,3.870264,0.171515,1.506222,4.228857,1.606771,0.848806,0.470506,0.073854,0.092507,0.125994,7.823782,0.427581
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.150685,4.0,0.071679,9.0,0.533333,2.0,3.0,0.0,4.0,0.0,0.0,0.0,0.397959,39.0,0.0
50%,0.273973,4.0,0.112788,11.0,0.6,2.0,7.0,1.0,4.0,1.0,0.0,0.0,0.397959,39.0,0.0
75%,0.424658,4.0,0.152651,12.0,0.733333,4.0,10.0,3.0,4.0,1.0,0.0,0.0,0.44898,39.0,0.0
max,1.0,8.0,1.0,15.0,1.0,6.0,14.0,5.0,4.0,1.0,1.0,1.0,1.0,41.0,1.0


In [16]:
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


In [17]:
# label encoding for categorical variables more than 5 categories
cat_cols_more = cat_counts[cat_counts >=5].index
cat_cols_more

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'native_country'],
      dtype='object')

In [18]:
for col in cat_cols_more:
  df[col] = le.fit_transform(df[col].astype(str))

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.30137,7,0.044302,15,0.8,4,1,1,4,1,0.02174,0.0,0.397959,33,0
1,0.452055,6,0.048238,15,0.8,2,9,0,4,1,0.0,0.0,0.122449,33,0
2,0.287671,4,0.138113,3,0.533333,0,11,1,4,1,0.0,0.0,0.397959,33,0
3,0.493151,4,0.151068,1,0.4,2,11,0,2,1,0.0,0.0,0.397959,33,0
4,0.150685,4,0.221488,15,0.8,2,2,5,2,0,0.0,0.0,0.397959,37,0


#### Pros and Cons of Encoding Techniques

One hot Encoding

Pros:

*  No ordinal relationship assumed
*  Works well for nominal data

*   Improves interpretability

Cons:


*   Increases dimensionality
*   Not suitable for high-cardinality features









Label Encoding

Pros:


*   Simple and fast
*   Memory-efficient

*   Suitable for high-cardinality features

Cons:



*   Introduces false ordering
*   Can mislead linear and distance-based models







### 3. Feature Engineering

In [20]:
def work_hours_level(hours):
    if hours <= 30:
        return "Part_Time"
    elif hours <= 45:
        return "Full_Time"
    else:
        return "Overtime"

df["work_hours_level"] = df["hours_per_week"].apply(work_hours_level)

In [22]:
df["capital_gain_flag"] = (df["capital_gain"] > 0).astype(int)

Two new features were engineered: work_hours_level to capture employment intensity and capital_gain_flag to indicate investment income. These features help the model better identify income-related patterns and improve prediction accuracy.

In [23]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,work_hours_level,capital_gain_flag
0,0.30137,7,0.044302,15,0.8,4,1,1,4,1,0.02174,0.0,0.397959,33,0,Part_Time,1
1,0.452055,6,0.048238,15,0.8,2,9,0,4,1,0.0,0.0,0.122449,33,0,Part_Time,0
2,0.287671,4,0.138113,3,0.533333,0,11,1,4,1,0.0,0.0,0.397959,33,0,Part_Time,0
3,0.493151,4,0.151068,1,0.4,2,11,0,2,1,0.0,0.0,0.397959,33,0,Part_Time,0
4,0.150685,4,0.221488,15,0.8,2,2,5,2,0,0.0,0.0,0.397959,37,0,Part_Time,0


In [24]:
# Applying log transformation
df['capital_gain_log'] = np.log1p(df['capital_gain'])

In [25]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,work_hours_level,capital_gain_flag,capital_gain_log
0,0.30137,7,0.044302,15,0.8,4,1,1,4,1,0.02174,0.0,0.397959,33,0,Part_Time,1,0.021507
1,0.452055,6,0.048238,15,0.8,2,9,0,4,1,0.0,0.0,0.122449,33,0,Part_Time,0,0.0
2,0.287671,4,0.138113,3,0.533333,0,11,1,4,1,0.0,0.0,0.397959,33,0,Part_Time,0,0.0
3,0.493151,4,0.151068,1,0.4,2,11,0,2,1,0.0,0.0,0.397959,33,0,Part_Time,0,0.0
4,0.150685,4,0.221488,15,0.8,2,2,5,2,0,0.0,0.0,0.397959,37,0,Part_Time,0,0.0


A log transformation was applied to the capital-gain feature because it is highly right-skewed. The transformation reduces the effect of extreme values, improves normality, and enhances model performance.