In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('adult_with_headers.csv')

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Basic data exploration: summary statistics, missing values, and data types
summary_stats = df.describe(include='all')
missing_values = df.isnull().sum()
data_types = df.dtypes

In [7]:
# Display the results
print("Summary Statistics:\n", summary_stats)
print("\nMissing Values:\n", missing_values)
print("\nData Types:\n", data_types)


Summary Statistics:
                  age workclass        fnlwgt education  education_num  \
count   32561.000000     32561  3.256100e+04     32561   32561.000000   
unique           NaN         9           NaN        16            NaN   
top              NaN   Private           NaN   HS-grad            NaN   
freq             NaN     22696           NaN     10501            NaN   
mean       38.581647       NaN  1.897784e+05       NaN      10.080679   
std        13.640433       NaN  1.055500e+05       NaN       2.572720   
min        17.000000       NaN  1.228500e+04       NaN       1.000000   
25%        28.000000       NaN  1.178270e+05       NaN       9.000000   
50%        37.000000       NaN  1.783560e+05       NaN      10.000000   
75%        48.000000       NaN  2.370510e+05       NaN      12.000000   
max        90.000000       NaN  1.484705e+06       NaN      16.000000   

             marital_status       occupation relationship    race    sex  \
count                 3256

In [9]:
# Impute missing values (Example: filling with mean for numerical and mode for categorical)
#df.fillna(df.mean(), inplace=True)
#df.fillna(df.mode().iloc[0], inplace=True)

In [11]:
# Or drop rows with missing values
df.dropna(inplace=True)

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Select numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Standard Scaling
scaler_standard = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[numerical_cols] = scaler_standard.fit_transform(df[numerical_cols])

In [15]:
# Min-Max Scaling
scaler_minmax = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[numerical_cols] = scaler_minmax.fit_transform(df[numerical_cols])

In [17]:
# Display the scaled data
print("Standard Scaled Data:\n", df_standard_scaled.head())
print("\nMin-Max Scaled Data:\n", df_minmax_scaled.head())

Standard Scaled Data:
         age          workclass    fnlwgt   education  education_num  \
0  0.030671          State-gov -1.063611   Bachelors       1.134739   
1  0.837109   Self-emp-not-inc -1.008707   Bachelors       1.134739   
2 -0.042642            Private  0.245079     HS-grad      -0.420060   
3  1.057047            Private  0.425801        11th      -1.197459   
4 -0.775768            Private  1.408176   Bachelors       1.134739   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0  

In [19]:
# One-Hot Encoding
df_onehot = pd.get_dummies(df, columns=['occupation','race'], drop_first=True)

In [21]:
df_onehot

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,relationship,sex,capital_gain,capital_loss,...,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White
0,39,State-gov,77516,Bachelors,13,Never-married,Not-in-family,Male,2174,0,...,False,False,False,False,False,False,False,False,False,True
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Husband,Male,0,0,...,False,False,False,False,False,False,False,False,False,True
2,38,Private,215646,HS-grad,9,Divorced,Not-in-family,Male,0,0,...,False,False,False,False,False,False,False,False,False,True
3,53,Private,234721,11th,7,Married-civ-spouse,Husband,Male,0,0,...,False,False,False,False,False,False,False,True,False,False
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Wife,Female,0,0,...,False,True,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Wife,Female,0,0,...,False,False,False,False,True,False,False,False,False,True
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Husband,Male,0,0,...,False,False,False,False,False,False,False,False,False,True
32558,58,Private,151910,HS-grad,9,Widowed,Unmarried,Female,0,0,...,False,False,False,False,False,False,False,False,False,True
32559,22,Private,201490,HS-grad,9,Never-married,Own-child,Male,0,0,...,False,False,False,False,False,False,False,False,False,True


In [23]:
from sklearn.preprocessing import LabelEncoder

# Label Encoding
label_encoder = LabelEncoder()
df[df_onehot] = label_encoder.fit_transform(df['occupation','race'])

KeyError: ('occupation', 'race')

In [25]:
# Example: Creating a new feature by combining two existing ones
df['new_feature'] = df['feature1'] * df['feature2']

# Another example: Creating a binary feature
df['new_binary_feature'] = df['some_feature'].apply(lambda x: 1 if x > threshold_value else 0)

KeyError: 'feature1'

In [27]:
import numpy as np

# Log transformation of a skewed numerical feature
df['log_transformed_feature'] = np.log1p(df['skewed_feature'])

KeyError: 'skewed_feature'

In [29]:
import ppscore as pps

# Apply PPS to find relationships between features
pps_matrix = pps.matrix(df)

# Display the PPS matrix
print(pps_matrix)

ModuleNotFoundError: No module named 'ppscore'