In [3]:
# Colab-only step: prompt for a local file and copy it into the runtime's
# working directory so the later read_csv call can find it.
from google.colab import files

uploaded = files.upload()

Saving adult_with_headers.csv to adult_with_headers.csv


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
import ppscore as pps  # Install using: pip install ppscore

# Load the dataset
file_path = "/content/adult_with_headers.csv"  # Update with the correct file path
df = pd.read_csv(file_path)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Dataset Info:")
df.info()

print("\nSummary Statistics:")
print(df.describe())

# Per-column count of NaN cells before any cleaning.
print("\nMissing Values:")
print(df.isnull().sum())


# Fill missing values with mode for categorical and median for numerical columns.
#
# The filled Series is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form mutates an
# intermediate Series (chained assignment), which pandas warns about and which
# leaves `df` unchanged when Copy-on-Write is enabled.
#
# NOTE(review): only real NaN cells are handled here. If the CSV marks missing
# entries with a placeholder string (e.g. "?"), they will not be filled —
# confirm against the raw file.
for col in df.columns:
    if not df[col].isnull().any():
        continue  # nothing to fill in this column

    if df[col].dtype == "object":  # Categorical columns
        modes = df[col].mode()
        if modes.empty:
            # Column is entirely NaN, so there is no mode to fill with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])
    else:  # Numerical columns
        df[col] = df[col].fillna(df[col].median())

print("\nMissing Values After Handling:")
print(df.isnull().sum())


# Numeric columns of the loaded data; reused later for outlier detection.
num_features = df.select_dtypes(include=['int64', 'float64']).columns

# ---------------------------------------------------------------------------
# Feature engineering
# ---------------------------------------------------------------------------
# Derived features are computed from the RAW values, before any scaling.
# Computing them after min-max scaling would subtract / log-transform numbers
# squeezed into [0, 1], which no longer carry their original units.

# Creating a new feature: Work Experience (years worked)
# NOTE(review): this treats `education_num` as a number of years — confirm.
df["work_experience"] = df["age"] - df["education_num"]

# Log transformation on "capital_gain" to reduce skewness
df["capital_gain_log"] = np.log1p(df["capital_gain"])

print("\nNew Features Added:")
print(df[["work_experience", "capital_gain", "capital_gain_log"]].head())

# ---------------------------------------------------------------------------
# Scaling
# ---------------------------------------------------------------------------
# Standard Scaling
# Kept in its own frame. Writing it into `df` and then min-max scaling the
# same columns would discard it: min-max scaling of standardized data gives the
# same result as min-max scaling of the raw data, since standardization is a
# positive affine transform.
scaler_standard = StandardScaler()
df_standard_scaled = pd.DataFrame(
    scaler_standard.fit_transform(df[num_features]),
    columns=num_features,
    index=df.index,
)

# Min-Max Scaling (this is the version stored in `df` and used downstream)
scaler_minmax = MinMaxScaler()
df[num_features] = scaler_minmax.fit_transform(df[num_features])

print("\nNumerical Features After Scaling:")
print(df[num_features].head())

# ---------------------------------------------------------------------------
# Encoding
# ---------------------------------------------------------------------------
# One-Hot Encoding for categorical variables with ≤5 unique values.
# Only object-dtype columns are considered, so a numeric column that happens
# to have few distinct values is never expanded into dummies.
# NOTE(review): a low-cardinality target column (e.g. `income`) is one-hot
# encoded here as well — confirm that is intended.
categorical_cols = df.select_dtypes(include=["object"]).columns
one_hot_cols = [col for col in categorical_cols if df[col].nunique() <= 5]
df = pd.get_dummies(df, columns=one_hot_cols)

# Label Encoding for categorical variables with >5 unique values
# (whatever object columns remain after one-hot encoding). The single encoder
# is refit per column, so afterwards it only holds the last column's classes.
label_enc = LabelEncoder()
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = label_enc.fit_transform(df[col])

print("\nCategorical Features Encoded Successfully.")


# Outlier detection: fit an Isolation Forest on the numeric columns, with a
# fixed seed and an assumed contamination share of 5%.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(df[num_features])

# Keep only the rows the model labelled 1 (scikit-learn's inlier label).
inlier_mask = outliers == 1
df = df[inlier_mask]

# Predictive Power Score for every ordered pair of columns in the cleaned frame.
pps_matrix = pps.matrix(df)
print("\nPPS Score Analysis:", pps_matrix, sep="\n")

# Persist the processed frame, without the index column.
df.to_csv("processed_adult_dataset.csv", index=False)
print("\nProcessed dataset saved successfully!")


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None

Summary Statistics:
                age        fnlwgt  education_num  cap

In [1]:
pip install pandas numpy scikit-learn ppscore


