In [2]:
import pandas as pd
# Read the CSV file into the working DataFrame `data`.
csv_path = '/content/adult_with_headers.csv'
data = pd.read_csv(csv_path)

In [3]:
# Display basic information about the frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare; wrapping it in print() would emit an extra "None" line.
data.info()
# Summary statistics of the numeric columns.
print(data.describe())
# Number of NaN entries per column.
print(data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None
                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  3

In [4]:
# Replace '?' placeholders with a real missing-value marker.
# The anchored pattern matches a cell that is just '?', optionally surrounded by
# whitespace (e.g. ' ?'); an exact-match replace('?') would silently skip padded
# values. NOTE(review): the raw file presumably pads values after each comma -- confirm.
data = data.replace(r'^\s*\?\s*$', pd.NA, regex=True)

# Drop every row that now holds at least one missing value
# (imputation would be the alternative if losing rows is not acceptable).
data = data.dropna()


In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [6]:
# Names of the columns stored as 64-bit integers or floats; the scaling cells operate on these.
numeric_dtypes = ['int64', 'float64']
numerical_features = data.select_dtypes(include=numeric_dtypes).columns


In [7]:
# Standard scaling (per-column zero mean, unit variance).
# The result is kept in its own frame instead of being written back into `data`:
# the Min-Max step that follows rescales these same columns of `data`, so
# overwriting them here would only stack one scaler on top of the other and
# throw the standardized values away.
standard_scaler = StandardScaler()
data_standard_scaled = data.copy()
data_standard_scaled[numerical_features] = standard_scaler.fit_transform(data[numerical_features])


In [8]:
# Min-Max scaling: learn each numeric column's range, then rescale the column
# with it. The scaled values overwrite the numeric columns of `data` in place.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(data[numerical_features])
min_max_values = min_max_scaler.transform(data[numerical_features])
data[numerical_features] = min_max_values

In [9]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [10]:
# Names of the columns with object dtype; these are the candidates for encoding.
categorical_features = data.select_dtypes(include='object').columns


In [11]:

# One-hot encode every categorical column that has fewer than five distinct
# values. drop_first=True leaves out the first level of each encoded column.
low_cardinality = [col for col in categorical_features if data[col].nunique() < 5]
if low_cardinality:
    data = pd.get_dummies(data, columns=low_cardinality, drop_first=True)

In [15]:
# Re-derive the categorical columns: the one-hot step removed the columns it encoded,
# so only the remaining object-dtype columns are considered here.
categorical_features = data.select_dtypes(include=['object']).columns

# Label-encode the categorical columns with five or more distinct values.
# A fresh encoder is fitted per column and kept in `label_encoders`, so each
# column's integer codes can still be mapped back to their labels; refitting a
# single shared encoder would keep only the mapping of the last column.
label_encoders = {}
for feature in categorical_features:
    if data[feature].nunique() >= 5:
        label_encoder = LabelEncoder()
        data[feature] = label_encoder.fit_transform(data[feature])
        label_encoders[feature] = label_encoder

In [19]:
# Example engineered features.
# Ratio of age to fnlwgt. NOTE: despite the column name, the denominator is
# fnlwgt, not income; the name is kept so existing references still work.
# fnlwgt has been rescaled by the earlier scaling cells and can be exactly 0,
# so zero denominators are masked to NaN instead of producing +/-inf.
fnlwgt_nonzero = data['fnlwgt'].where(data['fnlwgt'] != 0)
data['age_income_ratio'] = data['age'] / fnlwgt_nonzero
# Square of education_num, intended to capture a non-linear effect.
data['education_num_squared'] = data['education_num'] ** 2

In [20]:
import numpy as np

# Log transformation of fnlwgt, computed as log(1 + x) so that x == 0 maps to 0.
# np.log1p is the dedicated routine for this and is more accurate than
# np.log(x + 1) when x is close to zero.
data['log_fnlwgt'] = np.log1p(data['fnlwgt'])


In [22]:
from sklearn.ensemble import IsolationForest

# Fit an Isolation Forest on the numeric columns and label each row.
# contamination=0.1 sets the expected share of outliers to 10%.
# random_state is fixed because the forest is built from random sub-samples;
# without a seed the rows flagged as outliers change from run to run.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
# fit_predict returns one label per row; -1 marks an outlier.
outliers = iso_forest.fit_predict(data[numerical_features])


In [23]:
# Keep only the rows whose Isolation Forest label is not -1 (the outlier label).
is_inlier = outliers != -1
data = data[is_inlier]


In [1]:
!pip install pandas==1.5.3 # Install pandas version 1.5.3
!pip install --upgrade ppscore # Ensure ppscore is installed with the correct pandas version



In [8]:
import pandas as pd
import ppscore as pps

# Reload the CSV from disk. This rebinds `data`, replacing whatever frame the
# earlier cells left in that name.
data = pd.read_csv('/content/adult_with_headers.csv')

# Column pair for the single-score lookup: feature_x is the predictor, feature_y the target.
feature_x, feature_y = 'age', 'income'

# Predictive Power Score for every ordered pair of columns ...
pps_matrix = pps.matrix(data)
# ... and for the one pair selected above.
pps_score = pps.score(data, x=feature_x, y=feature_y)

print(f"PPS matrix:\n{pps_matrix}\n")
print(f"PPS score for {feature_x} predicting {feature_y}: {pps_score}")



PPS matrix:
          x               y   ppscore            case  is_valid_score  \
0       age             age  1.000000  predict_itself            True   
1       age       workclass  0.011232  classification            True   
2       age          fnlwgt  0.000000      regression            True   
3       age       education  0.052315  classification            True   
4       age   education_num  0.000000      regression            True   
..      ...             ...       ...             ...             ...   
220  income    capital_gain  0.000000      regression            True   
221  income    capital_loss  0.000000      regression            True   
222  income  hours_per_week  0.000000      regression            True   
223  income  native_country  0.000000  classification            True   
224  income          income  1.000000  predict_itself            True   

                  metric  baseline_score   model_score  \
0                   None        0.000000      1.00000

