In [2]:
import os
from pathlib import Path

import pandas as pd

# The file is read with header=None, so column names are supplied here in
# file order.
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"
]

# Location of the raw data file. The original machine-specific path remains
# the default; set the ADULT_DATA_PATH environment variable to run this
# notebook on another machine without editing the cell.
DATA_PATH = Path(
    os.environ.get("ADULT_DATA_PATH", "C:\\Users\\poorv\\Downloads\\adult\\adult.data")
)

# skipinitialspace=True tells pandas to skip spaces that follow the delimiter,
# so string fields are not read with a leading blank.
df = pd.read_csv(DATA_PATH, header=None, names=columns, skipinitialspace=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# (rows, columns) of the raw frame as loaded, before any cleaning.
df.shape

(32561, 15)

In [4]:
# Confirm the column labels passed via `names=` were applied.
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

In [5]:
# Per-column count of NaN cells. Only real NaN values are counted here;
# placeholder strings such as "?" are not.
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [6]:
# Number of rows per value of the `sex` column.
df["sex"].value_counts()


sex
Male      21790
Female    10771
Name: count, dtype: int64

In [7]:
# Income distribution within each sex; normalize="index" makes every row
# sum to 1, so the cells are within-group proportions.
pd.crosstab(
    index=df["sex"],
    columns=df["income"],
    normalize="index",
)

income,<=50K,>50K
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.890539,0.109461
Male,0.694263,0.305737


In [9]:
# Per-column count of cells that are exactly the string "?".
df.eq("?").sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
income               0
dtype: int64

In [10]:
import numpy as np

# Treat the literal "?" as missing by converting it to NaN.
# Note: this rebinds `df`, so outputs shown by earlier cells describe the
# frame as it was before this replacement.
df = df.replace(to_replace="?", value=np.nan)

In [11]:
# Keep only rows with no missing value in any column.
df_clean = df.dropna(axis=0, how="any")
df_clean.shape

(30162, 15)

In [20]:
# Target column on one side, every other column as features on the other.
y = df_clean["income"]
X = df_clean.drop(columns="income")

In [21]:
# Binary target: 1 where income is ">50K", 0 otherwise.
# Built from df_clean["income"] rather than from `y` itself so the cell can be
# re-run safely: the previous form called .strip() on each element of `y`,
# which raises AttributeError once `y` already holds integers.
y = df_clean["income"].str.strip().eq(">50K").astype("int64")
y.value_counts()


income
0    22654
1     7508
Name: count, dtype: int64

In [22]:
# One-hot encode the non-numeric feature columns. drop_first=True omits the
# first level of each encoded column, leaving k-1 indicators for k levels.
X_encoded = pd.get_dummies(data=X, drop_first=True)
X_encoded.shape

(30162, 96)

In [25]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the target so both parts keep the same
# class balance; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [26]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test information leaks into it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [28]:
from sklearn.linear_model import LogisticRegression

# class_weight="balanced" weights each class inversely to its frequency in
# y_train, so the minority class is not dominated during fitting.
logreg_params = {
    "solver": "liblinear",
    "max_iter": 1000,
    "class_weight": "balanced",
}
model = LogisticRegression(**logreg_params)

# Last expression of the cell: fits in place and displays the estimator.
model.fit(X_train_scaled, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'liblinear'
,max_iter,1000


In [29]:
from sklearn.metrics import accuracy_score, classification_report

# Hard class predictions of the class-weighted model on the held-out split.
y_pred = model.predict(X_test_scaled)

# Overall accuracy first, then the per-class precision / recall / F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))


Accuracy: 0.8054036134593071
              precision    recall  f1-score   support

           0       0.93      0.80      0.86      4531
           1       0.58      0.83      0.68      1502

    accuracy                           0.81      6033
   macro avg       0.75      0.81      0.77      6033
weighted avg       0.84      0.81      0.82      6033



In [30]:
# Audit frame: a copy of the test features with the true label and the
# weighted model's prediction appended. Plain arrays are assigned, so values
# are matched to rows by position.
test_df = X_test.assign(
    true_income=y_test.values,
    pred_income=y_pred,
)


In [31]:
# The encoded features no longer carry the raw `sex` label, so it is looked up
# in df_clean by the test rows' index labels and attached by position.
sex_test = df_clean["sex"].loc[X_test.index]
test_df["sex"] = sex_test.values


In [32]:
# Per-sex accuracy: share of test rows whose prediction equals the true label.
# Grouping the boolean "correct" Series directly avoids DataFrameGroupBy.apply,
# which emits a deprecation warning when it operates on the grouping column,
# and yields the same values.
is_correct = test_df["true_income"] == test_df["pred_income"]
is_correct.groupby(test_df["sex"]).mean()


  test_df.groupby("sex").apply(


sex
Female    0.900915
Male      0.759164
dtype: float64

In [33]:
# Share of each sex that the weighted model predicts as positive (">50K").
test_df["pred_income"].groupby(test_df["sex"]).mean()


sex
Female    0.147358
Male      0.458057
Name: pred_income, dtype: float64

In [34]:
# Restrict to rows whose true label is 1; the mean prediction within each sex
# is then that group's true-positive rate (recall).
actual_positive_rows = test_df.loc[test_df["true_income"].eq(1)]
actual_positive_rows.groupby("sex")["pred_income"].mean()


sex
Female    0.702128
Male      0.848461
Name: pred_income, dtype: float64

In [35]:
# Baseline for comparison: same solver and iteration cap as the weighted
# model, but with no class re-weighting.
model_unweighted = LogisticRegression(max_iter=1000, solver="liblinear").fit(
    X_train_scaled, y_train
)

# Hard class predictions of the baseline on the held-out split.
y_pred_unweighted = model_unweighted.predict(X_test_scaled)


In [36]:

# Store both models' predictions side by side under explicit names so they can
# be compared per group.
for column_name, predictions in (
    ("pred_income_weighted", y_pred),
    ("pred_income_unweighted", y_pred_unweighted),
):
    test_df[column_name] = predictions


In [37]:
# Predicted-positive rate per sex, baseline vs. class-weighted model.
prediction_columns = ["pred_income_unweighted", "pred_income_weighted"]
test_df.groupby("sex")[prediction_columns].mean()


Unnamed: 0_level_0,pred_income_unweighted,pred_income_weighted
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.090955,0.147358
Male,0.261009,0.458057


In [38]:
# True-positive rate (recall) per sex for both models: restrict to rows whose
# true label is 1, then average each model's predictions within each sex.
recall_columns = ["pred_income_unweighted", "pred_income_weighted"]
true_high_income = test_df.loc[test_df["true_income"] == 1]
true_high_income.groupby("sex")[recall_columns].mean()


Unnamed: 0_level_0,pred_income_unweighted,pred_income_weighted
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.548936,0.702128
Male,0.616417,0.848461
