In [41]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [43]:
column_names = [ 'age', 'workclass','fnlwgt','education','education-num',
                'marital-status','occupation','relationship','race',
                'sex','capital-gain','capital-loss',
                'house-per-week', 'native-country',
                'class']


In [71]:
train_df=pd.read_csv('adult.data',header=None, names=column_names, na_values=' ?')
test_df=pd.read_csv('adult.test',skiprows=1,header=None, names=column_names,na_values = ' ?')

In [73]:
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,house-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [75]:
test_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,house-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K.


### 1. Data Preprocessing

In [77]:
train_df['class']=train_df['class'].str.strip()
print(train_df.head())

print("\n-----------------------------------------\n")

test_df['class']=test_df['class'].str.strip().str.rstrip('.')
print(test_df.head())

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  house-per-week  native-country  class  
0          2174             0              40   United-States  <=50K 

In [106]:
train_df.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
house-per-week       0
native-country     583
class                0
dtype: int64

In [112]:
test_df.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
house-per-week      0
native-country    274
class               0
dtype: int64

In [114]:
X_train=train_df.drop('class',axis=1)
X_test=test_df.drop('class',axis=1)
y_train=train_df['class']
y_test=test_df['class']

In [94]:
numerical_cols=X_train.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols=X_train.select_dtypes(include=['object']).columns.tolist()

In [96]:
#Handling missing values

numerical_pipeline=Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline=Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
]) 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ]
)

In [121]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

In [118]:
for name, model in models.items():
    # Create pipeline: preprocess + model
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    
    # Fit the model
    clf.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = clf.predict(X_test)
    
    # Evaluate
    print(f"\n{name} Evaluation:")
    print("Accuracy :", round(accuracy_score(y_test, y_pred), 4))
    print("Precision:", round(precision_score(y_test, y_pred, pos_label='>50K'), 4))
    print("Recall   :", round(recall_score(y_test, y_pred, pos_label='>50K'), 4))
    print("F1-Score :", round(f1_score(y_test, y_pred, pos_label='>50K'), 4))



Logistic Regression Evaluation:
Accuracy : 0.8507
Precision: 0.7282
Recall   : 0.5871
F1-Score : 0.6501

Decision Tree Evaluation:
Accuracy : 0.8149
Precision: 0.6069
Recall   : 0.6147
F1-Score : 0.6108

Random Forest Evaluation:
Accuracy : 0.8515
Precision: 0.7176
Recall   : 0.6126
F1-Score : 0.661
