In [1]:
from ucimlrepo import fetch_ucirepo
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

# Fetch dataset id 2 from the UCI ML repository and take its feature table,
# turning the literal '?' placeholder into NaN so the imputers further down
# can see those cells as missing.
adult = fetch_ucirepo(id=2)
x = adult.data.features.replace('?', np.nan)

# Build the binary target from the 'income' column: remove '.' characters
# from both ends of each label, then encode '>50K' as 1 and every other
# value as 0.
income_labels = adult.data.targets['income'].str.strip('.')
y = (income_labels == '>50K').astype(int)

# Column groups, one per preprocessing branch. Columns that appear in none of
# these lists are dropped by the ColumnTransformer below (remainder='drop').
onehot_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
ordinal_features = ['education']
numerical_features = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']

# One-hot branch: fill missing cells with the column's most frequent value,
# then one-hot encode into a dense array. Categories not seen during fit are
# encoded as an all-zero row instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Education branch: fill missing cells with the most frequent value, then map
# each level to its position in the explicit list below ('Preschool' -> 0,
# ..., 'Doctorate' -> 15). Values outside the list are encoded as -1.
education_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(
        categories= [['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th','HS-grad', 'Some-college', 'Assoc-voc', 'Assoc-acdm', 'Bachelors', 'Masters', 'Prof-school', 'Doctorate']],
        handle_unknown='use_encoded_value',
        unknown_value=-1
        ))
])

# Numeric branch: median-impute, then standardize.
# The imputer is here because StandardScaler passes NaN through unchanged and
# LogisticRegression rejects NaN input, so without it a single missing numeric
# value at predict time would make the whole pipeline raise, even though
# missing categorical values are handled. On columns with no missing entries
# the imputer leaves the values unchanged.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Route each column group through its branch and concatenate the outputs.
preprocesser = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, onehot_features),
        ('education', education_transformer, ordinal_features)
    ],
    remainder = 'drop'
)

# Logistic regression with class weights set to 'balanced' and the solver
# iteration cap raised to 1000.
lr = LogisticRegression(max_iter=1000, class_weight='balanced')

# Single estimator that applies the column preprocessing and then the
# classifier, so raw feature rows can be passed straight to fit/predict.
full_pipeline = Pipeline([
    ('preprocesser', preprocesser),
    ('classifier', lr),
])

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing and classifier together on the training portion only.
full_pipeline.fit(x_train, y_train)

# Serialize the fitted pipeline to 'model.pkl' in the working directory.
with open('model.pkl', mode='wb') as file:
    pickle.dump(full_pipeline, file)
print("model is saved.")


model is saved.
