In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Read the raw records from adult.csv (path is relative to the working directory).
DATASET_PATH = "adult.csv"
data = pd.read_csv(DATASET_PATH)

In [4]:
# Display the column labels of the loaded frame.
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [5]:
# Report the observed age range.
# Series.min()/max() are vectorised and skip NaN, unlike the builtin min()/max()
# applied to a Python set built from the whole column.
ages = data['age']
print('age ->', ages.min(), '-', ages.max())

age -> 17 - 90


In [6]:
# Show the distinct values of three categorical columns.
# Columns are looked up by name rather than by position, so the printed label
# always matches the column being shown even if the column order changes.
for col in ('workclass', 'occupation', 'native-country'):
    print(f'{col} ->', set(data[col]))

workclass -> {'Never-worked', 'Local-gov', 'State-gov', 'Federal-gov', 'Private', 'Self-emp-inc', '?', 'Without-pay', 'Self-emp-not-inc'}
occupation -> {'Machine-op-inspct', 'Prof-specialty', 'Sales', 'Armed-Forces', 'Tech-support', 'Transport-moving', 'Other-service', '?', 'Craft-repair', 'Farming-fishing', 'Handlers-cleaners', 'Exec-managerial', 'Adm-clerical', 'Protective-serv', 'Priv-house-serv'}
native-country -> {'Mexico', 'Holand-Netherlands', '?', 'Scotland', 'India', 'England', 'Dominican-Republic', 'Germany', 'Columbia', 'Hong', 'Philippines', 'Japan', 'Trinadad&Tobago', 'Peru', 'Taiwan', 'Cuba', 'Italy', 'Hungary', 'Outlying-US(Guam-USVI-etc)', 'Haiti', 'Canada', 'South', 'Nicaragua', 'Guatemala', 'Iran', 'Poland', 'Honduras', 'Thailand', 'Puerto-Rico', 'Laos', 'Greece', 'Vietnam', 'France', 'United-States', 'Cambodia', 'Portugal', 'China', 'Jamaica', 'El-Salvador', 'Yugoslavia', 'Ecuador', 'Ireland'}


In [7]:
# Handling missing values (?)
# '?' is used as the missing-value placeholder: turn it into NaN everywhere, then
# label the gaps in the listed categorical columns as 'Others'.
#
# The filled columns are assigned back to the frame instead of calling
# `data[col].fillna(..., inplace=True)`: that chained in-place form operates on a
# temporary Series and leaves `data` unchanged under pandas Copy-on-Write.
missing_value_cols = ['workclass', 'occupation', 'native-country']
data = data.replace('?', np.nan)
data[missing_value_cols] = data[missing_value_cols].fillna('Others')

In [8]:
# Removing outliers based on age
# Keep only the rows whose age lies in the closed interval [17, 75].
age = data['age']
within_age_range = (age >= 17) & (age <= 75)
data = data.loc[within_age_range]

In [9]:
# Removing low-count categories
# Build one mask per column, then keep the rows that match neither of them.
rare_workclass = data['workclass'].isin(['Without-pay', 'Never-worked'])
rare_education = data['education'].isin(['1st-4th', '5th-6th', 'Preschool'])
data = data[~rare_workclass & ~rare_education]

In [10]:
# 'educational-num' provides similar information, so the textual 'education'
# column is dropped as redundant.
data = data.drop(columns=['education'])

In [11]:
# Encoding categorical variables
# Every column gets its OWN LabelEncoder. Re-fitting a single shared encoder in the
# loop overwrites its `classes_` on each iteration, so only the mapping of the last
# column would survive and the other columns could not be encoded or decoded later.
categorical_cols = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
label_encoders = {}
for col in categorical_cols:
    col_encoder = LabelEncoder()
    data[col] = col_encoder.fit_transform(data[col].astype(str))
    label_encoders[col] = col_encoder

# Persist the complete column -> fitted encoder mapping for use at inference time.
joblib.dump(label_encoders, 'label_encoders.pkl')

# Backward compatibility: `encoder` still refers to the encoder fitted on the last
# entry of `categorical_cols`, exactly as it did after the original loop.
encoder = label_encoders[categorical_cols[-1]]

In [12]:
# Persist the fitted LabelEncoder to disk.
# NOTE(review): `encoder` holds only the mapping fitted for the final entry of
# `categorical_cols`; it cannot encode or decode the other categorical columns.
joblib.dump(value=encoder, filename='label_encoder.pkl')

['label_encoder.pkl']

In [13]:
# Splitting features and target
income_codes = {'<=50K': 0, '>50K': 1}

# Fail fast on labels outside the mapping: Series.map would otherwise turn them
# into NaN silently and the problem would only surface later, during training.
unexpected_labels = set(data['income']) - set(income_codes)
if unexpected_labels:
    raise ValueError(f"Unmapped income labels: {sorted(map(str, unexpected_labels))}")

X = data.drop(columns=['income'])
y = data['income'].map(income_codes)  # Convert to binary (0 / 1)

In [14]:
# Scaling features with a min-max scaler
# NOTE(review): every column of X is rescaled (not only the originally numeric
# ones), and the scaler is fitted on all rows before the train/test split further
# down — confirm that this is intended.
scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [15]:
# Persist the fitted scaler to disk.
joblib.dump(value=scaler, filename='scaler.pkl')


['scaler.pkl']

In [16]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=23,
    stratify=y,
)


In [17]:
# Hyper-parameter search for a Random Forest: exhaustive grid, 5-fold
# cross-validation, candidates ranked by accuracy, all CPU cores used.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
rf = RandomForestClassifier(random_state=23)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)


0,1,2
,estimator,RandomForestC...ndom_state=23)
,param_grid,"{'max_depth': [10, 20, ...], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 5], 'n_estimators': [100, 200]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,20
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [18]:
# Best model
# GridSearchCV refits the winning configuration on the whole training set and
# exposes it as `best_estimator_`.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate on the held-out test set so the predictions are actually used and the
# quality of the model that gets saved is visible in the notebook.
print('Best parameters:', grid_search.best_params_)
print('Test accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [19]:
# Persist the selected model to disk.
joblib.dump(value=best_model, filename='random_forest_model.pkl')

['random_forest_model.pkl']