In [1]:
import pandas as pd

In [14]:
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
    'hours-per-week', 'native-country', 'target'
]
data = pd.read_csv('adult.data',header=None, names=column_names)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [15]:
# Basic data exploration
print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  target          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


In [16]:
print(data.describe())


                age        fnlwgt  education-num  capital-gain  capital-loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours-per-week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  


In [17]:
print(data.isnull().sum())

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
target            0
dtype: int64


In [18]:
# Handling missing values
data = data.replace(' ?', pd.NA).dropna()

# Convert the target variable to binary
data['target'] = data['target'].apply(lambda x: 1 if x == ' >50K' else 0)

In [19]:
unique_values = data['target'].unique()
unique_values


array([0, 1])

In [20]:
# Encode categorical variables
data_encoded = pd.get_dummies(data, drop_first=True)

# Check correlations again
corr = data_encoded.corr()
target_corr = corr['target'].sort_values(ascending=False)

In [21]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Split the dataset into training and testing sets
X = data_encoded.drop('target', axis=1)
y = data_encoded['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a RandomForestClassifier
model = RandomForestClassifier(random_state=42)



In [22]:
# Hyperparameter tuning using GridSearchCV
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

random_search = RandomizedSearchCV(model, param_dist, n_iter=10, cv=3, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)



In [23]:
# Best parameters
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_test)
classification_report_result = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

best_params, classification_report_result, accuracy

({'n_estimators': 100, 'min_samples_split': 5, 'max_depth': 20},
 '              precision    recall  f1-score   support\n\n           0       0.88      0.95      0.91      4503\n           1       0.79      0.62      0.69      1530\n\n    accuracy                           0.86      6033\n   macro avg       0.84      0.78      0.80      6033\nweighted avg       0.86      0.86      0.86      6033\n',
 0.8622575832918946)

In [24]:
import joblib

joblib.dump(best_model, 'model.joblib')

['model.joblib']

In [25]:
!pip install sagemaker pandas scikit-learn joblib



In [32]:
import joblib

joblib.dump(best_model, 'model.joblib')

['model.joblib']

In [33]:
import tarfile

with tarfile.open('model.tar.gz', mode='w:gz') as archive:
    archive.add('model.joblib')

In [34]:
import sagemaker

session = sagemaker.Session()
bucket = session.default_bucket()

model_tar_path = session.upload_data('model.tar.gz', bucket=bucket)
print(f"Model uploaded to: {model_tar_path}")

Model uploaded to: s3://sagemaker-ap-south-1-024848449448/data/model.tar.gz


In [None]:
from sagemaker.sklearn.model import SKLearnModel

role = sagemaker.get_execution_role()

model = SKLearnModel(
    model_data=model_tar_path,
    role=role,
    framework_version='0.23-1'
)

# Deploy the model as a SageMaker endpoint within free tier limits
predictor = model.deploy(instance_type='ml.t2.medium', initial_instance_count=1)


In [None]:
# Test the deployed model
import json
import numpy as np

# Prepare a single sample from X_test
sample = X_test[0].reshape(1, -1)
sample_json = json.dumps(sample.tolist())

# Predict using the endpoint
response = predictor.predict(sample_json)
prediction = json.loads(response)
print(f"Prediction: {prediction}")