In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras import Sequential
from tensorflow.keras.activations import relu,sigmoid,linear
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam

In [50]:
data = pd.read_csv('data2.csv')
data

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [51]:
#no null values, duplicates can exist as data is not very specific
data.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [52]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [53]:
data['income'].replace(to_replace=['<=50K','>50K'], value=[0,1],inplace=True)
data['income'].value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['income'].replace(to_replace=['<=50K','>50K'], value=[0,1],inplace=True)
  data['income'].replace(to_replace=['<=50K','>50K'], value=[0,1],inplace=True)


income
0    24720
1     7841
Name: count, dtype: int64

In [54]:
x = data.drop(['income','education','native.country','relationship','marital.status','occupation','workclass'], axis = 1)
x

Unnamed: 0,age,fnlwgt,education.num,race,sex,capital.gain,capital.loss,hours.per.week
0,90,77053,9,White,Female,0,4356,40
1,82,132870,9,White,Female,0,4356,18
2,66,186061,10,Black,Female,0,4356,40
3,54,140359,4,White,Female,0,3900,40
4,41,264663,10,White,Female,0,3900,40
...,...,...,...,...,...,...,...,...
32556,22,310152,10,White,Male,0,0,40
32557,27,257302,12,White,Female,0,0,38
32558,40,154374,9,White,Male,0,0,40
32559,58,151910,9,White,Female,0,0,40


In [55]:
x['race'].value_counts()

race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64

In [56]:
x['race'].replace(to_replace=['White','Black','Asian-Pac-Islander','Amer-Indian-Eskimo','Other'], value=[1,2,3,4,5], inplace=True)
x['race'].value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x['race'].replace(to_replace=['White','Black','Asian-Pac-Islander','Amer-Indian-Eskimo','Other'], value=[1,2,3,4,5], inplace=True)
  x['race'].replace(to_replace=['White','Black','Asian-Pac-Islander','Amer-Indian-Eskimo','Other'], value=[1,2,3,4,5], inplace=True)


race
1    27816
2     3124
3     1039
4      311
5      271
Name: count, dtype: int64

In [57]:
x['sex'].value_counts()

sex
Male      21790
Female    10771
Name: count, dtype: int64

In [58]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)
one_hot_encoder = encoder.fit_transform(x[['sex']])
encoded_data = pd.DataFrame(one_hot_encoder,columns=encoder.get_feature_names_out(['sex']))

x_encoded = pd.concat([x, encoded_data], axis=1)
x_encoded.drop(['sex'],axis = 1,inplace=True)
x_encoded

Unnamed: 0,age,fnlwgt,education.num,race,capital.gain,capital.loss,hours.per.week,sex_Female,sex_Male
0,90,77053,9,1,0,4356,40,1.0,0.0
1,82,132870,9,1,0,4356,18,1.0,0.0
2,66,186061,10,2,0,4356,40,1.0,0.0
3,54,140359,4,1,0,3900,40,1.0,0.0
4,41,264663,10,1,0,3900,40,1.0,0.0
...,...,...,...,...,...,...,...,...,...
32556,22,310152,10,1,0,0,40,0.0,1.0
32557,27,257302,12,1,0,0,38,1.0,0.0
32558,40,154374,9,1,0,0,40,0.0,1.0
32559,58,151910,9,1,0,0,40,1.0,0.0


In [59]:
y = data['income']
y.value_counts()

income
0    24720
1     7841
Name: count, dtype: int64

In [60]:
x_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32561 non-null  int64  
 1   fnlwgt          32561 non-null  int64  
 2   education.num   32561 non-null  int64  
 3   race            32561 non-null  int64  
 4   capital.gain    32561 non-null  int64  
 5   capital.loss    32561 non-null  int64  
 6   hours.per.week  32561 non-null  int64  
 7   sex_Female      32561 non-null  float64
 8   sex_Male        32561 non-null  float64
dtypes: float64(2), int64(7)
memory usage: 2.2 MB


In [61]:
x_encoded.corr()

Unnamed: 0,age,fnlwgt,education.num,race,capital.gain,capital.loss,hours.per.week,sex_Female,sex_Male
age,1.0,-0.076646,0.036527,-0.04,0.077674,0.057775,0.068756,-0.088832,0.088832
fnlwgt,-0.076646,1.0,-0.043195,0.0005,0.000432,-0.010252,-0.018768,-0.026858,0.026858
education.num,0.036527,-0.043195,1.0,-0.03976,0.12263,0.079923,0.148123,-0.01228,0.01228
race,-0.04,0.0005,-0.03976,1.0,-0.007991,-0.01719,-0.033115,0.06784,-0.06784
capital.gain,0.077674,0.000432,0.12263,-0.007991,1.0,-0.031615,0.078409,-0.04848,0.04848
capital.loss,0.057775,-0.010252,0.079923,-0.01719,-0.031615,1.0,0.054256,-0.045567,0.045567
hours.per.week,0.068756,-0.018768,0.148123,-0.033115,0.078409,0.054256,1.0,-0.229309,0.229309
sex_Female,-0.088832,-0.026858,-0.01228,0.06784,-0.04848,-0.045567,-0.229309,1.0,-1.0
sex_Male,0.088832,0.026858,0.01228,-0.06784,0.04848,0.045567,0.229309,-1.0,1.0


In [62]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x_encoded,y,train_size=0.2,random_state=100)

In [63]:
from sklearn.preprocessing import StandardScaler

X_train_scaler = StandardScaler().fit(X_train)
X_train_norm = X_train_scaler.fit_transform(X_train)

X_test_scaler = StandardScaler().fit(X_test)
X_test_norm = X_test_scaler.fit_transform(X_test)

In [64]:
model = Sequential([
    tf.keras.Input(shape=(9,)),
    Dense(10, activation='relu', ),
    Dense(5, activation='relu'),
    Dense(1, activation='linear')
])

In [65]:
model.summary()

In [66]:
model.compile(loss=BinaryCrossentropy(from_logits=True),optimizer=Adam(),metrics=['accuracy'])

In [67]:
model.fit(X_train_norm, y_train, epochs=20)

Epoch 1/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 842us/step - accuracy: 0.7604 - loss: 0.6155
Epoch 2/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 788us/step - accuracy: 0.7733 - loss: 0.4504
Epoch 3/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 655us/step - accuracy: 0.7902 - loss: 0.4158
Epoch 4/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8081 - loss: 0.4008
Epoch 5/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 653us/step - accuracy: 0.8122 - loss: 0.3962
Epoch 6/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8167 - loss: 0.3968
Epoch 7/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8227 - loss: 0.3901
Epoch 8/20
[1m204/204[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 990us/step - accuracy: 0.8251 - loss: 0.3804
Epoch 9/20
[1m204/204[0m [3

<keras.src.callbacks.history.History at 0x11c9be0b740>

In [68]:
result = model.predict(X_test_norm)

[1m815/815[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 464us/step


In [69]:
prediction = tf.round(tf.nn.sigmoid(result))

In [70]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(prediction, y_test)
accuracy

0.8295136089677146