# LIBRARIES USED

In [11]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
from sklearn.linear_model import LogisticRegression

from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
get_ipython().run_line_magic('matplotlib', 'inline')

# DATA PROCESSING

In [13]:
# Location of the balanced income dataset. Set the INCOME_DATA_PATH environment
# variable to point at a copy elsewhere; when it is unset the original
# machine-specific path is used, so existing runs behave exactly as before.
DATA_PATH = Path(
    os.environ.get(
        'INCOME_DATA_PATH',
        'C:/Users/Asus/Downloads/SWAGAT_INPUT_income_dataset_balanced.csv',
    )
)
full_data = pd.read_csv(DATA_PATH)

In [14]:
# Remove the 'Id' column before any further processing.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises KeyError.
full_data = full_data.drop(columns='Id', errors='ignore')
full_data.head()

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,137594,52,10,single,rented,yes,Technical_writer,Sagar,Madhya_Pradesh,7,10,1
1,46750,33,13,single,rented,yes,Chef,Morbi,Gujarat,10,11,1
2,624985,47,11,single,rented,no,Software_Developer,Varanasi,Uttar_Pradesh,6,11,1
3,233888,64,1,single,rented,no,Psychologist,Tumkur,Karnataka,1,10,1
4,979926,61,14,single,rented,yes,Police_officer,Indore,Madhya_Pradesh,7,11,1


In [15]:
# (rows, columns) of the frame after the 'Id' column has been dropped.
full_data.shape

(34904, 12)

In [16]:
# Strip underscores from every string value in the frame
# (e.g. 'Madhya_Pradesh' -> 'MadhyaPradesh'); non-string cells pass through
# unchanged. Column names are not touched.
# Column-wise Series.map is used because DataFrame.applymap is deprecated in
# recent pandas releases and emits a FutureWarning.
# NOTE(review): values that differ only by spaces or footnote markers
# (e.g. 'Uttar Pradesh', 'UttarPradesh[5]') remain distinct after this step
# and end up as separate one-hot columns.
full_data = full_data.apply(
    lambda column: column.map(
        lambda value: value.replace('_', '') if isinstance(value, str) else value
    )
)


# ONE HOT ENCODING

In [17]:
# Categorical columns that are expanded into indicator columns.
cols_to_encode = [
    'Married/Single',
    'House_Ownership',
    'Car_Ownership',
    'Profession',
    'CITY',
    'STATE',
]

# drop_first=True omits the first level of every encoded column; the result is
# cast to integers so the indicators are stored as 0/1.
dummies = pd.get_dummies(full_data[cols_to_encode], drop_first=True).astype(int)
dummies.shape

(34904, 445)

In [18]:
# Preview the first rows of the one-hot encoded indicator columns.
dummies.head()

Unnamed: 0,Married/Single_single,House_Ownership_owned,House_Ownership_rented,Car_Ownership_yes,Profession_Airtrafficcontroller,Profession_Analyst,Profession_Architect,Profession_Army officer,Profession_Armyofficer,Profession_Artist,...,STATE_TamilNadu,STATE_Telangana,STATE_Tripura,STATE_Uttar Pradesh,STATE_Uttar Pradesh[5],STATE_UttarPradesh,STATE_UttarPradesh[5],STATE_Uttarakhand,STATE_West Bengal,STATE_WestBengal
0,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# The raw categorical columns are now represented by `dummies`, so remove them
# from the main frame, leaving the numeric features and the 'Risk_Flag' label.
full_data.drop(columns=cols_to_encode, inplace=True)
full_data.head(5)

Unnamed: 0,Income,Age,Experience,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,137594,52,10,7,10,1
1,46750,33,13,10,11,1
2,624985,47,11,6,11,1
3,233888,64,1,1,10,1
4,979926,61,14,7,11,1


# DATA NORMALIZATION

In [20]:
# Rescale every feature column (everything except the 'Risk_Flag' label) with
# a MinMaxScaler.
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split performed later in the notebook.
scale = MinMaxScaler()
feature_frame = full_data.drop(columns='Risk_Flag')
scalled = scale.fit_transform(feature_frame)

In [21]:
# Write the scaled values back into the frame, one column at a time.
# The column list is derived the same way the scaler input was built (every
# column except 'Risk_Flag', in original order), so the mapping between array
# columns and frame columns holds wherever 'Risk_Flag' sits in the frame,
# rather than assuming it is the last column.
feature_cols = full_data.columns.drop('Risk_Flag')
assert scalled.shape[1] == len(feature_cols), "scaled array and feature columns are out of sync"
for position, col in enumerate(feature_cols):
    full_data[col] = scalled[:, position]

In [22]:
# Preview the frame after the feature columns were overwritten with their scaled values.
full_data.head()

Unnamed: 0,Income,Age,Experience,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,0.012743,0.534483,0.5,0.5,0.0,1
1,0.003648,0.206897,0.65,0.714286,0.25,1
2,0.061536,0.448276,0.55,0.428571,0.25,1
3,0.022383,0.741379,0.05,0.071429,0.0,1
4,0.09707,0.689655,0.7,0.5,0.25,1


# CHECK IF DATA HAS DUPLICATE INDEX LABELS

In [23]:
# Report, for each frame in turn, whether any index label occurs more than once.
for frame in (full_data, dummies):
    print(frame.index.duplicated().any())

False
False


In [24]:
# Give both frames a fresh 0..n-1 index so that the column-wise concat further
# down aligns rows one-to-one.
full_data = full_data.reset_index(drop=True)
# Rebind `dummies` itself: it is the frame that is re-checked and concatenated
# later. Previously the reset copy was stored only under `extra`, which is
# never used, leaving `dummies` untouched.
dummies = dummies.reset_index(drop=True)
extra = dummies  # kept so any remaining reference to `extra` still resolves

In [25]:
# Repeat the duplicate-index check on both frames, one result per line.
print(
    full_data.index.duplicated().any(),
    dummies.index.duplicated().any(),
    sep='\n',
)

False
False


# RESTORING FULL DATA TABLE

In [26]:
# Join the numeric features and label with the one-hot indicator columns,
# side by side (rows are matched on index labels).
full_data = pd.concat((full_data, dummies), axis='columns')
full_data.head(5)

Unnamed: 0,Income,Age,Experience,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,Married/Single_single,House_Ownership_owned,House_Ownership_rented,Car_Ownership_yes,...,STATE_TamilNadu,STATE_Telangana,STATE_Tripura,STATE_Uttar Pradesh,STATE_Uttar Pradesh[5],STATE_UttarPradesh,STATE_UttarPradesh[5],STATE_Uttarakhand,STATE_West Bengal,STATE_WestBengal
0,0.012743,0.534483,0.5,0.5,0.0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0.003648,0.206897,0.65,0.714286,0.25,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0.061536,0.448276,0.55,0.428571,0.25,1,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,0.022383,0.741379,0.05,0.071429,0.0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0.09707,0.689655,0.7,0.5,0.25,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0


# BALANCING DATA

In [27]:
# Number of rows per class of the 'Risk_Flag' label.
full_data['Risk_Flag'].value_counts()

Risk_Flag
1    17452
0    17452
Name: count, dtype: int64

In [28]:
# Draw the same number of rows from each class so the two are equally
# represented. The sample size is the size of the smaller class instead of a
# hard-coded count, so the cell keeps working if the class counts change, and
# a fixed random_state (the same seed the train/test split uses) makes the
# draw reproducible across runs.
class0_rows = full_data[full_data['Risk_Flag'] == 0]
class1_rows = full_data[full_data['Risk_Flag'] == 1]
n_per_class = min(len(class0_rows), len(class1_rows))

class0 = class0_rows.sample(n_per_class, random_state=1)
class1 = class1_rows.sample(n_per_class, random_state=1)

In [29]:
# Stack the two per-class samples on top of each other (class 0 rows first).
full_data2 = pd.concat((class0, class1), axis='index')
full_data2.shape

(34904, 451)

In [30]:
# Separate the label column from the feature matrix.
y = full_data2['Risk_Flag']
x = full_data2.drop(columns='Risk_Flag')
(x.shape, y.shape)

((34904, 450), (34904,))

# TRAIN AND TEST DATA SPLIT

In [31]:
# Shuffle and split into train and test parts using the library's default
# proportions; random_state=1 pins the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    random_state=1,
)
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((26178, 450), (8726, 450), (26178,), (8726,))

# EXECUTING THE MODEL

In [32]:
# NOTE(review): this binds `model` to a list holding the LogisticRegression
# class itself (not an instantiated or fitted estimator). The name is rebound
# to a Keras network in the following code cell, so this value is never used.
# Presumably a leftover from an abandoned baseline; confirm and remove.
model = [LogisticRegression]

In [35]:
# Binary classifier: one hidden ReLU layer followed by a single sigmoid unit
# that outputs the predicted probability of Risk_Flag == 1.
# The input width is read from the training matrix instead of being
# hard-coded, so the network stays valid if the number of one-hot columns
# changes, and it is declared through an explicit Input layer rather than the
# `input_shape=` argument on the first Dense layer.
n_features = X_train.shape[1]

model = keras.Sequential([
    keras.Input(shape=(n_features,)),
    keras.layers.Dense(450, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as an additional metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [36]:
# Train for 150 passes over the training split in mini-batches of 1024 rows.
# No validation data is passed, so the log reports training metrics only.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=1024,
    epochs=150,
)

Epoch 1/150
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6003 - loss: 0.6770
Epoch 2/150
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6984 - loss: 0.6166
Epoch 3/150
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7151 - loss: 0.5773
Epoch 4/150
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7241 - loss: 0.5557
Epoch 5/150
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7345 - loss: 0.5437
Epoch 6/150
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7419 - loss: 0.5317
Epoch 7/150
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7524 - loss: 0.5172
Epoch 8/150
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7601 - loss: 0.5054
Epoch 9/150
[1m26/26[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x13274036410>

# EVALUATING THE ACCURACY

In [37]:
# Score the trained network on the held-out split; the result is the loss
# followed by the metrics given to compile() (accuracy here).
model.evaluate(x=X_test, y=Y_test)

[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 819us/step - accuracy: 0.7780 - loss: 0.7677


[0.775320291519165, 0.7763007283210754]