### Importing required libraries:

In [215]:
import pandas as pd
import numpy as np

import tensorflow as tf
import keras

from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.compose import ColumnTransformer

### Reading csv, Dropping unwanted columns and Mapping Occupation:

In [216]:
# Load the training data and discard the two identifier columns in one step.
df = pd.read_csv("atlantis_citizens_final.csv").drop(columns=["Citizen_ID", "Bio_Hash"])

# Integer class code for each occupation name.
occ_map = {
    "Warrior": 0,
    "Merchant": 1,
    "Fisher": 2,
    "Miner": 3,
    "Scribe": 4,
}

# Replace the occupation names by their codes; a name missing from occ_map becomes NaN.
df["Occupation"] = df["Occupation"].map(occ_map)

### One Hot Encoding Categorical Data:

In [217]:
# Engineered flag: 1 when the home district differs from the work district, else 0.
# It has to be computed here, before the district columns are replaced by dummies.
df["Out_for_Work"] = (df["District_Name"] != df["Work_District"]).astype("int64")

# Expand every categorical column into one 0/1 indicator column per category.
df = pd.get_dummies(
    df,
    columns=["District_Name", "Work_District", "Diet_Type", "Vehicle_Owned"],
    drop_first=False,
    dtype=int,
)

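Whether a citizen works outside their home district is characteristic of certain occupations, so a new binary column "Out_for_Work" is created (1 if "District_Name" differs from "Work_District", otherwise 0).
The categorical columns 'District_Name', 'Work_District', 'Diet_Type' and 'Vehicle_Owned' are then one-hot encoded.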

### Data Imputation:

In [218]:
# NOTE(review): these statistics are computed on the full frame, before the
# train/test split, and the Wealth_Index rule uses the target column, so it
# cannot be reproduced for unlabeled data.

# Overall mean of the observed values, taken before any gap is filled.
overall_mean_wealth = df["Wealth_Index"].mean()

# Wealth_Index: fill each gap with the mean Wealth_Index of the row's occupation.
# transform("mean") gives every row its own group's mean in one pass, replacing
# the per-occupation loop that rebuilt a full-column fillna on every iteration.
occupation_mean_wealth = df.groupby("Occupation")["Wealth_Index"].transform("mean")
df["Wealth_Index"] = df["Wealth_Index"].fillna(occupation_mean_wealth)

# Fallback: a row with no usable group mean (its occupation has no observed
# Wealth_Index, or the occupation code itself is missing) would otherwise stay
# NaN and propagate into scaling and training.
df["Wealth_Index"] = df["Wealth_Index"].fillna(overall_mean_wealth)

# The remaining numeric gaps use a single column-wide statistic.
df["Life_Expectancy"] = df["Life_Expectancy"].fillna(df["Life_Expectancy"].median())
df["House_Size_sq_ft"] = df["House_Size_sq_ft"].fillna(df["House_Size_sq_ft"].mean())

1. Missing values in Wealth_Index are filled with the mean Wealth_Index of the citizen's occupation (wealth is characteristic of an occupation, e.g. Merchants tend to be rich).
2. Missing values in Life_Expectancy are filled with the column median, and missing values in House_Size_sq_ft with the column mean.

### Splitting training and testing data:

In [219]:
# The target is the occupation code; the features are every other column.
y = df["Occupation"]
x = df.drop(columns="Occupation")

# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=15,
)

30% of data used for Testing/Evaluation.

### Data Scaling:

In [None]:
# Continuous features to standardise; all other columns are passed through unchanged.
columns_to_scale = ["Wealth_Index", "House_Size_sq_ft", "Life_Expectancy"]

scaler_step = ("scaler", preprocessing.StandardScaler(), columns_to_scale)
preprocessor = ColumnTransformer(transformers=[scaler_step], remainder="passthrough")

# Learn the scaling statistics from the training split only, then apply the
# same fitted transformer to the held-out split.
xtrain = preprocessor.fit_transform(xtrain)
xtest = preprocessor.transform(xtest)

Data Scaling applied on "Wealth_Index","House_Size_sq_ft" and "Life_Expectancy" columns.

### Neural Network:

#### Model Structure and Training:

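The model is trained with two callbacks: early stopping, which halts training when the validation loss stops improving and restores the weights of the best epoch, and a scheduler that halves the learning rate when the validation loss plateaus.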

In [221]:
# Seed the Python, NumPy and backend random generators so weight initialisation,
# dropout masks and batch shuffling are repeatable between runs. The value
# matches the random_state used for the train/test split.
keras.utils.set_random_seed(15)

# Two hidden blocks (Dense -> BatchNorm -> ReLU -> Dropout) followed by a
# 5-way softmax, one output unit per occupation code.
model = keras.Sequential([
    keras.layers.Input(shape=(xtrain.shape[1],)),

    keras.layers.Dense(256),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.3),

    keras.layers.Dense(128),
    keras.layers.BatchNormalization(),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.3),

    keras.layers.Dense(5, activation="softmax"),
])

# The sparse loss takes the integer class codes directly (no one-hot targets).
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Stop after 8 epochs without a lower validation loss and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=8,
    restore_best_weights=True,
)

# Halve the learning rate after 4 stagnant epochs, never going below 1e-5.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=4,
    min_lr=0.00001,
)

# 20% of the training split is set aside as validation data for the callbacks.
# Keeping the History object makes the learning curves available afterwards
# and avoids a stray object repr as the cell output.
history = model.fit(
    xtrain,
    ytrain,
    epochs=100,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
)

Epoch 1/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5313 - loss: 1.1374 - val_accuracy: 0.5946 - val_loss: 0.9860 - learning_rate: 0.0010
Epoch 2/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5787 - loss: 1.0060 - val_accuracy: 0.6218 - val_loss: 0.9054 - learning_rate: 0.0010
Epoch 3/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5830 - loss: 0.9855 - val_accuracy: 0.6222 - val_loss: 0.9028 - learning_rate: 0.0010
Epoch 4/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5983 - loss: 0.9562 - val_accuracy: 0.6231 - val_loss: 0.8983 - learning_rate: 0.0010
Epoch 5/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6033 - loss: 0.9492 - val_accuracy: 0.6186 - val_loss: 0.8984 - learning_rate: 0.0010
Epoch 6/100
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

<keras.src.callbacks.history.History at 0x2b183ddb4d0>

#### Model Evaluation:

In [222]:
# Loss and accuracy on the held-out split; the result is displayed as [loss, accuracy].
model.evaluate(x=xtest, y=ytest)

[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 690us/step - accuracy: 0.6310 - loss: 0.8792


[0.8792304396629333, 0.6309775710105896]

Accuracy on the held-out test split: about 63% (loss 0.88).

### Creating prediction:

##### Reading test csv:

In [204]:
# Load the unlabeled data and keep the citizen IDs for the submission file
# before the identifier columns are removed from the feature frame.
dft = pd.read_csv("test_atlantis_hidden.csv")
cid = dft["Citizen_ID"]
dft = dft.drop(columns=["Citizen_ID", "Bio_Hash"])

##### Processing data:

In [205]:
# Same engineered flag as for the training data: 1 when home and work districts differ.
dft["Out_for_Work"] = (dft["District_Name"] != dft["Work_District"]).astype("int64")

# Fill gaps in the scaled numeric features. Without this, a NaN passes through
# the scaler and the network unchanged, and argmax then silently yields class 0.
# The fill values are the per-column training means held by the fitted scaler,
# so they map to 0 after scaling. The occupation-wise rule used on the training
# data cannot be applied here because this data has no Occupation column.
_, fitted_scaler, scaled_columns = preprocessor.transformers_[0]
training_means = dict(zip(scaled_columns, fitted_scaler.mean_))
dft = dft.fillna(value=training_means)

dft = pd.get_dummies(
    dft,
    columns=["District_Name", "Work_District", "Diet_Type", "Vehicle_Owned"],
    drop_first=False,
    dtype=int,
)

# The dummies are built independently of the training frame, so a category
# absent from this file would leave a column missing and an unseen category
# would add one. Align to the exact columns (and order) the preprocessor was
# fitted on; an indicator column missing here is all zeros.
dft = dft.reindex(columns=preprocessor.feature_names_in_, fill_value=0)

dft = preprocessor.transform(dft)

##### Creating submission csv:

In [206]:
# One row of class scores per citizen, one column per occupation code.
ans = model.predict(dft)

# The predicted occupation code is the position of the highest score in each row.
occs = np.argmax(ans, axis=1)

# Pair every citizen ID with its predicted code and write the submission file.
submit = pd.DataFrame({"Citizen_ID": cid, "Occupation": occs})
submit.to_csv("submit21.csv", index=False)

[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 901us/step
