In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# dataset Info
base_dir = '/content/drive/MyDrive/Capstone/Recommendation System'
df_dir = 'Dummy_Data.xlsx'
df_dir = os.path.join(base_dir, df_dir)
dataset = pd.read_excel(df_dir)

# Showing first 5 data
print(dataset.head())

     id  jenis kelamin  umur                prov   pekerjaan      gaji  \
0  2167              0    22    SUMATERA SELATAN       Hakim     kecil   
1  8819              1    30  KALIMANTAN SELATAN        Guru  menengah   
2  4177              1    47          JAWA TIMUR    Wartawan      atas   
3  4239              1    37      SUMATERA BARAT     Tentara  menengah   
4  6007              0    52              MALUKU  Pustakawan     kecil   

     letak     label  
0  outdoor      sawi  
1  outdoor  kangkung  
2   indoor      sawi  
3   indoor    selada  
4  outdoor  kangkung  


In [5]:
print(dataset.columns)

Index(['id', 'jenis kelamin', 'umur', 'prov', 'pekerjaan', 'gaji', 'letak',
       'label'],
      dtype='object')


In [4]:
# Encoding the categorical data
label_encoders = {}
for column in ['jenis kelamin', 'prov', 'pekerjaan', 'gaji', 'letak', 'label']:
    le = LabelEncoder()
    dataset[column] = le.fit_transform(dataset[column])
    label_encoders[column] = le

In [11]:
# Spliting Data into features and labels
X = dataset.drop(['label', 'id'], axis = 1)
Y = dataset['label']

In [12]:
# Spliting Data into trains and tests
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [13]:
X_train.head()

566853    11
382311     5
241519     7
719220    17
905718    20
Name: pekerjaan, dtype: int64

In [11]:
# Show The Original Index
for column in dataset.columns[3:]:
  encoded_values = dataset[column].unique()
  original_values = label_encoders[column].inverse_transform(encoded_values)
  print(f'\nEncoded to Original Mapping for {column}:')
  for encoded, original in zip(encoded_values, original_values):
    print(f"{encoded}: {original}")


Encoded to Original Mapping for prov:
32: SUMATERA SELATAN
12: KALIMANTAN SELATAN
10: JAWA TIMUR
31: SUMATERA BARAT
19: MALUKU
22: NUSA TENGGARA TIMUR
18: LAMPUNG
15: KALIMANTAN UTARA
23: PAPUA
17: KEP. RIAU
6: GORONTALO
14: KALIMANTAN TIMUR
9: JAWA TENGAH
13: KALIMANTAN TENGAH
4: DI YOGYAKARTA
33: SUMATERA UTARA
1: BALI
11: KALIMANTAN BARAT
28: SULAWESI TENGAH
8: JAWA BARAT
5: DKI JAKARTA
0: ACEH
25: RIAU
3: BENGKULU
30: SULAWESI UTARA
27: SULAWESI SELATAN
24: PAPUA BARAT
29: SULAWESI TENGGARA
16: KEP. BANGKA BELITUNG
7: JAMBI
26: SULAWESI BARAT
21: NUSA TENGGARA BARAT
20: MALUKU UTARA
2: BANTEN

Encoded to Original Mapping for pekerjaan:
7: Hakim
6: Guru
24: Wartawan
22: Tentara
20: Pustakawan
3: Artis
8: Insinyur Mesin
17: Perawat
14: Pemadam Kebakaran
12: Nelayan
16: Penulis
13: Notaris
21: Teller Bank
1: Akuntan
5: Dokter
23: Tukang Cukur
18: Petani
19: Pilot
0: Ahli Gizi
9: Koki
10: Kondektur
11: Masinis
4: Bidan
2: Arsitek
15: Penerjemah

Encoded to Original Mapping for gaji:
1

In [18]:
print(X_train.shape)
print(Y_train.shape)

(800000,)
(800000,)


In [19]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(1,)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    # tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(4, activation='softmax')
])
# Mengkompilasi model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.01),
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Melatih model
history = model.fit(X_train, Y_train, epochs=10, validation_data=(X_test, Y_test))

Epoch 1/10
  362/25000 [..............................] - ETA: 1:35 - loss: 1.4292 - accuracy: 0.2417

KeyboardInterrupt: 