In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSV and the weights
# checkpoint read later in this notebook are both located under that path.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the SIM price listing from the mounted Drive folder into a DataFrame.
df = pd.read_csv("/content/drive/MyDrive/SIM_Dataset/SIMPrice.csv")

In [4]:
# Preview the loaded frame (the notebook display elides the middle rows).
df

Unnamed: 0,sim_number,price_vnd
0,343189288,450000
1,888899580,3000000
2,928960006,500000
3,902438679,5000000
4,334307889,450000
...,...,...
199995,866161769,450000
199996,708124126,1000000
199997,904755200,1000000
199998,329220204,450000


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,sim_number,price_vnd
count,200000.0,200000.0
mean,730479700.0,13950270.0
std,240647900.0,591299700.0
min,325009800.0,99000.0
25%,392260500.0,500000.0
50%,833428700.0,1000000.0
75%,918087400.0,5000000.0
max,997979700.0,168000000000.0


In [6]:
# Frequency of each distinct price strictly below 10,000,000 VND.
# A single .loc selection picks the rows and the column in one step.
df.loc[df["price_vnd"] < 10_000_000, "price_vnd"].value_counts()

450000     45119
1000000    38472
500000     30619
3000000    29199
5000000    21383
399000       138
299000        91
99000         58
250000        47
119000        32
350000        22
199000        18
290000         9
390000         3
280000         3
400000         2
220000         1
Name: price_vnd, dtype: int64

In [7]:
# Frequency of each distinct price strictly above 10,000,000 VND.
# The comparison is strict, so a price of exactly 10,000,000 is not counted here.
df.loc[df["price_vnd"] > 10_000_000, "price_vnd"].value_counts()

12000000     5124
11325000     2682
11000000     1760
13000000     1720
15000000     1417
             ... 
130500000       1
12150000        1
80100000        1
424000000       1
439000000       1
Name: price_vnd, Length: 921, dtype: int64

In [8]:
def get_sim_price_category(x):
  """Map a row's ``price_vnd`` value to an integer price category in 0..5.

  Args:
    x: Row-like object (e.g. a DataFrame row or a dict) exposing
      ``x["price_vnd"]``.

  Returns:
    0 for prices up to and including 450000; 1, 2, 3 or 4 for prices exactly
    equal to 500000, 1000000, 3000000 or 5000000 respectively; 5 for every
    other price, including unlisted values that lie between the exact tiers.
  """
  price = x["price_vnd"]
  if price <= 450000:
    return 0
  # Exact-match tiers; any price not listed falls through to the catch-all 5.
  exact_tiers = {500000: 1, 1000000: 2, 3000000: 3, 5000000: 4}
  return exact_tiers.get(price, 5)

# Assign a price category to every row. Mapping over the single price column
# avoids DataFrame.apply(axis=1), which materialises a full row Series per
# record; the small dict wrapper keeps calling get_sim_price_category through
# the row-like x["price_vnd"] access it expects.
df["sim_price_cat"] = df["price_vnd"].map(
    lambda price: get_sim_price_category({"price_vnd": price})
)

In [9]:
# Preview the frame again, now including the sim_price_cat column added above.
df

Unnamed: 0,sim_number,price_vnd,sim_price_cat
0,343189288,450000,0
1,888899580,3000000,3
2,928960006,500000,1
3,902438679,5000000,4
4,334307889,450000,0
...,...,...,...
199995,866161769,450000,0
199996,708124126,1000000,2
199997,904755200,1000000,2
199998,329220204,450000,0


In [10]:
# Number of rows that fall into each price category.
df["sim_price_cat"].value_counts()

0    45543
2    38472
5    34784
1    30619
3    29199
4    21383
Name: sim_price_cat, dtype: int64

In [11]:
# Turn every SIM number into the list of its decimal digits (one feature per
# digit). Iterating the column directly avoids DataFrame.iterrows(), which
# builds a Series per row and coerces the whole row to one common dtype before
# the number is stringified.
digit_rows = [[int(ch) for ch in str(number)] for number in df["sim_number"]]

# Every number must yield the same digit count, otherwise X cannot be a
# rectangular 2-D array; fail early with a clear message instead.
digit_counts = {len(digits) for digits in digit_rows}
if len(digit_counts) > 1:
  raise ValueError(
      f"sim_number values have inconsistent digit counts: {sorted(digit_counts)}"
  )

X = np.array(digit_rows)
# np.array copies, so y is independent of the DataFrame's underlying buffer.
y = np.array(df["sim_price_cat"])

In [12]:
# X has one row per SIM number and one column per digit.
X.shape

(200000, 9)

In [13]:
# y has one category label per SIM number.
y.shape

(200000,)

In [14]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Encode the category labels as integer class indices.
le = LabelEncoder()
integer_encoded = le.fit_transform(y)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new keyword first and fall back to the
# old one on earlier releases; either way the encoder produces a dense array.
try:
  oe = OneHotEncoder(sparse_output=False)
except TypeError:
  oe = OneHotEncoder(sparse=False)

# OneHotEncoder expects 2-D input, so present the labels as a single column.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
y_one_hot = oe.fit_transform(integer_encoded)
print(y_one_hot)

[[1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 ...
 [0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [15]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples as the test split; the fixed random_state keeps
# the shuffle, and therefore the split, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_one_hot,
    test_size=0.1,
    random_state=42,
)

In [16]:
# Shape of the training features after the split.
X_train.shape

(180000, 9)

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam

In [18]:
# Stacked-LSTM classifier over the digit sequence of a SIM number. Each sample
# is fed as X_train.shape[1] timesteps with a single feature per step, and the
# final softmax layer outputs one probability per price category (6 units).
# NOTE(review): the Dense(512) layer is built without an activation argument;
# confirm this is intended before changing it, since previously saved weights
# are tied to this exact architecture.
model = Sequential([
  LSTM(units=512, return_sequences=True, input_shape=(X_train.shape[1], 1)),
  Dropout(0.2),
  LSTM(units=512, return_sequences=True),
  Dropout(0.2),
  LSTM(units=512, return_sequences=True),
  Dropout(0.2),
  # Last recurrent layer returns only its final state for the dense head.
  LSTM(units=128, return_sequences=False),
  Dropout(0.2),
  Dense(units=512),
  Dense(units=6, activation="softmax"),
])

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
  loss="categorical_crossentropy",
  metrics=["accuracy"],
  optimizer=Adam(),
)

In [19]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 9, 512)            1052672   
                                                                 
 dropout (Dropout)           (None, 9, 512)            0         
                                                                 
 lstm_1 (LSTM)               (None, 9, 512)            2099200   
                                                                 
 dropout_1 (Dropout)         (None, 9, 512)            0         
                                                                 
 lstm_2 (LSTM)               (None, 9, 512)            2099200   
                                                                 
 dropout_2 (Dropout)         (None, 9, 512)            0         
                                                                 
 lstm_3 (LSTM)               (None, 128)               3

In [21]:
# Training is disabled by default; the notebook loads previously saved weights
# in a later cell. Set TRAIN_MODEL = True to retrain from scratch.
TRAIN_MODEL = False

if TRAIN_MODEL:
  from tensorflow.keras.callbacks import ModelCheckpoint

  # Keep only the weights of the epoch with the best validation accuracy.
  model_checkpoint_callback = ModelCheckpoint(
      filepath="ckpt_best.hdf5",
      save_weights_only=True,
      monitor='val_accuracy',
      save_best_only=True, verbose=1)

  # NOTE(review): the held-out test split doubles as validation data here, so
  # the best checkpoint is selected on the same samples used for evaluation.
  history = model.fit(X_train, y_train, epochs = 30, batch_size = 64, validation_data = (X_test, y_test), callbacks = [model_checkpoint_callback])

'\nfrom tensorflow.keras.callbacks import ModelCheckpoint\nmodel_checkpoint_callback = ModelCheckpoint(\n    filepath="ckpt_best.hdf5",\n    save_weights_only=True,\n    monitor=\'val_accuracy\',\n    save_best_only=True, verbose=1)\n\nhistory = model.fit(X_train, y_train, epochs = 30, batch_size = 64, validation_data = (X_test, y_test), callbacks = [model_checkpoint_callback])\n'

In [22]:
# Load previously saved model weights from the checkpoint file on the mounted Drive.
model.load_weights("/content/drive/MyDrive/SIM_Dataset/ckpt_best.hdf5")

In [23]:
import random

# Spot-check the model on five randomly chosen test samples.
for i in range(5):
  # randrange excludes its upper bound, so the index is always valid for
  # X_test. The previous randint(0, 20000) is inclusive and could return 20000,
  # one past the last row of the 20000-row test split.
  idx = random.randrange(len(X_test))
  sample = X_test[idx]
  # predict works on batches, so wrap the single sample in a leading batch axis.
  result = model.predict(np.expand_dims(sample, axis=0))
  predicted_class = np.argmax(result)
  print("Số sim = ", sample)
  print("Kết quả dự đoán = ", result)
  print("Index dự đoán = ", predicted_class)
  print("Giá trị thật = ", y_test[idx])
  print("Đúng/sai = ", predicted_class == np.argmax(y_test[idx]))

Số sim =  [7 6 8 6 3 9 2 2 2]
Kết quả dự đoán =  [[1.4708489e-06 2.9330683e-04 1.8189149e-02 9.1940480e-01 6.1615564e-02
  4.9563154e-04]]
Index dự đoán =  3
Giá trị thật =  [0. 0. 0. 1. 0. 0.]
Đúng/sai =  True
Số sim =  [5 6 6 5 5 4 5 2 2]
Kết quả dự đoán =  [[6.3531193e-06 9.9531269e-01 4.3218434e-03 1.1840275e-04 9.4642470e-05
  1.4596157e-04]]
Index dự đoán =  1
Giá trị thật =  [0. 1. 0. 0. 0. 0.]
Đúng/sai =  True
Số sim =  [9 0 4 6 7 8 5 7 9]
Kết quả dự đoán =  [[4.0606743e-08 1.1153902e-08 2.9990989e-07 2.5754125e-04 7.6855250e-02
  9.2288691e-01]]
Index dự đoán =  5
Giá trị thật =  [0. 0. 0. 0. 0. 1.]
Đúng/sai =  True
Số sim =  [7 6 3 1 5 5 2 6 6]
Kết quả dự đoán =  [[6.3247199e-09 3.1103348e-06 9.9998784e-01 8.9485120e-06 1.2399668e-07
  6.5671784e-09]]
Index dự đoán =  2
Giá trị thật =  [0. 0. 1. 0. 0. 0.]
Đúng/sai =  True
Số sim =  [3 9 9 5 1 8 8 9 9]
Kết quả dự đoán =  [[3.7746954e-06 2.1891687e-04 4.1706511e-04 1.7380409e-02 1.6484890e-03
  9.8033136e-01]]
Index dự đoán =  