In [1]:
# Quieten TensorFlow's native logging: level '2' filters out INFO and WARNING
# messages. Keep this cell ahead of the `import tensorflow` cell so the
# setting is already in place when the library loads.
import os

os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '2'})

In [2]:
import pandas as pd
import tensorflow as tf

In [3]:
# Labelled rows (they include `price`); split into train/validation later on.
train_val_df = pd.read_csv(filepath_or_buffer="train.csv")

In [4]:
# Rows to predict on; written back out with predicted prices in the last cell.
test_df = pd.read_csv(filepath_or_buffer="test.csv")

In [5]:
# Peek at the first two raw rows to see the column layout.
train_val_df.head(n=2)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387


In [6]:
# Build one {label: integer code} lookup per categorical column.
# Codes follow the order in which labels first appear in the training data
# (the order `unique()` returns), so they carry no quality ranking.
cuts = train_val_df.cut.unique()
cuts_dict = {label: code for code, label in enumerate(cuts)}

color = train_val_df.color.unique()
colors_dict = {label: code for code, label in enumerate(color)}

clarity = train_val_df.clarity.unique()
claritys_dict = {label: code for code, label in enumerate(clarity)}

In [7]:
# Replace the categorical labels in `cut`, `color` and `clarity` with their
# integer codes. Only those three columns are touched, each with its own
# dictionary, instead of running the deprecated element-wise
# `DataFrame.applymap` over every cell of the frame three times.
# A value that is not a key of its dictionary is left unchanged, which also
# keeps the cell safe to re-run on an already-encoded frame.
categorical_codes = {"cut": cuts_dict, "color": colors_dict, "clarity": claritys_dict}
train_val_df = train_val_df.assign(**{
    column: train_val_df[column].map(lambda value, codes=codes: codes.get(value, value))
    for column, codes in categorical_codes.items()
})

In [8]:
# Encode the test set with the dictionaries derived from the training data so
# both frames share the same label -> code mapping. Only `cut`, `color` and
# `clarity` are touched, each with its own dictionary, instead of running the
# deprecated element-wise `DataFrame.applymap` over every cell three times.
# A label that never occurred in the training data is left unchanged.
categorical_codes = {"cut": cuts_dict, "color": colors_dict, "clarity": claritys_dict}
test_df = test_df.assign(**{
    column: test_df[column].map(lambda value, codes=codes: codes.get(value, value))
    for column, codes in categorical_codes.items()
})

In [9]:
# Check that the categorical columns now hold integer codes.
train_val_df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,0,0,0,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,1,1,1,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,2,2,2,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,2,2,2,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,0,2,0,62.6,59.0,7.65,7.61,4.77,14453


In [10]:
# Regression target: the `price` column as a NumPy array.
y_train_val = train_val_df["price"].to_numpy()

In [11]:
# Features: everything except the row identifier and the target.
X_train_val = train_val_df.drop(columns=["id", "price"])

In [12]:
# Test features: drop only the row identifier (`test_df` keeps it for the output file).
X_test = test_df.drop(columns=["id"])

In [13]:
# Confirm `id` and `price` are gone from the feature frame.
X_train_val.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,0,0,0,62.2,58.0,7.27,7.33,4.55
1,2.03,1,1,1,62.0,58.0,8.06,8.12,5.05
2,0.7,2,2,2,61.2,57.0,5.69,5.73,3.5
3,0.32,2,2,2,61.6,56.0,4.38,4.41,2.71
4,1.7,0,2,0,62.6,59.0,7.65,7.61,4.77


In [14]:
# Per-column maximum of the train/validation features, in column order; used
# as the scaling divisors for both the train/validation and the test frames.
# `Series.max()` is vectorised and skips NaN, whereas the builtin `max()`
# iterates value by value in Python and returns an order-dependent result
# when a column contains NaN.
highest_per_col = [X_train_val[col].max() for col in X_train_val.columns]

In [15]:
# Display the per-column maxima used as scaling divisors in the following cells.
highest_per_col

[3.5, 4, 6, 7, 71.6, 79.0, 9.65, 10.01, 31.3]

In [16]:
# Scale each feature column by its maximum (divisors are matched to columns
# by position). NOTE: this rebinds `X_train_val`, so re-running the cell
# divides the already-scaled values again.
X_train_val = X_train_val.div(highest_per_col, axis="columns")

In [17]:
# Apply the training-derived divisors to the test features. The divisors are
# matched to columns by position, so `X_test` must have the same column order
# as `X_train_val`. NOTE: re-running the cell divides the scaled values again.
X_test = X_test.div(highest_per_col, axis="columns")

In [18]:
# Inspect the scaled train/validation features.
X_train_val.head(n=2)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.434286,0.0,0.0,0.0,0.868715,0.734177,0.753368,0.732268,0.145367
1,0.58,0.25,0.166667,0.142857,0.865922,0.734177,0.835233,0.811189,0.161342


In [19]:
# Inspect the scaled test features.
X_test.head(n=2)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.1,0.5,0.666667,0.0,0.870112,0.708861,0.467358,0.453546,0.090096
1,0.22,0.25,0.0,0.142857,0.877095,0.708861,0.604145,0.586414,0.117572


In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 10% of the labelled rows for validation; the fixed seed makes the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.10,
    random_state=42,
)

In [22]:
# Shapes of the train, test and validation feature frames (in that order).
tuple(frame.shape for frame in (X_train, X_test, X_val))

((174215, 9), (129050, 9), (19358, 9))

In [23]:
# Shapes of the train and validation target arrays.
tuple(target.shape for target in (y_train, y_val))

((174215,), (19358,))

In [24]:
# Fully connected regressor: four ReLU hidden layers of decreasing width, each
# followed by dropout, then a single linear output unit for the price.
hidden_units = (1024, 512, 256, 128)
dropout_rate = 0.3

layer_stack = []
for units in hidden_units:
    layer_stack.append(tf.keras.layers.Dense(units, activation='relu'))
    layer_stack.append(tf.keras.layers.Dropout(dropout_rate))
layer_stack.append(tf.keras.layers.Dense(1))

model = tf.keras.models.Sequential(layer_stack)

# Optimise mean squared error; also report mean absolute error.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
)

In [25]:
# Train for two epochs, evaluating on the held-out validation split after each.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=64,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f3e3f309e80>

In [26]:
# Predict prices for the test rows and flatten the model output to a 1-D array
# so it can sit next to the `id` column.
ans = model.predict(X_test)
ans = ans.reshape(-1)
ans_df = pd.DataFrame({"id": test_df.id, "price": ans})

# Create the output directory first: `to_csv` raises if `outputs/` is missing,
# which would otherwise lose the run after training has already completed.
os.makedirs("outputs", exist_ok=True)
ans_df.to_csv("outputs/ans2.csv", index=False)

