In [1]:
import numpy as np
import pandas as pd 
import tensorflow as tf 
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.layers import BatchNormalization, Dense, Embedding, Input, Concatenate, Flatten, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow import one_hot
from tensorflow.keras.utils import plot_model

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target
0,0,40.10891,-83.09286,8336,0,3,-135060.089443,86013.396489,1206.094242,52287.082257,0
1,1,39.86542,-84.0628,18403,1,0,-1766.845055,14985.64018,477.494992,168836.215743,1
2,2,39.10266,-84.52468,14022,0,3,-177302.873693,44881.958005,1463.339889,130388.243325,0
3,3,39.10148,-84.52341,11051,0,0,209049.99746,0.0,95.340075,103267.727546,1
4,4,41.06213,-81.53784,3243,0,3,8669.269507,0.0,399.421926,177532.206618,1


In [3]:
def transform_df(df):
    df = df.drop(columns=df.columns[0])
    mask = df['company'].value_counts()
    df['company'] = np.where(df['company'].isin(mask.index[mask>=10]), df.company, -1)
    return df

In [4]:
%%time
train_df = transform_df(train_df)
test_df = transform_df(test_df)

Wall time: 42.9 ms


In [5]:
km = KMeans(n_clusters=300, random_state=42)
train_df['geo'] = km.fit_predict(train_df[['latitude', 'longitude']])
test_df['geo'] = km.predict(test_df[['latitude', 'longitude']])

In [6]:
le = LabelEncoder()
le.fit(train_df.company)
train_df['company'] = le.transform(train_df.company)

In [7]:
test_df['company'] = np.where(test_df['company'].isin(train_df['company'].unique()), test_df['company'], -1)
test_df['company'] = le.transform(test_df.company)

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df.drop('target', axis=1), 
                                                      train_df.target, test_size=0.1, random_state=42)

In [9]:
numeric_cols = ['fin_1', 'fin_2', 'fin_3', 'fin_4', 'is_local']

In [10]:
def build_model(numeric_data, company_data, type_data, geo_data, num_company):
    numeric_input = Input(shape=(5,), dtype=tf.float32, name="numeric_data")
    numeric = Dense(32, activation='relu')(numeric_input)
    numeric = BatchNormalization()(numeric)
    numeric = Dense(32, activation='relu')(numeric)
    numeric = BatchNormalization()(numeric)
    numeric = Reshape((1, 32))(numeric)

    
    company_input = Input(shape=(1,), dtype=tf.float32, name="company_data")
    company = Embedding(num_company + 1, 64)(company_input)
    company = Dense(32, activation='relu')(company)
    # company = Reshape((-1, 32))(company)

    type_l_input = Input(shape=(1,), dtype=tf.float32, name="type_data")
    type_l = Embedding(6, 8)(type_l_input)
    # type_l = Reshape((-1, 8))(type_l)

    geo_input = Input(shape=(1,), dtype=tf.float32, name="geo_data")
    geo = Embedding(300, 32)(geo_input)
    geo = Dense(32, activation='relu')(geo)
    # geo = Reshape((-1, 32))(geo)

    
    out = Concatenate()([numeric, company, type_l, geo])
    out = Dense(64, activation='relu')(out)
    out = BatchNormalization()(out)
    out = Dense(64, activation='relu')(out)
    out = BatchNormalization()(out)
    out = Dense(1, activation="sigmoid")(out)
    
    model = Model(inputs=[numeric_input, company_input, type_l_input, geo_input], outputs=out)
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

model = build_model(train_df[numeric_cols].values, train_df.company.values, 
                    train_df.type.values, train_df.geo.values, train_df.company.max())

In [11]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
numeric_data (InputLayer)       [(None, 5)]          0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 32)           192         numeric_data[0][0]               
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 32)           128         dense[0][0]                      
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 32)           1056        batch_normalization[0][0]        
______________________________________________________________________________________________

In [12]:
plot_model(model, to_file='model.png')

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [13]:
BATCH_SIZE = 128

train_dataset = (
    tf.data.Dataset
    .from_tensor_slices(({"numeric_data": X_train[numeric_cols].values, 
                          "company_data": X_train.company.values, 
                          "type_data": X_train.type.values, 
                          "geo_data": X_train.geo.values}, y_train))
    .shuffle(2048)
    .batch(BATCH_SIZE)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices(({"numeric_data": X_valid[numeric_cols].values, 
                          "company_data": X_valid.company.values, 
                          "type_data": X_valid.type.values, 
                          "geo_data": X_valid.geo.values}, y_valid))
    .batch(BATCH_SIZE)

)

In [14]:
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='.',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

In [15]:
%%time
n_steps = X_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=10, 
    callbacks=[model_checkpoint_callback]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 37.2 s


In [16]:
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(({"numeric_data": test_df[numeric_cols].values, 
                          "company_data": test_df.company.values, 
                          "type_data": test_df.type.values, 
                          "geo_data": test_df.geo.values}))
    .batch(BATCH_SIZE)
)
predictions = model.predict(test_dataset, verbose=1).argmax(axis=1)



In [17]:
accuracy_score(test_df.target.values, predictions.reshape(-1))

0.40665