# Imports

In [None]:
import numpy as np 
import pandas as pd
import os, re, math
# Kaggle boilerplate: print the full path of every file found under /kaggle/input
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

import warnings
from sklearn import metrics

from catboost import CatBoostRegressor

import category_encoders as ce
from tqdm.notebook import tqdm

In [None]:
import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import albumentations

In [None]:
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 15, 10         # default figure size in inches: width 15, height 10
%config InlineBackend.figure_format = 'svg' # make default plot format svg
%matplotlib inline
import seaborn as sns

# Data loads

In [None]:
# Location of the competition files on Kaggle; for a local run use the line below instead
PATH = '../input/sf-dst-car-price-prediction-part2/'
# PATH=''

# show every column when a frame is displayed
pd.set_option('display.max_columns', None)

data, test, sample_submission = (
    pd.read_csv(PATH + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
SEED=42
np.random.seed(SEED)
# seed TensorFlow as well: NumPy's seed does not cover Keras weight initialisation or dropout
tf.random.set_seed(SEED)
TARGET = 'price'

# Functions
EDA, FE, etc.

In [None]:
def find_number(field: str):
    """
    Extract the first numeric token from a string.

    At each position the alternatives are tried in this order: a digit run
    continued by digits, dots or commas ('150', '1.8', '1,5'), a decimal with an
    optional integer part ('.5'), and finally a plain run of digits ('3').

    :param field: string to search; a non-string value (e.g. NaN for a missing
                  cell) is treated as containing no number
    :return: the first match as a string, or None when nothing matches
    """
    # a missing value is not a string and cannot be searched
    if not isinstance(field, str):
        return None

    # raw literal: '\d' inside a plain string is an invalid escape sequence
    match = re.search(r'[\d]+[.,\d]+|[\d]*[.][\d]+|[\d]+', field)
    return match[0] if match is not None else None

In [None]:
def name_separate(item):
    """
    Return the part of a 'name' value that precedes the first '(', ')' or '-'.

    :param item: text value from the 'name' column
    :return: leading part of the text, e.g. '180 BlueEFFICIENCY 1.8 AT ' for
             '180 BlueEFFICIENCY 1.8 AT (156 л.с.)' (whitespace is not trimmed)
    """
    # the three delimiters are unified to '-' so that one split finds the first of them
    return item.replace('(', '-').replace(')', '-').split('-', 1)[0]

In [None]:
def mape(y_true, y_pred):
    """
    Mean Absolute Percentage Error of y_pred against y_true, expressed in percent.
    """
    relative_error = (y_pred-y_true)/y_true
    absolute_error = np.abs(relative_error)
    return np.mean(absolute_error)*100

In [None]:
results = pd.DataFrame()
def feature_imp(dataset, regressor, encoder, test_drop = True, target='price'):
    """
    Fit a regressor on a hold-out split and report feature importances and scores.

    The target is modelled in log space. The function prints the feature
    importances (descending), the regressor's own score on the hold-out part and
    the MAPE measured in the original target scale. The encoded hold-out
    features together with the true and predicted target values are stored in
    the global ``results`` frame.

    :param dataset: dataset for ML (feature columns plus the target column)
    :param regressor: estimator with fit / predict / score / feature_importances_
    :param encoder: encoder for categorical columns (fit / transform)
    :param test_drop: if True, keep only the rows whose 'test' column equals 0
    :param target: name of the target column
    :return: None, the report is printed and the details go to ``results``
    """
    global results

    # work on a copy so the caller's frame is never modified
    dataset = dataset.copy()

    # optionally leave only the labelled (non-test) rows
    if test_drop:
        dataset = dataset[dataset.test == 0]

    # features and log-target for the ML model
    x, y = dataset.drop(target, axis=1), np.log(dataset[target])

    # NOTE(review): the encoder is fitted on all rows before the split, so with a
    # target-based encoder the hold-out rows leak into the encoding and the
    # reported scores are optimistic
    encoder.fit(x, y)
    X = encoder.transform(x)

    # hold-out split and fit
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=SEED)
    regressor.fit(X_train, y_train)

    # feature importances, most important first
    value = pd.DataFrame(data=regressor.feature_importances_, index=X.columns, columns=['importance'])
    print(value.importance.sort_values(ascending=False))

    # the regressor's own score on the hold-out part (still in log space)
    score = regressor.score(X_test, y_test)
    print(f'Score of regressor {score*100:0.2f}%')

    # MAPE in the original scale: both sides are back-transformed with exp
    predict = np.exp(regressor.predict(X_test))
    print(f"MAPE: {(mape(np.exp(y_test), predict)):0.2f}%")

    # a fresh frame replaces whatever the previous call stored
    results = X_test.copy()
    results['y_test'] = np.exp(y_test)
    results['predict'] = predict.astype('int64')

### Cooking time for data

In [None]:
def cooking():
    """
    Reload the train/test CSV files and add the engineered features to both.

    Rebinds the global frames ``data`` (train) and ``test``. The MinMax scalers
    are fitted on train and applied to test. The rows listed in ``ejection`` are
    removed from train only. New columns are appended in the same order for both
    frames.
    """
    global data, test

    # ---- load datasets ---
    data = pd.read_csv(PATH + 'train.csv')
    test = pd.read_csv(PATH + 'test.csv')

    # --- bodyType ----
    ## -- repair mistake in data
    data.loc[(data.bodyType == 'хэтчбек 3 дв.') & (data.numberOfDoors == 2), 'numberOfDoors'] = 3

    ## -- working
    n_doors = {
        'внедорожник 3 дв.': 'внедорожник',
        'внедорожник 5 дв.': 'внедорожник',
        'внедорожник открытый': 'внедорожник открытый',
        'кабриолет': 'кабриолет',
        'компактвэн': 'компактвэн',
        'купе': 'купе',
        'купе-хардтоп': "купе-хардтоп",
        'лимузин': 'лимузин',
        'лифтбек': 'лифтбек',
        'минивэн': 'минивэн',
        'пикап двойная кабина': 'пикап двойная кабина',
        'родстер': 'родстер',
        'седан': 'седан',
        'седан 2 дв.': 'седан',
        'универсал 5 дв.': 'универсал',
        'хэтчбек 3 дв.': 'хэтчбек',
        'хэтчбек 5 дв.': 'хэтчбек'
    } # dic of clear body types
    for frame in (data, test):
        frame['body'] = frame.bodyType.map(n_doors)
        # element-wise str() instead of a row-wise apply: same text, far fewer Python calls
        frame['bodyDoors'] = frame['body'].map(str) + " " + frame['numberOfDoors'].map(str)

    # --- engineDisplacement ---
    for frame in (data, test):
        # values without a number get 0.0001; the result is assigned back because
        # fillna(inplace=True) on a selected column does not reliably update the frame
        frame['engineDisplacementValue'] = (
            frame.engineDisplacement.apply(find_number).fillna(value=0.0001).apply(float)
        )
    scaler = MinMaxScaler()
    data['edvMinMax'] = scaler.fit_transform(data[['engineDisplacementValue']])
    test['edvMinMax'] = scaler.transform(test[['engineDisplacementValue']])

    # --- productionDate ---
    # till 3 years - free technical support
    # till 5 years - guarantee period
    # after 10 years - vehicle loose up to half cost
    # after 30 years - it is just rare
    def age_bucket(date):
        """Map a production year to its age bucket (3, 5, 10, 20 or 30)."""
        if date >= 2018:
            return 3
        if date >= 2016:
            return 5
        if date >= 2011:
            return 10
        if date >= 2001:
            return 20
        return 30

    for frame in (data, test):
        # integer part of the displacement
        frame['displacement'] = frame.engineDisplacementValue//1

        # --- enginePower ---
        frame['enginePowerValue'] = frame.enginePower.apply(find_number).apply(int)
        frame['epvLog'] = frame.enginePowerValue.apply(np.log)

        frame['age'] = frame.productionDate.apply(age_bucket)

        # --- modelDate ---
        frame['modelDateLog'] = frame.modelDate.apply(lambda q: np.log(2022-q))

    scaler = MinMaxScaler()
    data['modelDateMM'] = scaler.fit_transform(data[['modelDate']])
    test['modelDateMM'] = scaler.transform(test[['modelDate']])

    # --- owners ---
    for frame in (data, test):
        frame['Владельцы'] = frame['Владельцы'].fillna('2 владельца')
        frame['owners'] = frame['Владельцы'].apply(find_number)
        frame['ownerValue'] = frame['owners'].apply(int)
        frame['Владение'] = frame['Владение'].fillna('Не известно')

    # --- delete noise data ---
    ejection={
        'color': {'розовый'},
        'engineDisplacement': {'3.4 LTR', '4.6 LTR', '4.9 LTR', '5.6 LTR', '6.3 LTR'},
        'sell_id': {1100083262},
        'Руль': {'Правый'},
        'bodyType': {'седан 2 дв.', 'компактвэн', 'лимузин'}
    }
    for vector, values in ejection.items():
        data = data[~data[vector].isin(values)]
    # detach from the unfiltered frame so the assignments below do not hit a slice
    data = data.copy()

    for frame in (data, test):
        # ---name---
        frame['name1'] = frame['name'].apply(name_separate)
        frame['vehicle'] = frame.brand + ' ' + frame.model_info + ' ' + frame.vehicleConfiguration + ' ' + frame.name1

        # --- description ---
        # number of whitespace-separated words in the advert text
        frame['descriptionLen'] = frame.description.apply(lambda q: len(q.split()))

cooking()

In [None]:
# column overview of the engineered train frame (target and free text left out)
data.drop(columns=['price', 'description']).info()

In [None]:
# columns left out of the importance check
list_to_drop = [
    'engineDisplacementValue',
    'modelDate',
    'modelDateMM',
    'Руль',
    'enginePowerValue',
    'description',
]
parameters = dict(
    regressor=CatBoostRegressor(random_state=SEED),
    encoder=ce.TargetEncoder(),
)
feature_imp(data.drop(columns=list_to_drop), test_drop=False, **parameters)

# Catboost submission

In [None]:
# features and log-target for the final CatBoost fit; q is the frame to predict
x = data.drop(columns=[TARGET])
y = np.log(data[TARGET])
q = test

In [None]:
# target-encode the categorical columns; the encoder is fitted on the train frame only
encoder = ce.TargetEncoder().fit(x, y)
x, q = encoder.transform(x), encoder.transform(q)

In [None]:
model = CatBoostRegressor(iterations = 5000,
                          random_seed = SEED,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          )
# y already holds log(price) (see the cell that builds x and y), so it is passed
# as is: taking the log a second time would make exp(prediction) return
# log-prices instead of prices
model.fit(x, y,
          verbose_eval=0,
          )

model.save_model('catboost_single_model_EDA1.model')

# back-transform the log-price predictions to the price scale
q['price'] = np.exp(model.predict(q))

In [None]:
# submission file: listing id plus predicted price
catboost_prediction = q.loc[:, ['sell_id', 'price']]
catboost_prediction.to_csv('catboost_sub.csv', index=False)
catboost_prediction
# 11.6153% on leaderboard

In [None]:
# same importance check as the one run before the CatBoost submission
list_to_drop = [
    'engineDisplacementValue',
    'modelDate',
    'modelDateMM',
    'Руль',
    'enginePowerValue',
    'description',
]
parameters = dict(
    regressor=CatBoostRegressor(random_state=SEED),
    encoder=ce.TargetEncoder(),
)
feature_imp(data.drop(columns=list_to_drop), test_drop=False, **parameters)

In [None]:
# Candidate feature subset for the neural network; it is only referenced from
# commented-out lines in the cells below.
list_to_nn = [
    'name', 'mileage', 'productionDate', 'vehicle', 'modelDateLog', 'model_info', 'name1', 'age', 'epvLog', 
    'edvMinMax', 
    'enginePower', 'engineDisplacement',
    #'descriptionLen', 'vehicleConfiguration'
]

# Tabular
Build a simple neural network

In [None]:
# the target is expressed in millions for the network
x = data.drop(columns=[TARGET])
y = data[TARGET]/1000000

# x, y = data[list_to_nn], data[TARGET]
# Dropping the columns with small feature importance was tried as well, but it did not improve the result

# encode categorical data
encoder = ce.TargetEncoder().fit(x, y)
x = encoder.transform(x)

# standardize numeric data so that it suits the neural network better
# NOTE(review): encoder and scaler are fitted before the split, so the validation rows take part in both
scaler = StandardScaler()
x = scaler.fit_transform(x)

# split data into train and validation parts
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.15,
    shuffle=True,
    random_state=SEED,
)

In [None]:
# encode and standardize the submission set with the already fitted encoder and scaler
#q = test[list_to_nn]
q = scaler.transform(encoder.transform(test))

In [None]:
# my first attempt: three hidden layers of decreasing width
model_tab = Sequential([
    L.Dense(2048, input_dim=x_train.shape[1], activation="sigmoid"),
    L.Dropout(0.5),
    L.Dense(1024, activation="relu"),
    L.Dropout(0.5),
    L.Dense(256, activation="relu"),
    L.Dropout(0.25),
    L.Dense(1, activation="linear"),
])
# 12.63

In [None]:
# my second attempt: alternating wide and narrow hidden layers
model_tab = Sequential([
    L.Dense(2048, input_dim=x_train.shape[1], activation="relu"),
    L.Dropout(0.5),
    L.Dense(256, activation="relu"),
    L.Dropout(0.5),
    L.Dense(2048, activation="relu"),
    L.Dropout(0.25),
    L.Dense(256, activation="relu"),
    L.Dropout(0.25),
    L.Dense(1024, activation="relu"),
    L.Dropout(0.25),
    L.Dense(256, activation="relu"),
    #L.Dropout(0.25),
    L.Dense(1, activation="linear"),
])
# 12.47

In [None]:
# My third attempt. I think it is my best one.
model_tab = Sequential([
    L.Dense(256, input_dim=x_train.shape[1], activation="sigmoid"),
    L.BatchNormalization(axis=1),
    L.Dropout(0.5),
    L.Dense(256, activation="relu"),
    L.Dropout(0.5),
    L.Dense(256, activation="relu"),
    L.BatchNormalization(axis=1),
    #L.Dropout(0.5),
    L.Dense(1, activation="linear"),
])

## Step behind
After experimenting with the tabular and NLP networks, I decided to apply the "feature forwarding" technique, since one feature vector correlates very strongly with the target vector and two more correlate well. So I added one branch with a "linear" activation and two branches with "sigmoid" activations. This improved the results on validation (but not on the leaderboard :/ ...).

In [None]:
# tab1
tab_start = tf.keras.Input(shape=x_train.shape[1])

# main flow: deep branch ending in a single sigmoid unit
tab_flow1 = Sequential([
    L.Dense(256, activation="sigmoid"),
    L.BatchNormalization(axis=1),
    L.Dropout(0.5),
    L.Dense(256, activation="relu"),
    L.BatchNormalization(axis=1),
    L.Dropout(0.5),
    L.Dense(256, activation="relu"),
    L.BatchNormalization(axis=1),
    L.Dropout(0.25),
    L.Dense(1, activation="sigmoid"),
])

# shortcut branches: one linear and two sigmoid units, each applied to the whole input
tab_flow2 = L.Dense(1, activation="linear")
tab_flow3 = L.Dense(1, activation="sigmoid")
tab_flow4 = L.Dense(1, activation="sigmoid")

# the model output is the sum of all four branches
tab_result = (
    tab_flow1(tab_start)
    + tab_flow2(tab_start)
    + tab_flow3(tab_start)
    + tab_flow4(tab_start)
)

model_tab = Model(inputs=tab_start, outputs=tab_result, name='Tabular')


In [None]:
# tab2.
# An attempt to avoid overfitting with wider layers. It failed.
tab_start = tf.keras.Input(shape=x_train.shape[1])

# deep branch, 512 units per hidden layer
tab_flow1 = Sequential([
    L.Dense(512, activation="sigmoid"),
    L.BatchNormalization(axis=1),
    L.Dropout(0.5),
    L.Dense(512, activation="relu"),
    L.BatchNormalization(axis=1),
    L.Dropout(0.5),
    L.Dense(512, activation="relu"),
    L.BatchNormalization(axis=1),
    L.Dropout(0.25),
    L.Dense(1, activation="sigmoid"),
])

# shortcut branches applied to the whole input
tab_flow2 = L.Dense(1, activation="linear")
tab_flow3 = L.Dense(1, activation="sigmoid")
tab_flow4 = L.Dense(1, activation="sigmoid")

# sum of all four branches
tab_result = (
    tab_flow1(tab_start)
    + tab_flow2(tab_start)
    + tab_flow3(tab_start)
    + tab_flow4(tab_start)
)

model_tab = Model(inputs=tab_start, outputs=tab_result, name='Tabular')


# Real feature forwarding

In [None]:
# column overview, used to look up the positions of the features to forward
data.drop(columns=['price', 'description']).info()

In [None]:
# positions of the columns that were meant to be forwarded
ind_best = 9  # vector 'name'
ind_1 = 6     # vector 'mileage'
ind_2 = 11    # NOTE(review): original comment is garbled ('vcnor'); column not identifiable from here

# tab1
# forwarding exact features

tab_start = tf.keras.Input(shape=x_train.shape[1])

# main flow of nn-process
tab_flow1 = Sequential()
tab_flow1.add(L.Dense(256, activation="sigmoid"))
tab_flow1.add(L.BatchNormalization(axis=1))
tab_flow1.add(L.Dropout(0.5))
tab_flow1.add(L.Dense(256, activation="relu"))
tab_flow1.add(L.BatchNormalization(axis=1))
tab_flow1.add(L.Dropout(0.5))
tab_flow1.add(L.Dense(256, activation="relu"))
tab_flow1.add(L.BatchNormalization(axis=1))
tab_flow1.add(L.Dropout(0.25))
tab_flow1.add(L.Dense(1, activation="sigmoid"))

# main "forwarding feature"
# NOTE(review): tab_flow2 is assigned three times in a row, so only the last
# value (the Dense layer) survives. The gathered column and the Embedding layer
# are discarded, and the Dense layer below is applied to the whole input, not to
# column ind_best. ind_1 and ind_2 are only used in commented-out code.
tab_flow2 = tf.gather(tab_start, indices = [ind_best], axis=1)
tab_flow2 = L.Embedding(2,20)
tab_flow2 = L.Dense(1, activation="linear")

# two 'forwarding features with good importance' (disabled)
#tab_flow3 = tf.gather(tab_start, indices = [ind_1], axis=1)
#tab_flow3 = L.Embedding(2,20)
#tab_flow3 = L.Dense(1, activation="sigmoid")

#tab_flow4 = tf.gather(tab_start, indices = [ind_2], axis=1)
#tab_flow4 = L.Embedding(2,20)
#tab_flow4 = L.Dense(1, activation="sigmoid")

# folding all flows of data processing
tab_result = tab_flow1(tab_start) + tab_flow2(tab_start)# + tab_flow3(tab_start) + tab_flow4(tab_start)

# making model from folded flows
model_tab = Model(inputs=tab_start, outputs=tab_result, name='Tabular')

# The string below is the cell's displayed value. In English: "In the end, this
# feature-forwarding configuration did not bring a significant improvement of the metric."
"""
В итоге данная концигурация проброса признака не принесла существенных улучшений в метрике.
"""

In [None]:
# initialise the network with weights saved earlier (file under ../input)
model_tab.load_weights('../input/tab1hdf5/tab1.hdf5')

In [None]:
# print the layers of the tabular model
model_tab.summary()

In [None]:
# Compile model: MAPE is both the loss and the reported metric
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model_tab.compile(
    optimizer=optimizer,
    loss='MAPE',
    metrics=['MAPE'],
)

In [None]:
# Keep only the weights with the lowest validation MAPE on disk. `monitor` has
# to be a metric name (a string), and without save_best_only=True the file is
# rewritten after every epoch, so 'best_model' would hold the last epoch instead.
checkpoint = ModelCheckpoint('../working/best_model.hdf5', monitor='val_MAPE', save_best_only=True, verbose=0, mode='min')
# stop after 100 epochs without improvement and roll back to the best weights
earlystop = EarlyStopping(monitor='val_MAPE', mode='min', patience=100, restore_best_weights=True,)
callbacks_list = [checkpoint, earlystop]

In [None]:
# epochs is only an upper bound: in practice the EarlyStopping callback ends the run
history_tab = model_tab.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=1000,
    batch_size=128,
    callbacks=callbacks_list,
    verbose=0,
)

In [None]:
# learning curves of the tabular model
plt.title('Loss')
plt.plot(history_tab.history['MAPE'], label='train')
plt.plot(history_tab.history['val_MAPE'], label='test')
plt.xlabel('epoch')
plt.ylabel('MAPE, %')
plt.legend()  # the line labels are not drawn without an explicit legend
plt.show();

In [None]:
# reload the checkpoint file written during training, then export the whole model
model_tab.load_weights('../working/best_model.hdf5')
model_tab.save('../working/tab1_ff2.hdf5')

In [None]:
# predictions for the validation part; column 0 is the single network output
tab_prediction = model_tab.predict(x_test)

print(f"TEST mape: {(mape(y_test, tab_prediction[:,0])):0.2f}%")
# 12.47

In [None]:
# the network was trained on price/1000000, so predictions are scaled back to price units
tab_predict = model_tab.predict(q)*1000000
test['price'] = tab_predict[:,0]

In [None]:
# submission file: listing id plus predicted price
nn1_prediction = test.loc[:, ['sell_id', 'price']]
nn1_prediction.to_csv('tab_sub2.csv', index=False)
nn1_prediction
# 12.45 best on leaderboard

I see several problems that I did not solve:
* overfitting
* noisy (not clean) data

To avoid overfitting I used batch normalization and dropout, and chose a bigger batch size. But after several attempts the MAPE on the training data kept getting better while the validation MAPE did not.


## Tabular & NLP
* double-input neural network
* cleaned text

In [None]:
!pip install pymorphy2
import pymorphy2

In [None]:
# clear text from English words, numbers and symbols and normalize the form of each word
morph = pymorphy2.MorphAnalyzer()

# raw literal: '\]' and '\-' are invalid escape sequences in a plain string
patterns = r"[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"

# word -> normal form; the same words repeat across descriptions, so each is analysed once
_lemma_cache = {}

def lemmatize(doc):
    """
    Clean a text and replace every word with its normal form.

    Runs of Latin letters, digits and the punctuation listed in ``patterns`` are
    replaced with a space, the rest is split on whitespace and each token is
    replaced by the first normal form returned by the morphological analyzer.

    :param doc: text to process
    :return: the normalized tokens joined with single spaces
    """
    doc = re.sub(patterns, ' ', doc)          # data clearance
    tokens = []
    for token in doc.split():                 # split() already strips surrounding whitespace
        lemma = _lemma_cache.get(token)
        if lemma is None:
            lemma = _lemma_cache[token] = morph.normal_forms(token)[0]  # normalize form of word
        tokens.append(lemma)
    return ' '.join(tokens)

In [None]:
# reload and re-engineer both frames, then stack train and test so the text is cleaned once
cooking()
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way (original row labels are kept, columns missing in one frame become NaN)
data_lem = pd.concat([data, test])

In [None]:
# replace every description with its cleaned, lemmatized version
data_lem['description'] = data_lem.description.apply(lemmatize)

In [None]:
# separate into data and test again after the clearance: test rows are those without a price
# NOTE: data_lem is overwritten here, so re-running this cell alone yields an empty test_lem
is_test_row = data_lem[TARGET].isna()
# drop() returns a new frame, which avoids the in-place drop on a slice of data_lem
test_lem = data_lem[is_test_row].drop(columns=[TARGET])
data_lem = data_lem[data_lem[TARGET] > 0]

In [None]:
list_to_drop_nn = ['engineDisplacementValue','modelDate','modelDateMM','Руль','enginePowerValue']

data = data_lem.drop(columns=list_to_drop_nn) #lem
test = test_lem.drop(columns=list_to_drop_nn) #lem


# The NLP model and the tabular model need the same train/validation rows, so
# the split is made once, while 'description' is still part of the frame.
x = data.drop(columns=TARGET)
y = data[TARGET]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, shuffle=True, random_state=SEED)

# texts for the NLP branch
text_train, text_test, text_sub = x_train.description, x_test.description, test.description

# tabular part: everything except the free text
x = x.drop(columns='description')
q = test.drop(columns='description')
x_train = x_train.drop(columns='description')
x_test = x_test.drop(columns='description')

# encoding and standard scaling are fitted on the whole labelled set
encoder = ce.TargetEncoder().fit(x, y)
x = encoder.transform(x)
scaler = StandardScaler().fit(x)

x_train = scaler.transform(encoder.transform(x_train))
x_test = scaler.transform(encoder.transform(x_test))
q = scaler.transform(encoder.transform(q))
x = scaler.transform(x)

In [None]:
# TOKENIZER
MAX_WORDS = 100000          # Maximum number of (most frequent) words kept by the tokenizer.
MAX_SEQUENCE_LENGTH = 256   # Number of tokens every description is padded / truncated to.

In [None]:
%%time
tokenize = Tokenizer(num_words=MAX_WORDS)
tokenize.fit_on_texts(data.description)
tokenize.word_index

In [None]:
%%time
text_train_sequences = sequence.pad_sequences(tokenize.texts_to_sequences(text_train), maxlen=MAX_SEQUENCE_LENGTH)
text_test_sequences = sequence.pad_sequences(tokenize.texts_to_sequences(text_test), maxlen=MAX_SEQUENCE_LENGTH)
text_sub_sequences = sequence.pad_sequences(tokenize.texts_to_sequences(text_sub), maxlen=MAX_SEQUENCE_LENGTH)

print(text_train_sequences.shape, text_test_sequences.shape, text_sub_sequences.shape, )

In [None]:
#model_tab = Sequential()
#model_tab.add(L.Dense(2048, input_dim=x_train.shape[1], activation="sigmoid"))
#model_tab.add(L.Dropout(0.5))
#model_tab.add(L.Dense(1024, activation="relu"))
#model_tab.add(L.Dropout(0.5))
#model_tab.add(L.Dense(256, activation="relu"))
#odel_tab.add(L.Dropout(0.25))
#model_tab.add(L.Dense(1, activation="linear"))
#12.63

In [None]:
#model_tab = Sequential()
#model_tab.add(L.Dense(2048, input_dim=x_train.shape[1], activation="relu"))
#model_tab.add(L.Dropout(0.5))
#model_tab.add(L.Dense(256, activation="relu"))
#model_tab.add(L.Dropout(0.5))
#odel_tab.add(L.Dense(2048, activation="relu")) 
#model_tab.add(L.Dropout(0.25))
#odel_tab.add(L.Dense(256, activation="relu"))  
#odel_tab.add(L.Dropout(0.25))
#model_tab.add(L.Dense(1024, activation="relu"))  
#model_tab.add(L.Dropout(0.25))
#model_tab.add(L.Dense(256, activation="relu"))  
#model_tab.add(L.Dropout(0.25))
#model_tab.add(L.Dense(1, activation="linear"))
# 12.47

In [None]:
# tab1, without top: the deep branch keeps its 256-unit output
tab_start = tf.keras.Input(shape=x_train.shape[1])

tab_flow1 = Sequential([
    L.Dense(256, activation="sigmoid"),
    L.BatchNormalization(axis=1),
    L.Dropout(0.5),
    L.Dense(256, activation="relu"),
    L.BatchNormalization(axis=1),
    L.Dropout(0.5),
    L.Dense(256, activation="relu"),
    L.BatchNormalization(axis=1),
    L.Dropout(0.25),
    #L.Dense(1, activation="sigmoid"),
])

# single-unit shortcut branches applied to the whole input
tab_flow2 = L.Dense(1, activation="linear")
tab_flow3 = L.Dense(1, activation="sigmoid")
tab_flow4 = L.Dense(1, activation="sigmoid")

# sum of all four branches
tab_result = (
    tab_flow1(tab_start)
    + tab_flow2(tab_start)
    + tab_flow3(tab_start)
    + tab_flow4(tab_start)
)

model_tab = Model(inputs=tab_start, outputs=tab_result, name='Tabular')


In [None]:
# text branch: embedding followed by two LSTM layers, 64 output units
model_nlp = Sequential([
    L.Input(shape=MAX_SEQUENCE_LENGTH, name="seq_description"),
    L.Embedding(len(tokenize.word_index)+1, MAX_SEQUENCE_LENGTH,),
    L.LSTM(256, return_sequences=True),
    L.Dropout(0.5),
    L.Dense(128, activation="sigmoid"),
    L.Dropout(0.5),
    L.LSTM(128,),
    L.Dropout(0.25),
    L.Dense(64, activation="relu"),
    L.Dropout(0.25),
])

In [None]:
# join the text branch and the tabular branch, then narrow down to one output
doubleInput = L.Concatenate()([model_nlp.output, model_tab.output])
funnel = L.Dense(64, activation="relu")(doubleInput)
funnel = L.Dense(1, activation="linear")(funnel)

model = Model(
    inputs=[model_nlp.input, model_tab.input],
    outputs=funnel,
)

In [None]:
# print the layers of the combined (text + tabular) model
model.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(0.01)
model.compile(loss='MAPE',optimizer=optimizer, metrics=['MAPE'])
# Keep only the weights with the lowest validation MAPE on disk. `monitor` has
# to be a metric name (a string), and without save_best_only=True the file is
# rewritten after every epoch, so 'best_model' would hold the last epoch instead.
checkpoint = ModelCheckpoint('../working/best_model.hdf5', monitor='val_MAPE', save_best_only=True, verbose=0, mode='min')
# stop after 90 epochs without improvement and roll back to the best weights
earlystop = EarlyStopping(monitor='val_MAPE', mode='min', patience=90, restore_best_weights=True,)
callbacks_list = [checkpoint, earlystop]

In [None]:
#model.load_weights('../input/nn-tab-nlp/nn_mlp_nlp (1).hdf5')

In [None]:
# epochs is only an upper bound: in practice training runs until EarlyStopping stops it
history_multy = model.fit(
    x=[text_train_sequences, x_train],
    y=y_train,
    validation_data=([text_test_sequences, x_test], y_test),
    epochs=500,
    batch_size=256,
    callbacks=callbacks_list,
)

In [None]:
# learning curves of the combined model
plt.title('Loss')
plt.plot(history_multy.history['MAPE'], label='train')
plt.plot(history_multy.history['val_MAPE'], label='test')
plt.xlabel('epoch')
plt.ylabel('MAPE, %')
plt.legend()  # the line labels are not drawn without an explicit legend
plt.show()

In [None]:
# reload the checkpoint file written during training, then export the whole model
model.load_weights('../working/best_model.hdf5')
model.save('../working/nn2_nlp_tab.hdf5')

In [None]:
#model.load_weights('../input/nn-tab-nlp/nn_mlp_nlp (1).hdf5')

In [None]:
# validation MAPE of the combined model; column 0 is the single network output
test_predict_nn2 = model.predict([text_test_sequences, x_test])
print(f"TEST mape: {(mape(y_test, test_predict_nn2[:,0])):0.2f}%")

In [None]:
# the target was not rescaled for this model, so the output is used as the price directly
sub_predict_nn2 = model.predict([text_sub_sequences, q])
test['price'] = sub_predict_nn2[:,0]

# submission file: listing id plus predicted price
nn2_prediction = test.loc[:, ['sell_id', 'price']]
nn2_prediction.to_csv('nn2_sub.csv', index=False)
nn2_prediction
#13.99 (first attempt)
#13.88 (list to drop)
#12.12 (Tabular2 + NLP)

# Resume

The double-input network improves MAPE only a little, and it is still worse than the CatBoost ML model. 

The only way to do better is to work more on the text in the description vector. 