# Imports

In [None]:
#imports

import os 
import math
import numpy as np
import pandas as pd 
import tensorflow as tf #to define our own metrics

#Keras Word processing
from keras.models import Model, load_model
from keras.layers import Embedding, Dense, Input, Flatten
from keras.layers import Activation, Concatenate
from keras.losses import mse, cosine_proximity #Loss functions
from keras.optimizers import Adam 
from keras.layers import Dropout

from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.utils import to_categorical

#Sklearn to label words
from sklearn.preprocessing import LabelEncoder 

# Matplotlib to plot price and loss
import matplotlib.pyplot as plt

#For text preprocessing 
import re
import string
from collections import Counter 
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

#data visualization
import matplotlib.cm as cm
import missingno as msno
from matplotlib import rcParams
from wordcloud import WordCloud

#Ignore the system warnings
import warnings
warnings.filterwarnings("ignore")

# Loading the dataset

In [None]:
data = pd.read_csv('original.csv Kopie 2.xls')

# Data analysis

# Five top records 

In [None]:
data=data.drop(columns='Unnamed: 0')
data.head()

# Five last records

In [None]:
data.tail()

# Columns/features

In [None]:
data.columns

# Length of data

In [None]:
print('lenght of data is', len(data))

# Shape of data

In [None]:
data.shape

# Data information

In [None]:
data.info()

# Data types of all columns

In [None]:
data.dtypes

# Looking at Missing values

In [None]:
# Plot the missing-value matrix on a shuffled copy of the rows.
# Sampling is done without replacement, so asking for more rows than the
# frame holds raises a ValueError; cap the sample size at len(data).
msno.matrix(data.sample(min(len(data), 150930)))

# Checking Null Values

In [None]:
data[data.isnull().any(axis=1)].head()

# Count of columns containing null values

In [None]:
# any(axis=0) reduces over the rows and yields one flag per column, so the
# sum is the number of columns that contain at least one missing value.
data.isnull().any(axis=0).sum()

# Count of all values in Label

In [None]:
y=data['price']
y.value_counts()

# Data Description

In [None]:
data.describe()

# Top 10 prices 

In [None]:
data["price"].value_counts().head(10).plot(kind = 'pie', autopct='%1.1f%%', figsize=(8, 8)).legend()

# Numeric features distribution 

In [None]:
data.hist(figsize=(20,20),bins = 20, color="#107009AA")
plt.title("Price")
plt.show()

# Data Preparation

# Preparation of description

In [None]:
data['description']=data['description'].str.lower()

In [None]:
data['description'].head()

# Preprocess data on Description

In [None]:
# ASCII punctuation characters that are stripped from the descriptions.
english_punctuations = string.punctuation
# One copy of the set is enough: repeating characters in the deletion list
# does not change what str.translate removes.
punctuations_list = english_punctuations
# Deletion table built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)


def remove_punctuations(text):
    """Return ``text`` with every character of ``punctuations_list`` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation; all other characters
        (including whitespace) are kept in their original order.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Remove_repeating_char

In [None]:
def remove_repeating_char(text):
    """Collapse each run of one repeated character to a single occurrence.

    Any character matched by ``.`` counts, so ordinary double letters are
    shortened as well ("hello" becomes "helo"). Newlines are not matched by
    ``.`` and are therefore left alone.

    Args:
        text: The string to process.

    Returns:
        The string with consecutive duplicate characters squeezed.
    """
    run_of_same_char = re.compile(r'(.)\1+')
    squeezed = run_of_same_char.sub(r'\1', text)
    return squeezed

# ProcessPost for applying all functions

In [None]:
def processPost(text):
    """Clean one description string before tokenisation.

    The steps run in this order:

    1. ``@mention`` tokens are replaced by a space.
    2. URLs starting with ``www.``, ``http://`` or ``https://`` are
       replaced by a space.
    3. The leading ``#`` of a hashtag is dropped, keeping the tag text.
    4. Punctuation is removed via ``remove_punctuations``.
    5. Runs of a repeated character are squeezed via
       ``remove_repeating_char``.

    Args:
        text: The raw description string.

    Returns:
        The cleaned string.
    """
    # All patterns are raw strings so that \s and \. reach the regex engine
    # as written instead of being treated as invalid string escapes.
    text = re.sub(r'@[^\s]+', ' ', text)

    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)

    text = re.sub(r'#([^\s]+)', r'\1', text)

    text = remove_punctuations(text)
    text = remove_repeating_char(text)

    return text

# Applying processPost function for preprocessing

In [None]:
# Clean every description in place; processPost is passed directly as the
# per-element callable.
data["description"] = data["description"].apply(processPost)

# Getting Tokenization

In [None]:
tokenizer = RegexpTokenizer(r'\w+')
data["description"] = data["description"].apply(tokenizer.tokenize)

In [None]:
data["description"].head()

# Stop words 

In [None]:
stopwords_list = stopwords.words('english')

In [None]:
stopwords_list

In [None]:
len(stopwords_list)

In [None]:
# Drop stop words from every token list. The membership test runs once per
# token, so look words up in a set instead of scanning the list each time.
stopword_set = set(stopwords_list)
data["description"] = data["description"].apply(
    lambda tokens: [token for token in tokens if token not in stopword_set]
)

In [None]:
data["description"].head()

# Analysis of description

In [None]:
# Flatten the per-row token lists into one list holding every word occurrence.
all_words = []
for tokens in data["description"]:
    all_words.extend(tokens)

# Distinct words, in sorted order.
VOCAB = sorted(set(all_words))

print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))


# Top 25 words in description text

In [None]:
counter = Counter(all_words)

In [None]:
counter.most_common(25)

In [None]:
counted_words = Counter(all_words)

# Split the 25 most frequent (word, count) pairs into two parallel lists
# that the bar chart consumes.
top_pairs = counted_words.most_common(25)
words = [word for word, _ in top_pairs]
counts = [count for _, count in top_pairs]

In [None]:
colors = cm.rainbow(np.linspace(0, 1, 10))
rcParams['figure.figsize'] = 20, 10

plt.title('Top words in description Text')
plt.xlabel('Count')
plt.ylabel('Words')
plt.barh(words, counts, color=colors)

# Hyperparameter

In [None]:
#static variables 
learning_rate= 1e-3
batch_size= 128
dataset_file_name = "original.csv Kopie 2.xls"
vocab_size= 10000 #Specifying the words in the network
max_seq_length = 170 #Limit the length of the evaluation text/number of characters. Uniform input sequence 
epochs=500

# Preparation of the data for training with description and price 

In [None]:
# Function for data preparation
def prepare_dataset():
    """Load the dataset file and build train/validation/test splits.

    Reads ``dataset_file_name``, keeps the ``description`` and ``price``
    columns, drops incomplete rows and splits the remaining rows in file
    order: the first 70% for training, the next 15% for validation and the
    remainder for testing. Descriptions are turned into integer sequences by
    a word-level tokenizer fitted on the training descriptions only, then
    padded at the end to ``max_seq_length``.

    Returns:
        A tuple ``(train, validation, test)``. Each element is itself a tuple
        ``(descriptions, padded_sequences, prices)`` holding the raw
        description Series, the array produced by ``pad_sequences`` and the
        price Series for that split.
    """
    frame = pd.read_csv(dataset_file_name)
    frame = frame[['description', 'price']]

    # NOTE: rows are not shuffled, so the splits follow the file order.

    # Rows without a price cannot be used as labels. Rows without a
    # description hold NaN (a float) instead of text, which the text
    # tokenizer below cannot process, so they are dropped as well.
    frame = frame.dropna(subset=['description', 'price'])

    # Split boundaries: 70% train, 15% validation, the rest is the test set.
    train_end = int(len(frame) * 0.7)
    val_end = train_end + int(len(frame) * 0.15)

    descriptions = frame['description']
    prices = frame['price']

    x_train_description = descriptions[:train_end]
    y_train_price = prices[:train_end]

    x_val_description = descriptions[train_end:val_end]
    y_val_price = prices[train_end:val_end]

    x_test_description = descriptions[val_end:]
    y_test_price = prices[val_end:]

    # The vocabulary is learned from the training split only.
    tokenize = Tokenizer(num_words=vocab_size, char_level=False)
    tokenize.fit_on_texts(x_train_description)

    def encode(texts):
        # Word-index sequences, padded at the end to a uniform length.
        sequences = tokenize.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_seq_length, padding="post")

    x_train_embed = encode(x_train_description)
    x_val_embed = encode(x_val_description)
    x_test_embed = encode(x_test_description)

    # Report the shapes of the three splits.
    print("Train size: ", x_train_description.shape, y_train_price.shape)
    print("Validation size: ", x_val_description.shape, y_val_price.shape)
    print("Test size: ", x_test_description.shape, y_test_price.shape)

    return (
        (x_train_description, x_train_embed, y_train_price),
        (x_val_description, x_val_embed, y_val_price),
        (x_test_description, x_test_embed, y_test_price),
    )
    

# Model - Training & Test

# Get_predictions for getting the predictions of price

In [None]:
#Load Model
def get_predictions():
    """Load the saved model, predict the test-set prices and print them.

    For every test row the description, the predicted price and the actual
    price are printed, followed by the summed absolute difference between
    predictions and labels over the whole test set.
    """
    model = load_model('wine_ann_model.npz', custom_objects={'price_difference': price_difference})

    # Only the test split is needed here.
    (_, _, (x_test_description, x_test_embed, y_test_price)) = prepare_dataset()

    # The network defined in build_deep_model() has one input, named
    # 'input_description_pad'. Feed it the padded test sequences only: the
    # raw training descriptions are neither a model input nor part of the
    # test split, and their row count differs from the test set's.
    predictions = model.predict({
        'input_description_pad': x_test_embed
    })

    print(predictions)

    diff = 0
    for description, actual_price, prediction in zip(
            x_test_description, y_test_price, predictions):
        # Each prediction row holds a single value: the price.
        price_predicted = prediction[0]

        diff += np.abs(price_predicted - actual_price)

        print(description)
        print("Predicted Price: ", price_predicted, "; Actual Price: ", actual_price)

    print("Allgemeine differenz zwischen prediction und laben des Testset ist: ", diff)

# Price_difference function 

In [None]:
def price_difference(y_true, y_pred):
    """Metric: summed absolute difference between targets and predictions.

    Both tensors get a trailing axis added, are subtracted element-wise, and
    the absolute differences are summed over all elements.

    Args:
        y_true: Tensor of target prices.
        y_pred: Tensor of predicted prices.

    Returns:
        A scalar tensor with the total absolute difference.
    """
    expanded_true = tf.expand_dims(y_true, -1)
    expanded_pred = tf.expand_dims(y_pred, -1)
    absolute_gap = tf.abs(tf.subtract(expanded_true, expanded_pred))
    # reduce_sum without an axis already yields a scalar, so the mean taken
    # afterwards leaves the value unchanged.
    return tf.reduce_mean(tf.reduce_sum(absolute_gap))
    

# Deep Model

In [None]:
def build_deep_model():
    """Build the feed-forward network that maps a padded description to one value.

    Architecture: an input of ``max_seq_length`` word indices, an 8-dimensional
    embedding over ``vocab_size`` words, a flatten step, five Dense + ReLU
    stages (1024, 512, 256, 64 and 32 units), dropout of 0.5 and a final
    Dense layer with a single unit.

    Returns:
        The uncompiled Keras ``Model``; its input is named
        ``input_description_pad``.
    """
    input_layer = Input(shape=(max_seq_length,), name="input_description_pad")

    hidden = Embedding(vocab_size, 8, input_length=max_seq_length)(input_layer)
    hidden = Flatten()(hidden)

    # Stack of fully connected layers, each followed by a ReLU activation.
    for units in (1024, 512, 256, 64, 32):
        hidden = Dense(units)(hidden)
        hidden = Activation('relu')(hidden)

    hidden = Dropout(0.5)(hidden)
    out = Dense(1)(hidden)

    return Model(inputs=input_layer, outputs=out)


# train function for the training of deep model 

In [None]:
#trainer
def _plot_training_curves(training_values, validation_values, quantity, y_label):
    """Plot one per-epoch training curve against its validation counterpart."""
    # Use the recorded history length for the x-axis so the plot stays
    # valid even if fewer epochs were recorded than configured.
    epoch_axis = range(len(training_values))
    plt.plot(epoch_axis, training_values, label="training " + quantity)
    plt.plot(epoch_axis, validation_values, label="validation " + quantity)
    plt.xlabel("epochs")
    plt.ylabel(y_label)
    plt.legend()
    plt.show()


def train():
    """Train the price model on the description data and save it.

    Builds the network from ``build_deep_model`` with an extra single-unit
    ``price`` output layer, compiles it with MSE loss and the Adam optimizer,
    fits it on the training split while validating on the validation split,
    plots loss and ``price_difference`` per epoch, and saves the model to
    ``wine_ann_model.npz``.
    """
    (
        (_, x_train_embed, y_train_price),
        (_, x_val_embed, y_val_price),
        _,
    ) = prepare_dataset()

    # Model definition: the deep model followed by the named output layer.
    deep_model = build_deep_model()
    model_ = Dense(1, name="price")(deep_model.output)

    inputs = deep_model.input
    combined_model = Model(inputs=inputs, outputs=model_)

    # Compiling
    opt = Adam(lr=learning_rate)
    # NOTE(review): 'accuracy' carries little meaning for a regression
    # target; it is kept so the training log keeps its current columns.
    combined_model.compile(loss=mse, optimizer=opt, metrics=['accuracy', price_difference])
    combined_model.summary()

    # Training. The model has exactly one input, 'input_description_pad', so
    # only the padded sequences are fed. The raw description text is not a
    # model input, and the training text must never accompany the validation
    # sequences: the two have different row counts.
    history = combined_model.fit(
        x={'input_description_pad': x_train_embed},
        y={'price': y_train_price},
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
        validation_data=({'input_description_pad': x_val_embed}, y_val_price),
    )

    # Per-epoch values recorded during training.
    recorded = history.history

    # Plot the results.
    _plot_training_curves(recorded['loss'], recorded['val_loss'], "loss", "loss")
    _plot_training_curves(
        recorded['price_difference'],
        recorded['val_price_difference'],
        "price difference",
        "price",
    )

    combined_model.save('wine_ann_model.npz')

# Calling the Train function and Training the deep model

In [None]:
# Train the model. Training runs unconditionally: no check for an already
# saved model is made here.
def main():
    """Run the training pipeline by calling train()."""
    train()

if __name__ == "__main__":
    main()


# Calling Trained model, get_predictions function and getting Predictions with trained model of Deep model

In [None]:
# Predict with the saved model when it exists; otherwise only report that
# no trained model is available (nothing is trained here).
def main():
    """Print test-set predictions if the saved model file is present.

    Checks for ``wine_ann_model.npz`` in the working directory. When it is
    found, ``get_predictions`` is called; otherwise a message is printed.
    """
    if os.path.exists('wine_ann_model.npz'):
        get_predictions()
    else:
        print('There is no trained model')


if __name__ == "__main__":
    main()
