In [13]:
import pandas as pd
import scrapy
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
train = pd.read_csv('data/input.csv')
validation = pd.read_csv('data/output.csv')

# good article on
# https://medium.com/@mikeyo4800/how-to-build-a-multi-label-text-classification-model-using-nlp-and-machine-learning-2e05f72aad5f

facility_enum = open('data/facility_enum.txt', 'r')
Lines = facility_enum.readlines()
facility_enum.close()

# provide new dataframe with edited tags
filled_train = []
# find name according to code number
code_name_enumeration = {}

for line in Lines:
    code_split_name = list(line.split('\t'))
    code_split_name[1] = code_split_name[1][:-1]
    code_name_enumeration[code_split_name[0]] = code_split_name[1]


In [4]:
for index, row in validation.iterrows():
    validation.at[index, 'facility_code'] = code_name_enumeration[row['facility_code']].lower().strip()
validation.rename(columns={'facility_code': 'facility'})

for index, row in train.iterrows():
    hotel_id = row.hotel_id
    text = row.content
    if type(text) is not str:
        # print(f'There is not any provided content for {hotel_id} Hotel')
        continue
    items = scrapy.Selector(text=text).css('.hotel-description-content::text').extract()
    # location = items[0]
    # facilities = items[1:]
    facilities = items
    merged_facilities = ''
    for facility in facilities:
        merged_facilities += facility + ' '

    merged_facilities.lower().strip()
    prepare_new_row = [hotel_id, merged_facilities]
    filled_train.append(prepare_new_row)

filled_train = pd.DataFrame(filled_train, columns=['hotel_id', 'facilities'])
# print(validation)
# print(filled_train)

facility_codes = validation['facility_code'].tolist()
unique_facility_codes = set(facility_codes)
hotel_ids = filled_train['hotel_id'].tolist()
unique_hotel_ids = set(hotel_ids)



In [5]:
features_of_hotel = []

for hotel_id in unique_hotel_ids:
    hotel = validation[validation.hotel_id == hotel_id]
    features = hotel['facility_code'].tolist()
    features_of_hotel.append([hotel_id, features])

features_df = pd.DataFrame(features_of_hotel, columns=['hotel_id', 'features'])
# print(features_df)
# print(filled_train)

number_of_occurrence = {}
for index, row in features_df.iterrows():
    features_line = row['features']
    for feature in features_line:
        if number_of_occurrence.get(feature):
            number_of_occurrence[feature] += 1
        else:
            number_of_occurrence[feature] = 1

# Creating histogram
# fig, ax = plt.subplots(figsize=(10, 7))
# ax.hist(number_of_occurrence.values())
# plt.show()
# print(number_of_occurrence)

number_of_occurrence_filtered = {}
for key, value in number_of_occurrence.items():
    if value > 10:
        number_of_occurrence_filtered[key] = value


In [6]:

# print(len(number_of_occurrence))
# print(len(number_of_occurrence_filtered))
# result => from 231 to 144 features
selected_features = number_of_occurrence_filtered.keys()
for index, row in features_df.iterrows():
    new_features = []
    for feature in row['features']:
        if feature in selected_features:
            new_features.append(feature)
    features_df.at[index, 'features'] = new_features

preprocessed_dataframe = pd.merge(filled_train['facilities'], features_df['features'], left_index=True,
                                  right_index=True)
X_train, X_test, y_train, y_test = train_test_split(preprocessed_dataframe['facilities'],
                                                    preprocessed_dataframe['features'], test_size=0.2, random_state=42)


In [7]:
preprocessed_dataframe

Unnamed: 0,facilities,features
0,This hotel is located in Păulestii Noi. The fr...,"[category (official), category (recommended), ..."
1,This hotel is located in Guarene. The hotel ha...,"[category (official), number of floors (main b..."
2,This hotel is located right in the heart of Mi...,"[category (official), total number of rooms, v..."
3,This hotel warmly welcomes guests in Ermington...,"[category (official), total number of rooms, a..."
4,This accommodation is located in Budapest. Thi...,"[category (official), category (recommended), ..."
...,...,...
973,This hotel warmly welcomes guests in Rye. For ...,"[category (official), category (recommended), ..."
974,This complex warmly welcomes guests in Jakarta...,"[category (official), category (recommended), ..."
975,"This hotel is located in Abu Dhabi, right on t...","[category (official), year of construction, nu..."
976,Set exclusively on a private peninsula and sta...,"[category (official), year of construction, nu..."


In [12]:
# Step 2: Preprocess the Text Data

# (Perform necessary preprocessing steps on the 'text' column of the data)

# Step 3: Tokenize and Pad the Text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_dataframe['facilities'])
sequences = tokenizer.texts_to_sequences(preprocessed_dataframe['facilities'])
vocab_size = len(tokenizer.word_index) + 1
max_len = 100  # Maximum sequence length
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Step 4: Split the Dataset
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, preprocessed_dataframe['features'], test_size=0.2, random_state=42)

# Step 5: Build the CNN Model
embedding_dim = 100  # Dimensionality of the word embeddings
filters = 128  # Number of filters in the convolutional layer
kernel_size = 5  # Size of the filters
hidden_dims = 64  # Number of neurons in the dense layer

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Step 6: Train the Model
early_stopping = EarlyStopping(patience=3, monitor='val_loss', restore_best_weights=True)
model.fit(np.array(X_train), np.array(y_train), validation_split=0.2, epochs=10, batch_size=64, callbacks=[early_stopping])

# Step 7: Evaluate the Model
loss, accuracy = model.evaluate(X_test, np.array(y_test))
print(f"Loss: {loss}, Accuracy: {accuracy}")

# Step 8: Predict Tags
new_text = ["New text example"]  # New, unseen text data
new_sequences = tokenizer.texts_to_sequences(new_text)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_len, padding='post')
predicted_tags = model.predict_classes(new_padded_sequences)
print(f"Predicted tags: {predicted_tags}")

# Step 9: Iterate and Improve
# (Perform further iterations and improvements as necessary)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 100)          230400    
                                                                 
 conv1d_3 (Conv1D)           (None, 96, 128)           64128     
                                                                 
 global_max_pooling1d_3 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_6 (Dense)             (None, 64)                8256      
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                                 
Total params: 302,849
Trainable params: 302,849
Non-trainable params: 0
________________________________________________

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).