In [1]:
import pandas as pd
import numpy as np
import scrapy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [2]:
train = pd.read_csv('input.csv')
validation = pd.read_csv('output.csv')

facility_enum = open('facility_enum.txt', 'r')
Lines = facility_enum.readlines()
facility_enum.close()

# provide new dataframe with edited tags
filled_train = []
# find name according to code number
code_name_enumeration = {}

for line in Lines:
    code_split_name = list(line.split('\t'))
    code_split_name[1] = code_split_name[1][:-1]
    code_name_enumeration[code_split_name[0]] = code_split_name[1]


In [3]:
for index, row in validation.iterrows():
    validation.at[index, 'facility_code'] = code_name_enumeration[row['facility_code']].lower().strip()
validation.rename(columns={'facility_code': 'facility'})

for index, row in train.iterrows():
    hotel_id = row.hotel_id
    text = row.content
    if type(text) is not str:
        # print(f'There is not any provided content for {hotel_id} Hotel')
        continue
    items = scrapy.Selector(text=text).css('.hotel-description-content::text').extract()
    # location = items[0]
    # facilities = items[1:]
    facilities = items
    merged_facilities = ''
    for facility in facilities:
        merged_facilities += facility + ' '

    merged_facilities.lower().strip()
    prepare_new_row = [hotel_id, merged_facilities]
    filled_train.append(prepare_new_row)

filled_train = pd.DataFrame(filled_train, columns=['hotel_id', 'facilities'])
# print(validation)
# print(filled_train)

facility_codes = validation['facility_code'].tolist()
unique_facility_codes = set(facility_codes)
hotel_ids = filled_train['hotel_id'].tolist()
unique_hotel_ids = set(hotel_ids)



In [4]:
features_of_hotel = []

for hotel_id in unique_hotel_ids:
    hotel = validation[validation.hotel_id == hotel_id]
    features = hotel['facility_code'].tolist()
    features_of_hotel.append([hotel_id, features])

features_df = pd.DataFrame(features_of_hotel, columns=['hotel_id', 'features'])
# print(features_df)
# print(filled_train)

number_of_occurrence = {}
for index, row in features_df.iterrows():
    features_line = row['features']
    for feature in features_line:
        if number_of_occurrence.get(feature):
            number_of_occurrence[feature] += 1
        else:
            number_of_occurrence[feature] = 1

# Creating histogram
# fig, ax = plt.subplots(figsize=(10, 7))
# ax.hist(number_of_occurrence.values())
# plt.show()

number_of_occurrence_filtered = {}
for key, value in number_of_occurrence.items():
    if value > 600:
        number_of_occurrence_filtered[key] = value


In [5]:
print(len(number_of_occurrence))
print(len(number_of_occurrence_filtered))


231
15


In [6]:
selected_features = list(number_of_occurrence_filtered.keys())
for index, row in features_df.iterrows():
    new_features = []
    for feature in row['features']:
        if feature in selected_features:
            new_features.append(feature)
    features_df.at[index, 'features'] = new_features

preprocessed_dataframe = pd.merge(filled_train['facilities'], features_df['features'], left_index=True, right_index=True)


In [7]:
preprocessed_dataframe

Unnamed: 0,facilities,features
0,This hotel is located in Păulestii Noi. The fr...,"[category (official), restaurant(s), internet ..."
1,This hotel is located in Guarene. The hotel ha...,"[category (official), total number of rooms, v..."
2,This hotel is located right in the heart of Mi...,"[category (official), total number of rooms, v..."
3,This hotel warmly welcomes guests in Ermington...,"[category (official), total number of rooms, v..."
4,This accommodation is located in Budapest. Thi...,"[category (official), total number of rooms, v..."
...,...,...
973,This hotel warmly welcomes guests in Rye. For ...,"[category (official), wlan access, car park, l..."
974,This complex warmly welcomes guests in Jakarta...,"[category (official), restaurant(s), internet ..."
975,"This hotel is located in Abu Dhabi, right on t...","[category (official), total number of rooms, v..."
976,Set exclusively on a private peninsula and sta...,"[category (official), total number of rooms, v..."


In [8]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_features = tfidf_vectorizer.fit_transform(preprocessed_dataframe['facilities'])

tf_idf_array = tfidf_features.toarray()

words_set = tfidf_vectorizer.get_feature_names_out()

df_tf_idf = pd.DataFrame(tf_idf_array, columns = words_set)

df_tf_idf

Unnamed: 0,000,10,100,1026,104,105,106,107,108,11,...,zhangjiakou,zhangzhou,zhuhai,zhujiajiao,zinkwazi,zone,zqn,zth,zyl,ölüdeniz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
features_df = preprocessed_dataframe['features']
features_df = pd.DataFrame(features_df)
for index, row in features_df.iterrows():
    new_features = []
    for feature in row['features']:
        i = selected_features.index(feature)
        new_features.append(i)

    my_str = '1'
    for i in range(15):
        if i in new_features:
            my_str += '1'
        else:
            my_str += '0'

    features_df.at[index, 'features'] = int(my_str)
features_df

Unnamed: 0,features
0,1111111111100000
1,1100110010011110
2,1110111111111110
3,1100111110111111
4,1111111111111101
...,...
973,1100110000000011
974,1111111111100011
975,1111111111111111
976,1101111111111111


In [10]:
# Step 3: Tokenize and Pad the Text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_dataframe['facilities'])
sequences = tokenizer.texts_to_sequences(preprocessed_dataframe['facilities'])
vocab_size = len(tokenizer.word_index) + 1

max_len = 0
for sec in sequences:
    if max(sec) > max_len:
        max_len = max(sec)

padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Step 4: Split the Dataset
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, features_df['features'], test_size=0.2, random_state=42)

In [11]:
# Step 4: Split the Dataset
X_train, X_test, y_train_text, y_test_text = train_test_split(df_tf_idf, preprocessed_dataframe['features'], test_size=0.2, random_state=42)

In [12]:
# Step 5: Convert Multi-Label Tags to Binary Vectors
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train_text)
y_test = mlb.transform(y_test_text)


# Step 6: Build the CNN Model
embedding_dim = 100  # Dimensionality of the word embeddings
filters = 6  # Number of filters in the convolutional layer
kernel_size = 5  # Size of the filters
hidden_dims = 10  # Number of neurons in the dense layer

model = Sequential()
model.add(Embedding(2272, embedding_dim, input_length=2272))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims, activation='relu'))
model.add(Dense(y_train.shape[1], activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Step 7: Train the Model
early_stopping = EarlyStopping(patience=3, monitor='val_loss', restore_best_weights=True)
model.fit(X_train, y_train, validation_split=0.2, epochs=10, batch_size=64, callbacks=[early_stopping])



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2272, 100)         227200    
                                                                 
 conv1d (Conv1D)             (None, 2268, 6)           3006      
                                                                 
 global_max_pooling1d (Globa  (None, 6)                0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                70        
                                                                 
 dense_1 (Dense)             (None, 15)                165       
                                                                 
Total params: 230,441
Trainable params: 230,441
Non-trainable params: 0
__________________________________________________

<keras.callbacks.History at 0x7f03ae0e0520>

In [13]:
# Step 3: Tokenize and Pad the Text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_dataframe['facilities'])
sequences = tokenizer.texts_to_sequences(preprocessed_dataframe['facilities'])
vocab_size = len(tokenizer.word_index) + 1

# max_len = 0
# for sec in sequences:
#     if max(sec) > max_len:
#         max_len = max(sec)
max_len = 100

padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Step 4: Split the Dataset
X_train, X_test, y_train_text, y_test_text = train_test_split(padded_sequences, preprocessed_dataframe['features'], test_size=0.2, random_state=42)

In [14]:
# Step 2: Preprocess the Text Data

# (Perform necessary preprocessing steps on the 'text' column of the data)

# Step 3: Tokenize and Pad the Text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_dataframe['facilities'])
sequences = tokenizer.texts_to_sequences(preprocessed_dataframe['facilities'])
vocab_size = len(tokenizer.word_index) + 1
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Step 4: Split the Dataset
# X_train, X_test, y_train_text, y_test_text = train_test_split(padded_sequences, preprocessed_dataframe['features'], test_size=0.2, random_state=42)

# Step 5: Convert Multi-Label Tags to Binary Vectors
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train_text)
y_test = mlb.transform(y_test_text)

# Step 6: Build the CNN Model
embedding_dim = 1000  # Dimensionality of the word embeddings
filters = 128  # Number of filters in the convolutional layer
kernel_size = 5  # Size of the filters
hidden_dims = 10  # Number of neurons in the dense layer

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims, activation='relu'))
model.add(Dense(y_train.shape[1], activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Step 7: Train the Model
early_stopping = EarlyStopping(patience=3, monitor='val_loss', restore_best_weights=True)
model.fit(X_train, y_train, validation_split=0.2, epochs=1, batch_size=64, callbacks=[early_stopping])




Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 1000)         2304000   
                                                                 
 conv1d_1 (Conv1D)           (None, 96, 128)           640128    
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 10)                1290      
                                                                 
 dense_3 (Dense)             (None, 15)                165       
                                                                 
Total params: 2,945,583
Trainable params: 2,945,583
Non-trainable params: 0
____________________________________________

<keras.callbacks.History at 0x7f03abebba90>

In [15]:
# Step 8: Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss}, Accuracy: {accuracy *100}")

Loss: 0.5997318625450134, Accuracy: 56.63265585899353


In [23]:
# Step 9: Predict Tags
new_text = ["This hotel is located in Păulestii Noi. The friendly staff at the reception desk are happy to answer any questions. Amenities include a baggage storage service, a safe and a cash machine. Wireless internet access (no extra charge) allows guests to stay connected while on holiday. A lift and wheelchair-accessible facilities are available. Everyday necessities can be purchased at the supermarket. The grounds of the hotel feature a playground and a lovely garden. Additional amenities include a TV room and a library. Guests arriving by car can park their vehicles in the car park for no extra charge. Further services and facilities include a transfer service, room service, a hairdresser and a coin-operated laundry. Lectures, presentations and meetings can be held in one of the 8 conference rooms. Air conditioning and central heating ensure that rooms maintain comfortable temperatures. A balcony is included as standard in some rooms. Separate bedrooms are available. Children's beds can be requested for younger guests. A safe and a minibar are also available. Extra comforts include a tea/coffee station. Other features include a telephone, a TV and WiFi. Guests will also find slippers in their rooms. A hairdryer and bathrobes are available in the bathrooms, which are equipped with a shower and a bathtub. Wheelchair-friendly rooms can be booked. The outdoor pool complex includes a children's swimming area and is ideal for working out or just relaxing. Refreshing drinks at the pool bar and a relaxing soak in the hot tub offer the perfect way to unwind. A short break or an entire afternoon on the sun terrace, which features sun loungers and parasols, is time well spent. For guests who wish to keep active, cycling/mountain biking and tennis are available. Both short- and long-stay guests are welcome to use the amenities available at the hotel, including a gym, billiards and aerobics. Various wellness options are available at the hotel, including a spa, a sauna, a steam bath, a hammam, a beauty salon and massage treatments. Children are well looked after in the kids' club. The hotel includes a restaurant and a café. Catering options include half board, full board and all-inclusive. A generous breakfast buffet, lunch and dinner offer plenty of delicious variety.	"]  # New, unseen text data
new_sequences = tokenizer.texts_to_sequences(new_text)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_len, padding='post')
predicted_probs = model.predict(new_padded_sequences)
threshold = 0.5
predicted_tags = mlb.classes_[predicted_probs[0] >= threshold]
print('*******************')
print(f"Predicted tags: {predicted_tags}")
print('*******************')
print(f'''Real tags: category (official),restaurant(s),internet access,wlan access,
car park,bathroom,shower,hairdryer,internet access,
centrally regulated air conditioning,tv''')

*******************
Predicted tags: ['bathroom' 'car park' 'category (official)'
 'centrally regulated air conditioning' 'internet access'
 'laundry facilities' 'restaurant(s)' 'shower' 'tv' 'visa' 'wlan access']
*******************
Real tags: category (official),restaurant(s),internet access,wlan access,
car park,bathroom,shower,hairdryer,internet access,
centrally regulated air conditioning,tv
