In [38]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras

In [39]:
df = pd.read_csv('../Datasets/Country_city.csv')
df

Unnamed: 0,country,city
0,USA,New York
1,USA,Los Angeles
2,USA,Chicago
3,Canada,Toronto
4,Canada,Vancouver
...,...,...
113,Greece,Patras
114,Czech Republic,Ostrava
115,Hungary,Szeged
116,Austria,Innsbruck


In [40]:
# Assuming df is your original DataFrame with columns "country" and "city"
sequences = df.groupby('country')['city'].apply(list).reset_index(name='city_sequence')
sequences

Unnamed: 0,country,city_sequence
0,Argentina,"[Buenos Aires, Cordoba, Mendoza]"
1,Australia,"[Sydney, Melbourne, Brisbane, Perth]"
2,Austria,"[Vienna, Salzburg, Innsbruck]"
3,Belgium,"[Brussels, Antwerp, Ghent]"
4,Brazil,"[Rio de Janeiro, Sao Paulo, Brasilia, Fortaleza]"
5,Canada,"[Toronto, Vancouver, Montreal, Calgary]"
6,China,"[Beijing, Shanghai, Guangzhou, Hong Kong]"
7,Czech Republic,"[Prague, Brno, Ostrava]"
8,Denmark,"[Copenhagen, Aarhus, Odense]"
9,Egypt,"[Cairo, Alexandria, Luxor]"


In [41]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Assuming df is your original DataFrame with columns "country" and "city"
sequences = df.groupby('country')['city'].apply(list).reset_index(name='city_sequence')

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences['city_sequence'])
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for sequence in sequences['city_sequence']:
    # Tokenize the city names
    city_tokens = tokenizer.texts_to_sequences([sequence])[0]
    
    for i in range(1, len(city_tokens)):
        n_gram_sequence = city_tokens[:i + 1]
        input_sequences.append(n_gram_sequence)

max_sequence_length = max(len(seq) for seq in input_sequences)
padded_input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

X, y = padded_input_sequences[:, :-1], padded_input_sequences[:, -1]
y = keras.utils.to_categorical(y, num_classes=total_words)


In [42]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(total_words, 50, input_length=max_sequence_length - 1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=50, verbose=2)


Epoch 1/50
3/3 - 1s - loss: 4.7814 - accuracy: 0.0000e+00 - 970ms/epoch - 323ms/step
Epoch 2/50
3/3 - 0s - loss: 4.7750 - accuracy: 0.0000e+00 - 82ms/epoch - 27ms/step
Epoch 3/50
3/3 - 0s - loss: 4.7700 - accuracy: 0.0244 - 76ms/epoch - 25ms/step
Epoch 4/50
3/3 - 0s - loss: 4.7649 - accuracy: 0.0366 - 54ms/epoch - 18ms/step
Epoch 5/50
3/3 - 0s - loss: 4.7594 - accuracy: 0.0610 - 63ms/epoch - 21ms/step
Epoch 6/50
3/3 - 0s - loss: 4.7529 - accuracy: 0.0854 - 66ms/epoch - 22ms/step
Epoch 7/50
3/3 - 0s - loss: 4.7454 - accuracy: 0.0732 - 51ms/epoch - 17ms/step
Epoch 8/50
3/3 - 0s - loss: 4.7362 - accuracy: 0.0610 - 44ms/epoch - 15ms/step
Epoch 9/50
3/3 - 0s - loss: 4.7243 - accuracy: 0.0366 - 53ms/epoch - 18ms/step
Epoch 10/50
3/3 - 0s - loss: 4.7082 - accuracy: 0.0366 - 53ms/epoch - 18ms/step
Epoch 11/50
3/3 - 0s - loss: 4.6855 - accuracy: 0.0366 - 40ms/epoch - 13ms/step
Epoch 12/50
3/3 - 0s - loss: 4.6523 - accuracy: 0.0366 - 41ms/epoch - 14ms/step
Epoch 13/50
3/3 - 0s - loss: 4.6068 - a

<keras.src.callbacks.History at 0x295b49120>

In [43]:
def generate_city_sequence(model, tokenizer, seed_text, max_sequence_length, num_words_to_generate):
    result = seed_text
    for _ in range(num_words_to_generate):
        encoded = tokenizer.texts_to_sequences([result])[0]
        encoded = pad_sequences([encoded], maxlen=max_sequence_length - 1, padding='pre')
        predicted_word_index = np.argmax(model.predict(encoded), axis=-1)
        predicted_word = tokenizer.index_word[predicted_word_index[0]]
        result += ' ' + predicted_word
    return result

# Example usage
seed_country = 'Country_X'
generated_sequence = generate_city_sequence(model, tokenizer, seed_country, max_sequence_length, num_words_to_generate=5)




In [44]:
generated_sequence

'Country_X tampere tampere birmingham edinburgh edinburgh'

In [45]:
augmented_df = pd.DataFrame(columns=['country', 'city'])

for country in sequences['country']:
    generated_sequence = generate_city_sequence(model, tokenizer, country, max_sequence_length, num_words_to_generate=5)
    generated_cities = generated_sequence.split()[1:]  # Exclude the initial seed
    augmented_df = pd.concat([augmented_df, pd.DataFrame({'country': [country] * len(generated_cities), 'city': generated_cities})])

# Print the augmented DataFrame
print(augmented_df)


      country        city
0   Argentina     tampere
1   Argentina     tampere
2   Argentina  birmingham
3   Argentina   edinburgh
4   Argentina   edinburgh
..        ...         ...
0         USA     tampere
1         USA     tampere
2         USA  birmingham
3         USA   edinburgh
4         USA   edinburgh

[184 rows x 2 columns]


In [46]:
augmented_df

Unnamed: 0,country,city
0,Argentina,tampere
1,Argentina,tampere
2,Argentina,birmingham
3,Argentina,edinburgh
4,Argentina,edinburgh
...,...,...
0,USA,tampere
1,USA,tampere
2,USA,birmingham
3,USA,edinburgh
