# Import Libraries

In [None]:
# %pip install transformers
# !pip install torch==2.2.2

# %pip install country_converter

# %pip install spacy
# !python -m spacy download en_core_web_sm

# %pip install gensim

# %pip install roman
# (`re` is part of the Python standard library -- no %pip install needed)
# %pip install tqdm
# %pip install "torch-2.2.2+cu121-cp311-cp311-win_amd64.whl"
# %pip install bert-extractive-summarizer
# %pip install geocoder


In [None]:
from python import FileManager
from python import WordCleaner
from python import Indexer
from python import Matcher
from python import Evaluater
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import numpy as np
import torch
%load_ext autoreload
%autoreload 2

# Dataset Manipulation 

## Load Files

In [None]:
# Shared TF-IDF vectorizer; later cells pass it to the Indexer helpers.
vectorizer = TfidfVectorizer()
# Load the corpus from a single CSV file (presumably a dict keyed by document id -- verify in FileManager).
dataset = FileManager.csv_to_dict("wikir/documents.csv")
# `datasets` is the history of preprocessing stages: each cleaning cell reads
# datasets[-1] and appends its own result.
datasets = [dataset]

### The Ultimate Loader

In [None]:
# Same setup as the single-file loader, but the corpus is merged from the four
# shards wikir/R0.csv ... wikir/R3.csv (on duplicate keys the later shard wins).
vectorizer = TfidfVectorizer()
dataset = {}
for i in range(4):
    dataset |= FileManager.csv_to_dict(f"wikir/R{i}.csv")
datasets = [dataset]

## Remove stop words

In [None]:
# Stage: stop-word removal. Reads the newest stage and appends the cleaned copy.
dataset = datasets[-1]
filtered_dataset = {
    doc_id: WordCleaner.remove_stop_words(tokens)
    for doc_id, tokens in dataset.items()
}
datasets.append(filtered_dataset)

## Remove single letters

In [None]:
# Stage: drop single-letter tokens. Reads the newest stage and appends the result.
dataset = datasets[-1]
no_singles_dataset = {
    doc_id: WordCleaner.remove_single_letters(tokens)
    for doc_id, tokens in dataset.items()
}
datasets.append(no_singles_dataset)

## Stem

In [None]:
# Stage: stemming with the "Snowball" option of WordCleaner.stem.
dataset = datasets[-1]
stemmed_dataset = {
    doc_id: WordCleaner.stem(tokens, "Snowball")
    for doc_id, tokens in dataset.items()
}
datasets.append(stemmed_dataset)

## Lemmatize

In [None]:
# Stage: lemmatisation. Reads the newest stage and appends the result.
dataset = datasets[-1]
lemmad_dataset = {
    doc_id: WordCleaner.lemmatize(tokens)
    for doc_id, tokens in dataset.items()
}
datasets.append(lemmad_dataset)

## Synonym Map

In [None]:
from multiprocessing import Pool
from tqdm import tqdm

dataset = datasets[-1]

# One worker pool is shared by all rows: the tokens of each row are mapped to
# their unified synonym in parallel, while tqdm reports progress over the rows.
with Pool() as p:
    mapped_2 = {
        row: p.map(WordCleaner.get_unified_synonym_2, dataset[row])
        for row in tqdm(dataset)
    }
datasets.append(mapped_2)

## Calculating tf-idf for the document

### using Scikit Learn

In [None]:
# Build the corpus TF-IDF matrix from the newest preprocessing stage.
# The matching cells and the model-export cell below read `tfidf_matrix`, so it
# must be computed here for the notebook to run top-to-bottom on a fresh kernel
# (it was commented out, which made those cells raise NameError).
tfidf_matrix = Indexer.calculate_tf_idf(datasets[-1], vectorizer)
# Document ids in corpus order; passed to the Matcher together with the matrix.
dataset_keys = list(datasets[-1].keys())

# Query Manipulation 

## Manual Query

In [None]:
# Clean a hand-written query with the WordCleaner helpers before vectorizing it.
query = "wikipedia"
query = word_tokenize(query)
query = WordCleaner.remove_stop_words(query)
query = WordCleaner.remove_single_letters(query)
# Stemming is currently disabled for queries.
# query = WordCleaner.stem(query, 'Snowball')
# get_unified_synonym_2 is applied word by word, hence the comprehension.
query = [WordCleaner.get_unified_synonym_2(word) for word in query]
query = WordCleaner.lemmatize(query)
print(query)

### Calculate TF-IDF

In [None]:
matrix = Indexer.calculate_doc_tf_idf([" ".join(query)],vectorizer)

### Calculate Cosine Similarity

In [None]:
# Match the query matrix against the corpus TF-IDF matrix.
# presumably 0.35 is the minimum similarity a document needs to be returned -- TODO confirm in Matcher
similar_rows = Matcher.get_query_answers(tfidf_matrix, matrix, dataset_keys, 0.35)

# Sort the items in the dictionary by value (i.e., rating) in descending order
sorted_rows = sorted(similar_rows.items(), key=lambda item: item[1], reverse=True)

# Each printed row is a (key, rating) pair
for row in sorted_rows:
    print(row)

## Evaluation Queries

In [None]:
queries = FileManager.csv_to_dict("wikir/test/queries.csv")

In [None]:
# Clean every evaluation query in a single pass: stop words, single letters,
# per-word synonym unification, then lemmatisation.
for key in queries.keys():
    tokens = WordCleaner.remove_stop_words(queries[key])
    tokens = WordCleaner.remove_single_letters(tokens)
    tokens = [WordCleaner.get_unified_synonym_2(word) for word in tokens]
    # tokens = WordCleaner.stem(tokens, "Snowball")  # stemming currently disabled
    queries[key] = WordCleaner.lemmatize(tokens)

### Calculate TF-IDF

In [None]:
# One TF-IDF matrix per query: the cleaned tokens are re-joined into a single
# string and passed through the shared vectorizer.
queries_matrices = {
    key: Indexer.calculate_doc_tf_idf([" ".join(tokens)], vectorizer)
    for key, tokens in queries.items()
}

### Calculate Cosine Similarity

In [None]:
# Retrieve the answers for every evaluation query against the corpus matrix.
queriesAnswers = {
    key: Matcher.get_query_answers(tfidf_matrix, queries_matrices[key], dataset_keys, 0.1)
    for key in queries
}

# Evaluation

In [None]:
Evaluater.evaluate("wikir/qrels","test runs/testrun_RRM2L_01.run")

In [None]:
Evaluater.evaluate("wikir/qrels","testrun_RRL_01.run")

In [None]:
Evaluater.evaluate("wikir/qrels","test runs/testrun_RRML_01.run")

In [None]:
Evaluater.evaluate("wikir/qrels","test runs/testrun_RML_01.run")

In [None]:
Evaluater.evaluate("wikir/qrels","test runs/testrun_bert_testing.run")

In [None]:
Evaluater.evaluate("wikir/qrels","test runs/testrun_embedding_5_epoch_1_tfidf_07.run")

In [None]:
Evaluater.evaluate("wikir/qrels","test runs/testrun_doc2vec_epoch_1_07.run")

In [None]:
Evaluater.evaluate("wikir/qrels","test runs/testrun_doc2vec_epoch_1_05.run")

In [None]:
Evaluater.evaluate("wikir/qrels","test runs/testrun_doc2vec_epoch_9_065.run")

In [None]:
Evaluater.evaluate("wikir/qrels","test runs/testrun_embedding_5_epoch_15_065.run")

In [None]:
Evaluater.evaluate("wikir/qrels","test runs/testrun_embedding_5_epoch_15_06.run")

In [None]:
Evaluater.evaluate("wikir/qrels","test runs/testrun_embedding_5_epoch_22_06.run")

In [None]:
Evaluater.evaluate("wikir/test/qrels","Tbert_docs_08.run")

In [None]:
Evaluater.evaluate("wikir/qrels","bert_docs_08.run")

# Write To Files

## Dataset

In [None]:
FileManager.write_dataset_to_file("wikir/RRM2L.csv",datasets[-1])

## Run File

In [None]:
FileManager.write_runfile_to_file("test runs/testrun_RRM2L_01.run",queries,queriesAnswers)

## Model

### Write

In [None]:
FileManager.write_model_to_drive("models/Model_RRM2L",vectorizer, dataset_keys, tfidf_matrix)

### Read

In [None]:
vectorizer, svd, dataset_keys, tfidf_matrix = FileManager.load_model_from_drive("models/Model_RM_NEW_L")

# Word Embedding

In [None]:
from gensim.models import Word2Vec

dataset = datasets[-1]
# Token lists of every document (a dict view over the values, not a copied list)
tokenized_documents = dataset.values()
model = Word2Vec(sentences=tokenized_documents,
                          vector_size=250,  # Dimensionality of the word vectors
                          window=5,         # Maximum distance between the current and predicted word within a sentence
                          sg=1,             # 1 = Skip-Gram, 0 = CBOW
                          min_count=1,      # Ignores all words with a total frequency lower than this; 1 keeps every word
                          workers = 10      # Number of worker threads used for training
                          )

In [None]:
model.save("embedding_5_epoch_22.model")

In [None]:
# Load a saved model
from gensim.models import Word2Vec

dataset = datasets[-1]
# Token lists of every document (a dict view over the values, not a copied list)
tokenized_documents = dataset.values()
# NOTE(review): this loads "embedding_5_epoch_1.model" while the save cell writes "embedding_5_epoch_22.model" -- confirm the intended checkpoint.
model = Word2Vec.load("embedding_5_epoch_1.model")


In [None]:
# Train the model
model.train(tokenized_documents, total_examples=len(tokenized_documents), epochs=7)

In [None]:
import numpy as np

# Represent each document by the mean of its in-vocabulary word vectors;
# a document without any known token gets an all-zero vector instead.
document_vectors = []
for doc in tokenized_documents:
    known_vectors = [model.wv[token] for token in doc if token in model.wv]
    document_vectors.append(
        np.mean(known_vectors, axis=0) if known_vectors else np.zeros(model.vector_size)
    )

# Stack into a 2D array with one row per document
document_vectors = np.array(document_vectors)


In [None]:
# Represent each cleaned query by the mean of its in-vocabulary word vectors.
queries_matrices = {}
for key, query_tokens in queries.items():
    # Filter out tokens not in the model's vocabulary
    valid_tokens = [token for token in query_tokens if token in model.wv]

    if valid_tokens:
        queries_matrices[key] = np.mean([model.wv[token] for token in valid_tokens], axis=0)
    else:
        # Handle queries with no valid tokens: report them and use a zero vector.
        # f-string rather than "+" so that non-string query ids do not raise TypeError.
        print(f"Query with no valid tokens: {key}")
        queries_matrices[key] = np.zeros(model.vector_size)


In [None]:
from scipy import sparse

# The optimized matcher is handed the document embeddings in CSR form.
corpus_matrix_sparse = sparse.csr_matrix(document_vectors)

# Answer one query at a time; each query vector is reshaped to a single 2D row first.
queries_answers_embedded = {
    key: Matcher.get_query_answers_optimized(
        corpus_matrix_sparse, queries_matrices[key].reshape(1, -1), dataset_keys, 0.6
    )
    for key in queries.keys()
}


In [None]:
# Write to run file
FileManager.write_runfile_to_file('testrun_embedding_5_epoch_22_06.run', queries, queries_answers_embedded)

In [None]:
# Manual query
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Build the query vector from the in-vocabulary words of `query_words`
query_words = ["apple"]
valid_vectors = [model.wv[word] for word in query_words if word in model.wv]

# Check if there are valid vectors to avoid nan issues
if valid_vectors:
    query_vector = np.mean(valid_vectors, axis=0).reshape(1, -1)
    # Compute cosine similarity between query and document vectors
    similarity_scores = cosine_similarity(query_vector, document_vectors)

    # Rank documents based on similarity scores
    sorted_docs = sorted(enumerate(similarity_scores[0]), key=lambda x: x[1], reverse=True)

    # Print the ranked documents (every document is printed, not just the top few)
    # NOTE(review): `doc_id` is the row index in document_vectors; the "+ 2" presumably maps it to a CSV line number (header + 1-based) -- confirm
    for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
        print(f"Rank {rank}: Document {doc_id + 2} (Similarity Score = {score:.4f})")
else:
    print("None of the query words were found in the model's vocabulary.")


## Personalization

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import platform
user_os = str(platform.system())

history_query = ["tech"]
query = ["apple"]

# Word vectors of the in-vocabulary words of the current query / the search history
vector_1 = [model.wv[word] for word in query if word in model.wv]
vector_2 = [model.wv[word] for word in history_query if word in model.wv]

# Check if there are valid vectors to avoid nan issues
if vector_1:
    query_vector_1 = np.mean(vector_1, axis=0).reshape(1, -1)

    if vector_2:
        query_vector_2 = np.mean(vector_2, axis=0).reshape(1, -1)
        # Blend the two: 75% current query, 25% history
        weighted_vector = 0.75 * query_vector_1 + 0.25 * query_vector_2
    else:
        # No history word is in the vocabulary: np.mean([]) would be NaN and
        # cosine_similarity would raise, so rank by the current query alone.
        weighted_vector = query_vector_1

    # Compute cosine similarity between query and document vectors
    similarity_scores = cosine_similarity(weighted_vector, document_vectors)
    # Rank documents based on similarity scores
    sorted_docs = sorted(enumerate(similarity_scores[0]), key=lambda x: x[1], reverse=True)

    # Print the ranked documents
    for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
        print(f"Rank {rank}: Document in row {doc_id + 2} (Similarity Score = {score:.4f})")
else:
    print("None of the query words were found in the model's vocabulary.")

## Tests

In [None]:
# print(model.wv['malaysia'])
# print(model.wv.similarity('1st', 'First'))

# Spot checks on the trained word vectors: pairwise similarity, nearest
# neighbours of a word, and a vector-arithmetic query (king - man).
# NOTE(review): each lookup presumably raises KeyError when a word is missing from the vocabulary -- confirm for the loaded model
print(model.wv.similarity('world', 'war'))
print(model.wv.similarity('good', 'best'))
print(model.wv.most_similar('war'))
print(model.wv.most_similar(positive=['king'], negative=['man']))

## Plot

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def display_pca_scatterplot(model, words=None, sample=0):
    """Scatter-plot word vectors projected onto their first two principal components.

    Args:
        model: Object exposing ``model.wv`` with ``index_to_key`` and item
            lookup by word (e.g. a trained Word2Vec model).
        words: Words to plot. When ``None`` the words are taken from the
            model's vocabulary (see ``sample``).
        sample: When ``words`` is ``None`` and ``sample > 0``, plot that many
            distinct, randomly chosen vocabulary words (capped at the
            vocabulary size); otherwise plot the whole vocabulary.

    Returns:
        None. Shows the figure, or prints a message when fewer than two words
        or fewer than two vector dimensions are available.
    """
    if words is None:
        vocabulary = list(model.wv.index_to_key)
        if sample > 0:
            # replace=False: sampling with replacement drew the same word several
            # times; the size is capped because a sample cannot exceed the vocabulary.
            words = np.random.choice(vocabulary, min(sample, len(vocabulary)), replace=False)
        else:
            words = vocabulary

    word_vectors = np.array([model.wv[word] for word in words])

    # A 2-D projection needs at least two samples and two features.
    # (An empty word list gives a 1-D array, which is also rejected here.)
    if word_vectors.ndim == 2 and min(word_vectors.shape) > 1:
        # Only two components are plotted, so only two are fitted.
        twodim = PCA(n_components=2).fit_transform(word_vectors)
        plt.figure(figsize=(6, 6))
        plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
        for word, (x, y) in zip(words, twodim):
            # Small offset so the label does not sit on top of its marker
            plt.text(x + 0.05, y + 0.05, word)
        plt.show()
    else:
        print("Insufficient data for PCA visualization.")

# Example usage:
display_pca_scatterplot(model, ['battalion','world', 'war', 'good', 'best', 'state', 'government', 'university', 'college', 'germany', 'german', '12', 'twelve'])


In [None]:
# Project the document vectors onto two principal components and scatter-plot them.
pca = PCA(n_components=2)
x_pca = pca.fit_transform(document_vectors)

# Large canvas and translucent markers so that overlapping points remain visible
plt.figure(1, figsize=(30, 20),)
plt.scatter(x_pca[:, 0], x_pca[:, 1],s=100, alpha=0.2)

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Perform PCA
pca = PCA(n_components=2)
x_pca = pca.fit_transform(list(document_vectors))

# Create a scatter plot
plt.figure(figsize=(30, 20))
plt.scatter(x_pca[:, 0], x_pca[:, 1], alpha=0.2, s=100)

# Label every point with its document id. Iterating `document_vectors` yields
# the embedding rows themselves (it is a 2D array at this point), so the ids
# are taken from `dataset_keys` instead.
# assumes dataset_keys is in the same order as the rows of document_vectors
# (the matcher calls rely on the same alignment) -- TODO confirm
for doc_id, (x, y) in zip(dataset_keys, x_pca):
    plt.annotate(str(doc_id), (x, y))

plt.show()

In [None]:
# For when we do clustering: assign a cluster/topic to each document and visualize it here
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# One topic label per document, in the same order as `document_vectors`.
# TODO: replace the placeholder once clustering is implemented.
document_topics = ...

if document_topics is ...:
    # The placeholder is not iterable; skip instead of raising so that
    # "Run All" keeps working until real topics are supplied.
    print("document_topics is not set yet - skipping the per-topic plot.")
else:
    # ndarray so that `topics == topic` is an element-wise mask
    # (comparing a plain list with a scalar just yields False)
    topics = np.asarray(document_topics)
    unique_topics = np.unique(topics)

    # One colour per topic; wrap around when there are more topics than colours
    cmap = plt.get_cmap('tab10')
    colors = [cmap(i % cmap.N) for i in range(len(unique_topics))]

    # `document_vectors` is a 2D array after the Word2Vec cells and a dict
    # after the Doc2Vec / BERT cells; accept both.
    vectors = document_vectors.values() if isinstance(document_vectors, dict) else document_vectors

    # Perform PCA
    pca = PCA(n_components=2)
    x_pca = pca.fit_transform(list(vectors))

    # Create a scatter plot, one series per topic
    plt.figure(figsize=(30, 20))
    for i, topic in enumerate(unique_topics):
        mask = topics == topic
        plt.scatter(x_pca[mask, 0], x_pca[mask, 1], c=[colors[i]], label=topic, alpha=0.2, s=100)
    plt.legend()
    plt.show()


## Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

dataset = datasets[-1]

# One TaggedDocument per document, tagged with its own id so the vector can later be read as model.dv[doc_id]
tagged_data = [TaggedDocument(words=words, tags=[doc_id]) for doc_id, words in dataset.items()]

# Create the model and build its vocabulary; the actual training is done by the separate model.train(...) cell
model = Doc2Vec(vector_size=200, window=5, min_count=1, workers=8, epochs=1)
model.build_vocab(tagged_data)

In [None]:
model.save("model_doc2vec_epoch_9")

In [None]:
model = Doc2Vec.load("my_doc2vec_model")

In [None]:
model.train(tagged_data, total_examples=model.corpus_count, epochs=3)

In [None]:
# Get the document vectors
# (looked up by tag for every id in dataset_keys, so the dict follows dataset_keys order)
document_vectors = {doc_id: model.dv[doc_id] for doc_id in dataset_keys}

# Infer a vector for each query
# NOTE(review): infer_vector is presumably non-deterministic between runs unless the model is seeded -- confirm if reproducible run files matter
query_vectors = {query_id: model.infer_vector(words) for query_id, words in queries.items()}

In [None]:
from scipy import sparse

# Stack the per-document vectors (dict insertion order) into one CSR matrix
document_vectors_matrix = sparse.csr_matrix(list(document_vectors.values()))

# Retrieve the answers for every query; each inferred vector is reshaped to a 2D row first
queries_answers = {
    query_id: Matcher.get_query_answers(
        document_vectors_matrix, query_vector.reshape(1, -1), dataset_keys, 0.65
    )
    for query_id, query_vector in query_vectors.items()
}


In [None]:
# Write to run file
FileManager.write_runfile_to_file('testrun_doc2vec_epoch_9_065.run', queries, queries_answers)

In [None]:
# Manual query

from sklearn.metrics.pairwise import cosine_similarity

# Infer a Doc2Vec vector for the hand-written query tokens
query_words = ["wikipedia"]
query_vector = model.infer_vector(query_words)

# Reshape the query vector to 2D
query_vector_2d = query_vector.reshape(1, -1)

# Compute cosine similarity between the query vector and document vectors
similarity_scores = cosine_similarity(query_vector_2d, list(document_vectors.values()))
# Rank documents based on similarity scores
sorted_docs = sorted(enumerate(similarity_scores[0]), key=lambda x: x[1], reverse=True)

# Print the ranked documents (every document is printed, not just the top few)
# NOTE(review): `doc_id` here is the position in document_vectors, not the dict key; the "+ 2" presumably maps it to a CSV line number -- confirm
for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
    print(f"Rank {rank}: Document {doc_id + 2} (Similarity Score = {score:.4f})")


## Bert Model

In [None]:
from transformers import BertModel

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
from transformers import BertTokenizer
from tqdm import tqdm

dataset = datasets[-1]
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize your documents
# (the cleaned token lists are re-joined into one string before word-piece tokenization)
# NOTE(review): tokenizer.tokenize presumably adds no [CLS]/[SEP] tokens, yet the embedding cell reads position 0 of the model output -- confirm that using the first word piece is intended
tokenized_documents = {doc_id: tokenizer.tokenize(' '.join(words)) for doc_id, words in tqdm(dataset.items())}


In [None]:
# Tokenize your queries
tokenized_queries = {query_id: tokenizer.tokenize(' '.join(words)) for query_id, words in queries.items()}

In [None]:
# Convert tokens to IDs
indexed_documents = {doc_id: tokenizer.convert_tokens_to_ids(words) for doc_id, words in tokenized_documents.items()}
indexed_queries = {query_id: tokenizer.convert_tokens_to_ids(words) for query_id, words in tokenized_queries.items()}

In [None]:
import torch
from torch.cuda.amp import autocast, GradScaler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
scaler = GradScaler()  # NOTE(review): not used by this inference-only loop
document_vectors = {}

doc_ids = list(indexed_documents.keys())

# Documents are embedded in fixed-size batches to bound memory use
batch_size = 500
doc_batches = [doc_ids[i:i + batch_size] for i in range(0, len(doc_ids), batch_size)]

for counter, batch in enumerate(doc_batches, start=1):
    print(f"Processing batch {counter} out of {len(doc_batches)}")
    batch_words = [indexed_documents[doc_id] for doc_id in batch]
    # Longest sequence of this batch, capped at BERT's 512 positions and floored
    # at 1 so that a batch of empty documents still yields a valid tensor
    max_length = max(1, min(512, max(len(words) for words in batch_words)))

    batch_words_padded = []
    batch_mask = []
    for words in batch_words:
        n_real = min(len(words), max_length)
        # Pad (with id 0) or truncate every sequence to max_length
        batch_words_padded.append(words[:max_length] + [0] * (max_length - n_real))
        # 1 = real token, 0 = padding. Without this mask the model attends to the
        # padding, so a document's vector depended on the other documents in its
        # batch. At least one position stays attended for empty documents.
        n_attend = max(n_real, 1)
        batch_mask.append([1] * n_attend + [0] * (max_length - n_attend))

    batch_words_tensor = torch.tensor(batch_words_padded).to(device)
    batch_mask_tensor = torch.tensor(batch_mask).to(device)
    with torch.no_grad():
        with autocast():  # Enable autocasting for mixed precision
            # Hidden state of the first position of the last layer, one row per document
            # NOTE(review): no [CLS] token is prepended upstream, so this is the first word piece -- confirm intended
            batch_vectors = model(batch_words_tensor, attention_mask=batch_mask_tensor)[0][:, 0, :]
    for i, doc_id in enumerate(batch):
        document_vectors[doc_id] = batch_vectors[i].cpu()  # Move the vectors back to CPU
    torch.cuda.empty_cache()


In [None]:
# import pandas as pd
# dftemp = pd.DataFrame(document_vectors)
document_vectors

In [None]:
# Calculate query vectors
query_vectors = {}
for query_id, words in indexed_queries.items():
    # Each query is encoded on its own, so no padding is needed: just truncate
    # to BERT's 512-position limit. Padding to `max_length` made every query
    # vector depend on a variable left over from the last document batch.
    # An empty query falls back to a single id 0 (the value used for padding).
    words = words[:512] or [0]
    words_tensor = torch.tensor([words]).to(device)
    with torch.no_grad():
        with autocast():  # Enable autocasting for mixed precision
            # First-position hidden state of the last layer, moved back to CPU
            query_vectors[query_id] = model(words_tensor)[0][0][0].cpu()


In [None]:
document_vectors['1781133'].shape

In [None]:
len(query_vectors)

In [None]:
# Convert BERT embeddings to 2D numpy arrays
# (reshape(1, -1) turns every vector into a single-row array)
document_vectors_np = {doc_id: doc_vector.detach().cpu().numpy().reshape(1, -1) for doc_id, doc_vector in document_vectors.items()}
query_vectors_np = {query_id: query_vector.detach().cpu().numpy().reshape(1, -1) for query_id, query_vector in query_vectors.items()}

# Create corpus_matrix and query_matrix
# (rows are stacked in the insertion order of the two dicts: one row per document / per query)
corpus_matrix = np.vstack(list(document_vectors_np.values()))
query_matrix = np.vstack(list(query_vectors_np.values()))

In [None]:
list(document_vectors.keys())[548]

In [None]:
query_matrix.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): `query_matrix4` and `query_matrix5` are not defined anywhere in the visible notebook, so this cell raises NameError on a fresh kernel -- confirm which matrices were meant to be compared.
# The pairwise similarities are flattened to 1-D by reshape(-1).
similarity_matrix = cosine_similarity(query_matrix4, query_matrix5).reshape(-1)
similarity_matrix

In [None]:
np.where(similarity_matrix > .25)

In [None]:
similarity_matrix.shape

In [None]:
np.where(similarity_matrix > 0.8)

In [None]:
from tqdm import tqdm

# Retrieve the answers for every query against the BERT corpus matrix,
# with a progress bar over the query ids.
queries_answers = {
    key: Matcher.get_query_answers(corpus_matrix, query_vectors_np[key], dataset_keys, 0.95)
    for key in tqdm(queries.keys())
}

In [None]:
# Write to run file
FileManager.write_runfile_to_file('Tbert_docs_08III.run', queries, queries_answers)

In [None]:
Evaluater.evaluate("wikir/test/qrels","Tbert_docs_08II.run")

In [None]:
Evaluater.evaluate("wikir/test/qrels","Tbert_docs_08III.run")

In [None]:
# Manual Query
query_words = ["a graduate of franklin marshall college where he earned a degree in teaching and spent four years on several of its sports teams bridenbaugh coached football at several places in his home state of pennsylvania prior to being selected as the head coach of the geneva college golden tornadoes in 1917 he left geneva in 1922 with a 23 12 5 record and took a job with new castle junior senior high school as a mathematics teacher and head football basketball and track and field coach he did not lose a football game in his first two years marking the first of eleven undefeated seasons and over the course of 33 years won seven league titles in the sport leaving in 1955 with a 265 65 25 record he continued to work as an assistant football coach at grove city college until 1964 and was inducted into several regional halls of fame he died in june 1990 at the age of 100 bridenbaugh was born on may 1 1890 in martinsburg pennsylvania one of 14 children of professor phillip howard bridenbaugh an educator and academic administrator and catherine oelling he attended altoona area high school in altoona pennsylvania for one"]
# Tokenize and convert your query to IDs
tokenized_query = tokenizer.tokenize(' '.join(query_words))
indexed_query = tokenizer.convert_tokens_to_ids(tokenized_query)

# Truncate to BERT's 512-position limit. A single sequence needs no padding;
# padding to `max_length` depended on a variable left over from the last
# document batch. An empty query falls back to a single id 0 (the padding value).
indexed_query = indexed_query[:512] or [0]

# Move indexed_query to the correct device and calculate the query vector
indexed_query_tensor = torch.tensor([indexed_query]).to(device)
with torch.no_grad():
    with autocast():  # Enable autocasting for mixed precision
        # First-position hidden state of the last layer
        query_vector = model(indexed_query_tensor)[0][0][0]

# Move query_vector to the CPU and convert it to a single-row numpy array
query_matrix = query_vector.cpu().numpy().reshape(1, -1)

# Use your function to get the most similar documents
similar_docs = Matcher.get_query_answers(corpus_matrix, query_matrix,dataset_keys, 0.85)

# Print the first 20 returned documents
# NOTE(review): assumes get_query_answers returns its results best-first -- confirm, or sort by score here
for i, (doc_id, score) in enumerate(list(similar_docs.items())[:20]):
    print(f"Rank {i+1}, Document ID: {doc_id}, Similarity Score: {score}")


In [None]:
import numpy as np

# Assuming document_vectors is your dictionary of document vectors
doc_id = list(document_vectors.keys())[0]  # Get the ID of the first document
vector = document_vectors[doc_id]  # Get the vector for the first document
print("Vector dimensions:", np.array(vector).shape)


In [None]:
from summarizer import Summarizer
from transformers import BertModel

# Input text to be summarized
input_text = "As you move from left to right across a period, the number of protons in the nucleus increases. The electrons are thus attracted to the nucleus more strongly, and the atomic radius is smaller (this attraction is much stronger than the relatively weak repulsion between electrons). As you move down a column, there are more protons, but there are also more complete energy levels below the valence electrons. These lower energy levels shield the valence electrons from the attractive effects of the atom's nucleus, so the atomic radius gets larger."

# # Load pre-trained model (weights)
# model = BertModel.from_pretrained('bert-base-uncased')
# # Create a BERT extractive summarizer
# summarizer = Summarizer(custom_model=model)

# Create a BERT extractive summarizer
summarizer = Summarizer(model = 'bert-base-uncased')


# Generate the summary
summary = summarizer(input_text, min_length=30, max_length=300)  # You can adjust the min_length and max_length parameters

# Output the summary
print("Original Text:")
print(input_text)
print("\nSummary:")
print(summary)

In [None]:
import geocoder

# Look up the caller's location from its public IP address; when no country
# comes back, fall back to the fixed default location.
g = geocoder.ip('me')
if g.country is None:
    country, city = "syria", "damascus"
else:
    country, city = str(g.country), str(g.city)
print(country)
print(city)

In [None]:
import numpy as np
# NOTE(review): after the BERT cells `document_vectors` is a dict of tensors, so np.save presumably stores it as a pickled object array and reading it back needs np.load(..., allow_pickle=True) -- confirm
np.save("bert_docs_vecs_0",document_vectors)