# Installations

In [None]:
!python -m spacy download en_core_web_lg;

In [None]:
!pip install transformers;
!pip install sentence-transformers;
!pip install textacy

# Imports
Here all the necessary libraries are imported.

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from textacy.extract import keyterms as kt
import textacy
import spacy
import pandas as pd
import plotly.express as ex
import plotly.graph_objects as go

# ***Getting the KeyWords from the Text.***
For the input texts, I've chosen the descriptions of Automobile Manufacturing companies (Audi and BMW) from Wikipedia.

I will extract the keywords of Audi's and BMW's descriptions and store them in separate lists (text1 and text2).

Then I will use each keyword in Audi's list to find the most similar/relevant keyword from BMW's list.

In [None]:
# Input texts: the descriptions of Audi and BMW, one multi-paragraph string each.
# NOTE(review): the bracketed numbers such as [10] and [3] look like leftover
# Wikipedia citation markers; they are part of the strings and are handed to
# the keyword extractor unchanged.
audi = """Audi AG is a German automobile manufacturer that designs, engineers, produces, markets and distributes luxury vehicles. Audi is a subsidiary of Volkswagen Group and has its roots at Ingolstadt, Bavaria, Germany. Audi vehicles are produced in nine production facilities worldwide.

The origins of the company are complex, going back to the early 20th century and the initial enterprises (Horch and the Audiwerke) founded by engineer August Horch; and two other manufacturers (DKW and Wanderer), leading to the foundation of Auto Union in 1932. The modern era of Audi essentially began in the 1960s when Auto Union was acquired by Volkswagen from Daimler-Benz.[10] After relaunching the Audi brand in the 1965, Volkswagen merged Auto Union with NSU Motorenwerke in 1969, thus creating the present-day form of the company.

The company name is based on the Latin translation of the surname of the founder, August Horch. "Horch", meaning "listen" in German, becomes "audi" in Latin. The four rings of the Audi logo each represent one of four car companies that banded together to create Audi's predecessor company, Auto Union. Audi's slogan is Vorsprung durch Technik, meaning "Being Ahead through Technology".[11] Audi, along with fellow German marques BMW and Mercedes-Benz, is among the best-selling luxury automobile brands in the world."""

# Description of BMW, compared against the Audi text above.
bmw = """Bayerische Motoren Werke AG, commonly referred to as BMW, is a German multinational corporation which produces luxury vehicles and motorcycles. The company was founded in 1916 as a manufacturer of aircraft engines, which it produced from 1917 until 1918 and again from 1933 to 1945.

Automobiles are marketed under the brands BMW, Mini and Rolls-Royce, and motorcycles are marketed under the brand BMW Motorrad. In 2017, BMW was the world's fourteenth-largest producer of motor vehicles, with 2,279,503 vehicles produced.[3] The company has significant motorsport history, especially in touring cars, Formula 1, sports cars and the Isle of Man TT.

BMW is headquartered in Munich and produces motor vehicles in Germany, Brazil, China, India, Mexico, the Netherlands, South Africa, the United Kingdom, and the United States. The Quandt family is a long-term shareholder of the company (with the remaining shares owned by public float), following brothers Herbert and Harald Quandt's investments in 1959 which saved the company from bankruptcy."""

In [None]:
# Getting the top keywords from the texts with TextRank.
# Both settings are named once here so the two texts are always processed
# with the same pipeline and the same number of keywords.
SPACY_MODEL = 'en_core_web_lg'  # spaCy pipeline used to parse both texts
TOP_N_KEYWORDS = 5              # number of key terms kept per text

doc1 = textacy.make_spacy_doc(audi, lang=SPACY_MODEL)
doc2 = textacy.make_spacy_doc(bmw, lang=SPACY_MODEL)

# Each result is a list of (keyword, rank) pairs.
text1 = kt.textrank(doc1, topn=TOP_N_KEYWORDS)
text2 = kt.textrank(doc2, topn=TOP_N_KEYWORDS)

# Printing the list of top keywords with their ranks.
print('keywords from text1: \n')
print(text1)
print('\n')
print('keywords from text2: \n')
print(text2)

keywords from text1: 

[('Audi vehicle', 0.03231935433045273), ('Audi brand', 0.030765057433912852), ('Audi logo', 0.02974915488443276), ('Audi AG', 0.02887831688597195), ('german automobile manufacturer', 0.026964919289635694)]


keywords form text2: 

[('Bayerische Motoren Werke AG', 0.03154663474108812), ('brand BMW Motorrad', 0.026634136138545816), ('motor vehicle', 0.025808768111592585), ('luxury vehicle', 0.0234794070323003), ('significant motorsport history', 0.021848770906177335)]


## Removing the ranks because we don't need them.

In [None]:
# Removing the ranks.
def keywordsWithoutRanks(txt_list):
  """Return the keywords of ``txt_list`` with their rank scores dropped.

  Args:
    txt_list: Sequence of ``(keyword, rank)`` pairs. Entries that are
      already plain strings are kept unchanged.

  Returns:
    A new list holding only the keywords, in the original order.
    ``txt_list`` itself is not modified.
  """
  # Plain strings pass through untouched: indexing them with [0] would cut
  # the keyword down to its first character when this function is applied to
  # its own output (e.g. when the calling cell is executed a second time).
  return [item if isinstance(item, str) else item[0] for item in txt_list]

# Replace each list of (keyword, rank) pairs with the bare keywords.
text1 = keywordsWithoutRanks(text1)
text2 = keywordsWithoutRanks(text2)

# Printing the keywords without ranks.
print('keywords from text1 without ranks: \n')
print(text1)
print('\n')
print('keywords from text2 without ranks: \n')
print(text2)



keywords from text1 without ranks: 

['Audi vehicle', 'Audi brand', 'Audi logo', 'Audi AG', 'german automobile manufacturer']


keywords form text2 without ranks: 

['Bayerische Motoren Werke AG', 'brand BMW Motorrad', 'motor vehicle', 'luxury vehicle', 'significant motorsport history']


Now that we have both lists of keywords, it is time to calculate the similarities.

# ***Calculating the most similar/relevant words.*** 

In [None]:
# Load the sentence-embedding model that will encode the keywords.
model = SentenceTransformer(model_name_or_path='stsb-roberta-large')

In [None]:
# Encoding the keyword lists to get their embeddings.

# Audi's text
embedding1 = model.encode(text1, convert_to_tensor=True)
# BMW's text
embedding2 = model.encode(text2, convert_to_tensor=True)

# Computing the similarity scores of the two embeddings using pytorch_cos_sim().
# Row i belongs to text1[i] (Audi), column j to text2[j] (BMW).
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
# Convert to nested Python floats once instead of calling .item() per cell.
score_rows = cosine_scores.tolist()

# List of the most similar/relevant keyword pair for each Audi keyword,
# stored as (audi_keyword, bmw_keyword, similarity).
relevant_list = []
# List of all keyword pairs with their similarity values, in row order.
listForVisualization = []

for keyword1, score_row in zip(text1, score_rows):
  # The triples are deliberately not bound to a variable called `tuple`:
  # doing so at notebook scope would shadow the builtin for every later cell.
  row_pairs = [(keyword1, keyword2, score)
               for keyword2, score in zip(text2, score_row)]
  listForVisualization.extend(row_pairs)
  # max() returns the first pair with the highest score; it raises ValueError
  # on an empty sequence, so rows without candidates are skipped.
  if row_pairs:
    relevant_list.append(max(row_pairs, key=lambda pair: pair[2]))
    

## Most relevant matches for Audi's keywords (text1) from BMW's keyword list (text2).

In [None]:
# Best BMW match for each Audi keyword, as (audi_keyword, bmw_keyword, similarity) tuples.
relevant_list 

[('Audi vehicle', 'luxury vehicle', 0.5762524008750916),
 ('Audi brand', 'luxury vehicle', 0.4180949032306671),
 ('Audi logo', 'luxury vehicle', 0.3856440782546997),
 ('Audi AG', 'Bayerische Motoren Werke AG', 0.22442494332790375),
 ('german automobile manufacturer',
  'Bayerische Motoren Werke AG',
  0.4975980818271637)]

## List containing the similarity values of all keyword pairs from both lists.

In [None]:
# Every (audi_keyword, bmw_keyword, similarity) tuple, grouped by Audi keyword.
listForVisualization 

[('Audi vehicle', 'Bayerische Motoren Werke AG', 0.13278791308403015),
 ('Audi vehicle', 'brand BMW Motorrad', 0.21389172971248627),
 ('Audi vehicle', 'motor vehicle', 0.4209316372871399),
 ('Audi vehicle', 'luxury vehicle', 0.5762524008750916),
 ('Audi vehicle', 'significant motorsport history', 0.35544174909591675),
 ('Audi brand', 'Bayerische Motoren Werke AG', 0.13299959897994995),
 ('Audi brand', 'brand BMW Motorrad', 0.2045164555311203),
 ('Audi brand', 'motor vehicle', 0.1044556200504303),
 ('Audi brand', 'luxury vehicle', 0.4180949032306671),
 ('Audi brand', 'significant motorsport history', 0.16263872385025024),
 ('Audi logo', 'Bayerische Motoren Werke AG', 0.040211863815784454),
 ('Audi logo', 'brand BMW Motorrad', 0.09030995517969131),
 ('Audi logo', 'motor vehicle', 0.09595515578985214),
 ('Audi logo', 'luxury vehicle', 0.3856440782546997),
 ('Audi logo', 'significant motorsport history', 0.20535334944725037),
 ('Audi AG', 'Bayerische Motoren Werke AG', 0.22442494332790375)

# ***VISUALIZATION OF RESULTS***

In [None]:
# Helper that turns a list of similarity results into a dataframe for plotting.
def getData(lst):
  """Build a dataframe from ``(word_a, word_b, similarity)`` entries.

  Args:
    lst: Sequence of entries whose first two items are strings and whose
      third item is the similarity value.

  Returns:
    A ``pandas.DataFrame`` with one row per entry and two columns:
    ``similarity`` (the third item) and ``words`` (the first two items
    joined by ``'   &   '``).
  """
  scores = [entry[2] for entry in lst]
  labels = [entry[0] + '   &   ' + entry[1] for entry in lst]
  return pd.DataFrame({'similarity': scores, 'words': labels})

In [None]:
# df1: best match per Audi keyword; df2: every Audi/BMW keyword pair.
df1, df2 = map(getData, (relevant_list, listForVisualization))

In [None]:
# Closed polar line chart of the best-match similarities in df1.
fig = ex.line_polar(
    r=df1['similarity'],
    theta=df1['words'],
    line_close=True,
    line_shape='spline',
    template='plotly_dark',
    title="Most relevant words to Audi's list from BMW's list",
    width=1500,
    height=500,
)
# Fill the area enclosed by the trace.
fig.update_traces(fill="toself")
fig.show()

In [None]:
# Closed polar line chart of the similarities of all keyword pairs in df2.
fig2 = ex.line_polar(
    r=df2['similarity'],
    theta=df2['words'],
    line_close=True,
    line_shape='spline',
    template='plotly_dark',
    title='Word Similarity chart for all word pairs from both the lists',
    width=1600,
    height=900,
)
# Fill the area enclosed by the trace.
fig2.update_traces(fill="toself")
fig2.show()