# Starting Code for Exercise 6

### Import Modules and Download Data

In [None]:
import re
import nltk
import requests
import pandas as pd

from tqdm import tqdm
from io import StringIO
from gensim.models import LdaModel
from nltk.stem.porter import PorterStemmer
from gensim.corpora.dictionary import Dictionary
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

pd.set_option('display.max_colwidth', None)

### Function definition section

#### Text Preprocessing functions

In [None]:
def prep_process_tokenize(text):
    """Clean, tokenize, stop-word-filter and stem a raw text string.

    Relies on the module-level ``stopwords`` collection and ``stemmer``
    object, both of which must be defined before the first call.

    Args:
        text: Raw text to process.

    Returns:
        List of stemmed, lower-cased tokens that are longer than one
        character.
    """
    # Blank out every whitespace-delimited chunk that contains "http",
    # "www" or "@" (web addresses and e-mail addresses).
    text = re.sub(
        r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)",
        " ",
        text,
    )
    # Delete everything that is not an ASCII letter or a space. Characters
    # are removed, not replaced, so hyphenated words are fused together.
    text = re.sub(r"[^a-zA-Z ]", "", text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords]

    stemmed = []
    for word in tokens:
        try:
            stemmed.append(stemmer.stem(word))
        except IndexError:
            # The original code guarded the stemmer against IndexError.
            # Fall back to the raw token for the offending word only, so
            # one bad word no longer disables stemming and short-token
            # filtering for the entire document.
            stemmed.append(word)
    return [word for word in stemmed if len(word) > 1]

In [None]:
def pre_process(text):
    """Return the preprocessed tokens of ``text`` joined by single spaces."""
    tokens = prep_process_tokenize(text)
    return " ".join(tokens)

#### Training LDA Model function

In [None]:
def train_lda(data, num_topics, chunksize):
    """Fit a gensim LDA model on the tokenized documents in ``data``.

    Args:
        data: Object whose ``'tokenized'`` entry iterates over documents,
            each document being a list of string tokens.
        num_topics: Number of topics passed to ``LdaModel``.
        chunksize: Chunk size passed to ``LdaModel``.

    Returns:
        Tuple ``(lda, dictionary, corpus)``: the trained model, the
        token-to-id dictionary built from the documents, and the documents
        converted to bag-of-words form.
    """
    documents = data['tokenized']
    # Use the ``Dictionary`` class imported at the top of the file instead
    # of going through the ``corpora`` module alias.
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    lda = LdaModel(
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        alpha=1e-2,
        eta=0.5e-2,
        chunksize=chunksize,
    )
    return lda, dictionary, corpus

#### Jensen Shannon function

In [None]:
def jensen_shannon(query, matrix, num_topics, num_documents):
    """Jensen-Shannon distance between ``query`` and every row of ``matrix``.

    Args:
        query: 1-D topic distribution of the query document.
        matrix: 2-D array-like with one row per document and one column
            per topic.
        num_topics: Unused; kept so existing callers keep working.
        num_documents: Number of rows in ``matrix``.

    Returns:
        1-D array with one distance per document; smaller means more
        similar and 0 means identical distributions.
    """
    doc_topics = np.asarray(matrix, dtype=float).T  # topics x documents
    query_col = np.asarray(query, dtype=float).reshape(-1, 1)
    # Plain ndarray copy of the query for every document column
    # (np.matrix is no longer recommended by NumPy).
    p = np.repeat(query_col, num_documents, axis=1)
    m = 0.5 * (p + doc_topics)
    divergence = 0.5 * (entropy(p, m) + entropy(doc_topics, m))
    # Round-off can push a zero divergence slightly below zero; clip it so
    # sqrt never yields NaN (NaN would be sorted last by argsort).
    return np.sqrt(np.maximum(divergence, 0.0))

In [None]:
def get_top_k_similar_docs(query, matrix, num_topics, num_documents, k=10):
    """Return the positions of the ``k`` documents closest to ``query``.

    Distances come from ``jensen_shannon``; positions are returned in
    order of increasing distance.
    """
    distances = jensen_shannon(query, matrix, num_topics, num_documents)
    ranking = distances.argsort()
    return ranking[:k]

#### Matrix padding function
This must be used in order to make things work. <br>
Nice and tidy. <br>
It pads. <br>
Nothing more, nothing less.

In [None]:
def matpad(docmat, num_topics):
    """Expand sparse per-document topic lists into a dense matrix.

    Args:
        docmat: Iterable of documents, each a sequence of
            ``(topic_id, probability)`` pairs that may omit some topics.
        num_topics: Total number of topics (number of output columns).

    Returns:
        Array with one row per document and ``num_topics`` columns. Topics
        missing from a document share the leftover probability mass
        ``1 - sum(present probabilities)`` equally.
    """
    dense_rows = []
    for doc in docmat:
        topic_probs = dict(doc)
        missing = num_topics - len(topic_probs)
        if missing > 0:
            remainder = (1 - sum(prob for _, prob in doc)) / missing
        else:
            # Every topic is present: nothing to distribute, and dividing
            # by ``missing`` would be a division by zero.
            remainder = 0.0
        dense_rows.append(
            [topic_probs.get(topic, remainder) for topic in range(num_topics)]
        )
    return np.asarray(dense_rows)

# Starting code for Exercise 6

In [None]:
url_data = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTxbA16lnYbtH-j6PPrPogc6ft03gp0y5mmo1Nq3l-Pxnb05nP1C-mOxUYvTciA2gq5nkwAqz9Y7Imi/pub?gid=646892609&single=true&output=tsv'

In [None]:
def load_dataset(url, timeout=30):
    """Download a tab-separated file and parse it into a DataFrame.

    Args:
        url: Address of the TSV resource.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``pandas.DataFrame`` parsed from the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an HTML error page as TSV.
    response.raise_for_status()
    payload = response.content.decode('utf8')
    return pd.read_csv(StringIO(payload), sep='\t')

In [None]:
df = load_dataset(url_data)

### Inspect the Dataset

In [None]:
df.head(15)

Unnamed: 0,name,description,country,founding_date,relevancy
0,Pandora Car Rental,"Welcome to Pandora Car Rental, Car Hire and Airport Transfers based in Dalaman Turkey. We have a wide range of cars to suit all budgets and can deliver your car for free anytime day or night within the Dalaman locality. Reasons to Book Car from Pandora Car Rental: Unlimited Milage VAT All Local Taxes Airport Service Charge where applicable 24 hours Road Service Third Party Insurance with NO Excess Theft Insurance with NO Excess Fire Insurance with NO Excess FDW Insurance with NO Excess CDW Collision Damage Waiver with NO Excess TWH Tyre Windscreen Headlight Insurance with NO Excess 3 Additional Drivers Child/Baby Seat (must be ordered) No Hidden Extras Address: Hadrian Flats Number 4 Wellington Telford Pin Code: TF11RQ Tel: + 44 776 558 66 77 Website: http://www.dalamancarrental.com",United Kingdom,2011-04-05,0
1,SurplusMatch,"SurplusMatch is an online marketplace for contractors, Merchants and Manufacturers to buy and sell old stocks, end of line, slight seconds and site surplus materials. Sellers list the details of stock, buyers browse and order the materials and SurplusMatch delivers them while maintaining the anonymity of the seller.",United Kingdom,2008-01-01,2
2,Gimenez Ganga,"Giménez Ganga is a company that has been providing solutions for windows, sunlight protection and decoration since 1959.",Switzerland,1959-01-01,0
3,SMC3,"Freight shippers, motor carriers, logistics service providers and other supply chain professionals look to SMC³ for the technology, industry data, educational services and general know-how to achieve greater success in the transportation marketplace. The company is best known for its CzarLite base rates, which are used as a price benchmark in transportation agreements. Additionally SMC³ produces a range of enabling technologies that give shippers and carriers complete visibility into their transportation spend. Each January and June, SMC³ hosts a comprehensive supply chain conference that gives industry professionals an inside look into cost-saving trends and best business practices as well as timely economic and legislative updates.",United States,1935-01-01,0
4,Much Asphalt,Much Asphalt is southern Africa’s commercial supplier of an extensive range of hot and cold asphalt products to the road construction economy. Much Asphalt owns and operates 15 static plants in the major centres of South Africa and is the majority shareholder in East Coast Asphalt which operates two more in East London and Mthatha.,South Africa,1965-01-01,0
5,The Hisey Company,"The Hisey Company provides quality arbor care for consumers, providing a professional grade of service far beyond anything available in the marketplace today. With a focus on professionalism and quality of customer service, the company has created a experience that every customer will be hard pressed to find with any other service company. In a industry with many """"competitors"""", we set ourselves apart by providing service far above and beyond our competitors very best. This is clearly evident in the growth and revenues that we have accomplished over the past 36 months. Without turning our focus away from continuing to grow, we aim to continue to build lasting relationships with every customer that will bring them back again.",United States,2011-02-19,0
6,"FREIGHTALIA, LTD.","#1 Automatic quoting system ever created for Freight Forwarders, fully adaptable to your clientele, country or service needs. With Freightalia you can send beautiful quotes to your customers in real time. Know exactly when a quote was issued, when it was viewed, when it was accepted, and when it’s still pending.",United Kingdom,2015-09-26,0
7,Instant Access Au,"Instant Access is a provider of Access equipment including aluminium scaffolding, mobile towers, Elevated Working Platform, swing stages and specialized access solutions tailored to specific customers. Instant Access were the first company in Australia to offer mobile aluminium mobile scaffolds.",Australia,1968-01-01,1
8,CANOR International,"CANOR International provides project management, design, and consulting services. It offers urban planning services, such as development planning, space planning, architectural and engineering design, interior design, and CAD drafting; and project management services, including technical inspection, construction management, engineering audits, financial engineering, engineering consultancy, environmental engineering and consultancy, and facility management. The company also designs exhibition centers, as well as hotels and thermal baths.",Hungary,1993-01-01,0
9,LISUTO,LISUTO is a Multi-language batch exhibition system business,Japan,2016-11-01,1


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           2000 non-null   object
 1   description    2000 non-null   object
 2   country        2000 non-null   object
 3   founding_date  2000 non-null   object
 4   relevancy      2000 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 78.2+ KB


Null values check

In [None]:
df.isnull().values.any()

False

### Data preprocessing and tokenization

In [None]:
nltk.download('stopwords')
# Stored as a set: the tokenizer tests every token with "word not in
# stopwords", which is a linear scan on a list.
stopwords = set(nltk.corpus.stopwords.words('english'))
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
stemmer = PorterStemmer()

# Part 1: Tf-Idf Based Approach (Vector Space Modeling)

In this cell, we are fitting the descriptions of the companies to the TF-IDF vectorizer. Then by using fit_transform, we are transforming the descriptions into matrix form. As an output, we obtained 2000x10364 sparse matrix.

In [None]:
tfidf = TfidfVectorizer(preprocessor=pre_process).fit_transform(df.description)

*   In this part, first we obtain the index of company called "Vahanalytics". Thus, doc_index_to_compare variable is assigned with this index number which is 695.
*   Then, we set the top_k variable to 5. This variable will be used in later code snippets.
*   Then we compute the cosine similarities between the company "Vahanalytics" and all companies (itself included). Note that the similarities are computed on the TF-IDF matrix obtained from the vectorizer, and that the result is flattened into a 1D vector with the flatten() function.




In [None]:
top_k = 5
# Row label of the query company (first row whose name matches exactly).
name_matches = df.index[df['name'] == "Vahanalytics"].tolist()
doc_index_to_compare = name_matches[0]
# One-row slice of the TF-IDF matrix for the query company.
query_vector = tfidf[doc_index_to_compare:doc_index_to_compare + 1]
cosine_similarities = cosine_similarity(query_vector, tfidf).flatten()



*   After obtaining cosine similarities, we first sort the similarities with argsort() function. This returns an array of indices of sorted array.
*   Then, we slice this sorted array of indices so that we obtain the indices of the 5 companies that are most similar to the given company in the vector space. We assign those indices to the related_docs_indices variable.



In [None]:
related_docs_indices = cosine_similarities.argsort()[:-top_k - 1:-1]



*   Finally, we use the "df.index.isin" function to select the observations that carry those indices. Since each observation has a unique index, this gives us a dataframe consisting of the companies most similar to "Vahanalytics", together with their information, based on the indices that we found in the previous step.



In [None]:
tfidf_result_df = df[df.index.isin(related_docs_indices)]



*   tfidf_result_df is the dataframe that consists of the 5 most similar companies w.r.t. the cosine similarity metric, including the query company itself.



In [None]:
tfidf_result_df

Unnamed: 0,name,description,country,founding_date,relevancy
93,Ship Supplies Direct,We aim to use digital technology to transform the marine logistics industry,Singapore,2018-05-14,1
656,BISAF,"BISAF is a technological company for the construction industry. We specialise in cutting edge solutions that make building easier, safer and environmentally friendly.",United Kingdom,2006-05-01,1
695,Vahanalytics,Vahanalytics aims to create better drivers and safer roads by using cutting edge big data and machine learning techniques.,India,2016-01-01,1
1542,GeoSpock,"GeoSpock brings together their expertise of big data engineering to unlock the hidden value of data silos in your organization. Their solution enables you to manage extreme amounts of data at speed enabling your organization to react to key insights in a timely manner for future business success. The technology enables a range of capabilities from data analytics, visualization of spatial data, cutting edge data indexing, custom querying of data sets, and data intelligence. To ensure that their customers get the maximum impact using the GeoSpock solution they work with them on a one to one basis as they understand that each organization approaches their data problems in a bespoke manner, this ensures that you get maximum business impact. In bringing together multiple datasets this enables the cost of data generation to be amortized over many applications, opening up new business models and monetization opportunities, therefore, bringing value to your business. They work across a number of markets including smart cities, automotive, mobile networks, IoT, enterprise, AdTech, asset management, and logistics.",United Kingdom,2013-01-01,1
1982,Axenda,"Axenda is a cloud-based software platform for construction management industry. The software platform is used by constructors and architects to manage day-to-day tasks and grow their businesses. The company's patent-pending algorithm uses machine learning to estimate materials & resources. It aims to predict project's estimates & completion deadlines. In addition, the platform also translates the data into 3D virtual models which give visual feedback of project's progress to clients.",Mexico,2017-01-01,2


#### Extending the code for “Much Asphalt” 

In [None]:
top_k = 5
# Row label of the query company (first row whose name matches exactly).
doc_index_to_compare2 = df.index[df['name'] == "Much Asphalt"].tolist()[0]
query_vector2 = tfidf[doc_index_to_compare2:doc_index_to_compare2 + 1]
cosine_similarities2 = cosine_similarity(query_vector2, tfidf).flatten()
# argsort is ascending, so read it backwards to get the top_k best matches.
related_docs_indices2 = cosine_similarities2.argsort()[:-top_k - 1:-1]
# isin() keeps df's own row order: the rows come back sorted by index,
# not by similarity score.
tfidf_result_df_2 = df[df.index.isin(related_docs_indices2)]
tfidf_result_df_2

Unnamed: 0,name,description,country,founding_date,relevancy
4,Much Asphalt,Much Asphalt is southern Africa’s commercial supplier of an extensive range of hot and cold asphalt products to the road construction economy. Much Asphalt owns and operates 15 static plants in the major centres of South Africa and is the majority shareholder in East Coast Asphalt which operates two more in East London and Mthatha.,South Africa,1965-01-01,0
57,Sunland Asphalt,"Sunland Asphalt, a commercial asphalt paving company in Phoenix, provides commercial asphalt paving service at competitive price.",United States,1979-01-01,0
618,Central-Allied Enterprises,"Central States Construction was founded in 1929 by Ernest W. Hallett to produce sand and gravel and construct concrete highways in Minnesota. The business was successful, and in the early 1940s, operations expanded to western Ohio. In the 1940s, the company was heavily involved in the wartime expansion of Wright-Patterson Air Force Base and the post-war construction of the Ohio Turnpike. By the early 1950s, Ohio operations had expanded to include production of sand, gravel, asphalt, and concrete. The Ohio-based portion of the business became known as Allied Enterprises, and it made its permanent presence in Northeastern Ohio by the end of the 50s. Today, Central-Allied Enterprises is one of northeastern Ohio's leading producers of sand, gravel, asphalt, and paved asphalt surfaces.",United States,1929-01-01,0
862,FAST FELT,"The patented product FAST FELT®, with its plastic tabs pre-affixed to the asphalt saturated felt (commonly called ""tar paper"") is the only significant improvement in the recent history of the asphalt saturated felt underlayment products market.",United States,2007-01-01,0
1443,Saldus Celinieks,"Saldus Celinieks is specialising in road construction, extraction of aggregates and asphalt production.",Latvia,1991-01-01,1


In [None]:
print("Company Name:", tfidf_result_df_2.loc[4]["name"],"\n")
print("Description")
tfidf_result_df_2.loc[4]["description"]

Company Name: Much Asphalt 

Description


'Much Asphalt is southern Africa’s commercial supplier of an extensive range of hot and cold asphalt products to the road construction economy. Much Asphalt owns and operates 15 static plants in the major centres of South Africa and is the majority shareholder in East Coast Asphalt which operates two more in East London and Mthatha.'

In [None]:
print("Company Name:", tfidf_result_df_2.loc[57]["name"],"\n")
print("Description")
tfidf_result_df_2.loc[57]["description"]

Company Name: Sunland Asphalt 

Description


'Sunland Asphalt, a commercial asphalt paving company in Phoenix, provides commercial asphalt paving service at competitive price.'

In [None]:
print("Company Name:", tfidf_result_df_2.loc[618]["name"],"\n")
print("Description")
tfidf_result_df_2.loc[618]["description"]

Company Name: Central-Allied Enterprises 

Description


"Central States Construction was founded in 1929 by Ernest W. Hallett to produce sand and gravel and construct concrete highways in Minnesota. The business was successful, and in the early 1940s, operations expanded to western Ohio. In the 1940s, the company was heavily involved in the wartime expansion of Wright-Patterson Air Force Base and the post-war construction of the Ohio Turnpike. By the early 1950s, Ohio operations had expanded to include production of sand, gravel, asphalt, and concrete. The Ohio-based portion of the business became known as Allied Enterprises, and it made its permanent presence in Northeastern Ohio by the end of the 50s.  Today, Central-Allied Enterprises is one of northeastern Ohio's leading producers of sand, gravel, asphalt, and paved asphalt surfaces."

In [None]:
print("Company Name:", tfidf_result_df_2.loc[862]["name"],"\n")
print("Description")
tfidf_result_df_2.loc[862]["description"]

Company Name: FAST FELT 

Description


'The patented product FAST FELT®, with its plastic tabs pre-affixed to the asphalt saturated felt (commonly called "tar paper") is the only significant improvement in the recent history of the asphalt saturated felt underlayment products market.'

## The results make some sense: "Much Asphalt" is compared with the other 4 companies above. However, they could clearly be better.

# Part 2
## Topic Modeling Using LDA

In [None]:
from gensim import models, corpora, similarities
from nltk import FreqDist
import numpy as np
from scipy.stats import entropy
from tqdm import tqdm

###### Creating Tokenized Column

In [None]:
# Build the "tokenized" column in a single column assignment.
# Writing through df["tokenized"][index] = ... is chained indexing: pandas
# warns about it (SettingWithCopyWarning) and the write is not guaranteed
# to reach df.
df["tokenized"] = df["description"].apply(pre_process)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


#### Using new column tokenized, finding the most common 5000 tokens.

In [None]:
# Flatten every row's space-separated tokens into one long list.
tokens_list = [
    token
    for row_tokens in df["tokenized"]
    for token in row_tokens.split()
]

len(tokens_list)

#### Obtained token list including 87385 tokens. There might be duplicate tokens in this list.

In [None]:
from collections import Counter

# Token frequencies over the whole corpus, most frequent first.
counted_tokens = Counter(tokens_list)
most_common_5000 = counted_tokens.most_common(5000)  # [(token, count), ...]

# Keep only the tokens themselves and drop their counts.
most_common_5000_tokens = [token for token, _count in most_common_5000]

In [None]:
# Vocabulary of the 5000 most frequent tokens.
most_common_5000_tokens_set = set(most_common_5000_tokens)

# NOTE: the vocabulary filter is not applied in this step; it only turns
# each space-separated token string into a list of tokens.
# The whole column is assigned at once because writing through
# df["tokenized"][index] = ... is chained indexing, which triggers pandas'
# SettingWithCopyWarning and is not guaranteed to update df.
df["tokenized"] = df["tokenized"].apply(str.split)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [None]:
relev = Counter(list(df["relevancy"]))
relev

Counter({0: 1166, 1: 444, 2: 390})

In [None]:
df.tail()

Unnamed: 0,name,description,country,founding_date,relevancy,tokenized
1995,James Fisher and Sons plc,"James Fisher and Sons plc is a leading provider of specialist services to the marine, oil and gas and other high assurance industries worldwide.",United Kingdom,1847-01-01,0,"[jame, fisher, son, plc, lead, provid, specialist, servic, marin, oil, ga, high, assur, industri, worldwid]"
1996,AEG,"The AEG brand offers a full range of products that continue the proud history of the brand. A track record which started with electric light bulbs evolved over the years to include everything from cars, trains, power tools and electric machines to instruments, nuclear power, motors, microelectronics and more. The brand is as attractive and relevant today as it was over 120 years ago.",United States,1887-01-01,1,"[aeg, brand, offer, full, rang, product, continu, proud, histori, brand, track, record, start, electr, light, bulb, evolv, year, includ, everyth, car, train, power, tool, electr, machin, instrument, nuclear, power, motor, microelectron, brand, attract, relev, today, year, ago]"
1997,mov.e,Digital platform that enables sharing of electricity for electric vehicles charging,Portugal,2018-01-01,2,"[digit, platform, enabl, share, electr, electr, vehicl, charg]"
1998,ambiHome,"ambiHome is a fresh company based in Aachen/Germany, that has developed an innovative KNX-home-automation system for residential housing construction, particularly for new buildings and structural restoration. The ‘intelligent house’ unifies all single installations (e. g. lighting, heating, shading, alarming, fire detection) into one sole system and thereby offers more comfort, security and energy-efficiency to the user.",Germany,2009-01-01,0,"[ambihom, fresh, compani, base, aachengermani, develop, innov, knxhomeautom, system, residenti, hous, construct, particularli, new, build, structur, restor, intellig, hous, unifi, singl, instal, light, heat, shade, alarm, fire, detect, one, sole, system, therebi, offer, comfort, secur, energyeffici, user]"
1999,Construirbarato,"Construirbarato.com.br is an online platform that allows its users to search and compare prices of products and services related to construction. It was launched in 2009 and is based in Rio de Janeiro, Brazil.",Brazil,2009-01-01,1,"[construirbaratocombr, onlin, platform, allow, user, search, compar, price, product, servic, relat, construct, launch, base, rio, de, janeiro, brazil]"


In [None]:
df.dtypes

name             object
description      object
country          object
founding_date    object
relevancy         int64
tokenized        object
dtype: object

In [None]:
data_to_train = df["tokenized"].astype("string")

In [None]:
data_to_train.head()

0    ['welcom', 'pandora', 'car', 'rental', 'car', 'hire', 'airport', 'transfer', 'base', 'dalaman', 'turkey', 'wide', 'rang', 'car', 'suit', 'budget', 'deliv', 'car', 'free', 'anytim', 'day', 'night', 'within', 'dalaman', 'local', 'reason', 'book', 'car', 'pandora', 'car', 'rental', 'unlimit', 'milag', 'vat', 'local', 'tax', 'airport', 'servic', 'charg', 'applic', 'hour', 'road', 'servic', 'third', 'parti', 'insur', 'excess', 'theft', 'insur', 'excess', 'fire', 'insur', 'excess', 'fdw', 'insur', 'excess', 'cdw', 'collis', 'damag', 'waiver', 'excess', 'twh', 'tyre', 'windscreen', 'headlight', 'insur', 'excess', 'addit', 'driver', 'childbabi', 'seat', 'must', 'order', 'hidden', 'extra', 'address', 'hadrian', 'flat', 'number', 'wellington', 'telford', 'pin', 'code', 'tfrq', 'tel', 'websit']
1                                                                                                                                                                                                       

LDA training

In [None]:
num_topics = 10
chunksize = 5

In [None]:
# Training
trained_model, dictio, corps = train_lda(data=df, num_topics=num_topics, chunksize=chunksize)

  diff = np.log(self.expElogbeta)


In [None]:
[x for x in trained_model[corps]][0], [trained_model.id2word[x[0]] for x in corps[0]][:3], len(corps)

([(1, 0.049503215), (2, 0.3310263), (6, 0.19026472), (9, 0.42778063)],
 ['addit', 'address', 'airport'],
 2000)

#### Obtaining Document-Matrix and padding it

In [None]:
# Per-document topic distributions as sparse (topic_id, probability) lists.
# The raw bag-of-words corpus must be passed here: trained_model[corps] is
# already the topic-transformed corpus, and feeding it back into
# get_document_topics would treat its (topic_id, probability) pairs as
# (word_id, count) pairs and run inference a second time.
docmat = list(trained_model.get_document_topics(corps))
docmat_pad = matpad(docmat, num_topics)

  if __name__ == '__main__':
  if __name__ == '__main__':


In [None]:
docmat_pad.shape

(2000, 10)

Vahanalytics Part

In [None]:
# Reuse the dictionary returned by train_lda so the query's token ids are
# guaranteed to match the ids the model was trained with.
dictionary = dictio
vahana_ind = df.index[df['name'] == "Vahanalytics"].tolist()[0]

new_bow_Vahanalytics = dictionary.doc2bow(df.loc[vahana_ind]["tokenized"])
new_doc_Vahanalytics = trained_model.get_document_topics(bow=new_bow_Vahanalytics)

In [None]:
vahana_query = matpad([new_doc_Vahanalytics], num_topics)[0]
vahana_query

array([0.00243902, 0.49024391, 0.00243902, 0.00243902, 0.00243902,
       0.00243902, 0.00243902, 0.00243902, 0.00243902, 0.49024391])

Much Asphalt Part

In [None]:
# Reuse the dictionary returned by train_lda so the query's token ids are
# guaranteed to match the ids the model was trained with.
dictionary_asphalt = dictio
asphalt_ind = df.index[df['name'] == "Much Asphalt"].tolist()[0]

new_bow_Asphalth = dictionary_asphalt.doc2bow(df.loc[asphalt_ind]["tokenized"])
new_doc_Asphalt = trained_model.get_document_topics(bow=new_bow_Asphalth)

In [None]:
# Dense topic vector of the query document (matpad returns one row per
# document, hence the [0]). dictionary_asphalt already exists at this point,
# so it is not rebuilt here.
asphalt_query = matpad([new_doc_Asphalt], num_topics)[0]

#### Similarities Computation
Vahanalytics Part

In [None]:
# Take the document count from the padded matrix and the row label from
# vahana_ind instead of hard-coding 2000 and 695.
most_sim_ids_vahana = get_top_k_similar_docs(
    vahana_query, docmat_pad, num_topics, len(docmat_pad), k=5
)
df.loc[vahana_ind, "description"]

'Vahanalytics aims to create better drivers and safer roads by using cutting edge big data and machine learning techniques.'

In [None]:
df[df.index.isin(most_sim_ids_vahana)][["name", "description"]]

Unnamed: 0,name,description
163,MRM Risk Management,"MRM specializes in the evaluation, development, and implementation of wrap-up insurance for large public and private construction programs. They help owners and general contractors achieve simplicity, savings and increased safety for their projects. MRM began as a wrap-up consulting firm but quickly expanded its services to include comprehensive oversight management and then full wrap-up administration in response to their clients' requests and needs."
1038,Koninklijke Mosa,"Koninklijke Mosa is a manufacturer of ceramic tiles. Royal Mosa is a Dutch manufacturer of ceramic tiles. Its factories in Maastricht produce 6 million square metres of wall and floor tiles per year, all destined for Europe, North America, the Middle East and Asia. In its home market, the Netherlands, Royal Mosa is the market leader. The company offers a wide range of tiles in many sizes, colours and designs, both for inside and outside. Royal Mosa works closely with architects and interior designers."
1476,DynaRoad,"DynaRoad provides project management software solutions for heavy construction projects such as civil engineering and infrastructure construction (highways, tunnels, railways, area development, roads, and harbours earthworks). Key features include mass haul optimization, location-based scheduling, production control, and project execution monitoring and control. Provides several graphical views such as Gantt chart, resource graph, mass haul diagram, map view, and time-location chart."
1655,"Steven M. Sweat, APC",Personal injury law firm based in Los Angeles and serving all of the California. Emphasis in catastrophic bodily harm and wrongful death claims related to the following: Motor Vehicle Accident Claims - Car Accidents - Motorcycle Collisions - Bicycle Mishaps - Pedestrian Incidents Injuries Sustained on Commercial or Residential Property - Slip and Falls - Trip and Falls - Assault and Battery - Negligent Security (Bars/Hotels/Restaurants) Sexual Assault and Abuse - Churches - Schools and other Educational Institutions - Community Organizations
1987,Equipment One Stop,"Equipment One Stop, powered by Reliance Commercial Finance, is an online marketplace for buying and selling construction equipment. Has the option to finance and insure one's equipment. Also, offers valuation calculation reports."


Much Asphalt Part

In [None]:
# Take the document count from the padded matrix and the row label from
# asphalt_ind instead of hard-coding 2000 and 4.
most_sim_ids_asphalt = get_top_k_similar_docs(
    asphalt_query, docmat_pad, num_topics, len(docmat_pad), k=5
)
df.loc[asphalt_ind, "description"]

'Much Asphalt is southern Africa’s commercial supplier of an extensive range of hot and cold asphalt products to the road construction economy. Much Asphalt owns and operates 15 static plants in the major centres of South Africa and is the majority shareholder in East Coast Asphalt which operates two more in East London and Mthatha.'

In [None]:
df[df.index.isin(most_sim_ids_asphalt)][["name", "description"]]

Unnamed: 0,name,description
135,Mover,MOVER is a mobile city service for finding and selecting trucks and integrated crossings. The first mobile service that provides services for the organization of complex crossings and cargo transportation in Moscow and the Moscow region.
858,Flinders Group,"Flinders Group provides planning and project management services within water, energy, transportation, environment, telecommunications, property, mining, building, and social infrastructures. It offers services in the areas, such as project management, environment and approvals, land management, land access negotiations and management training courses, property and planning, communications and stakeholder engagements, and indigenous areas."
1094,Premier Logistics Partners,"Premier Logistics Partners is a progressive leader in the logistics industry in the United States. Premier Logistics Partners' core business is procuring, negotiating, and managing less-than-truckload (LTL) transportation for its clients."
1095,trans-o-flex Belgium BVBA,trans-o-flex Belgium BVBA provides business to business logistic services and ships parcels and pallets.
1176,Scotshield Fire & Security,"Scotshield Fire & Security is a leading UK independent service provider for all Fire Safety, Life Safety and Electronic Security Systems. Established in 1989 and with 25 years industry experience, Scotshield Fire & Security is recognised as a market leading specialist in the fire and security systems industry where we have gained a solid reputation for high standards of service delivery nationwide."
