In [1]:
import pandas as pd
import numpy as np

In [2]:
company = pd.read_csv('Company Descriptions.csv')
company

Unnamed: 0,company_name,company_short_description,company_description
0,Codementor,Codementor is an online marketplace connecting...,Codementor provides live 1:1 help for software...
1,AgShift,AgShift is designing world's most advanced aut...,AgShift solution blends Deep Learning with Com...
2,Shipsi,Shipsi empowers any retailer with the ability ...,Shipsi empowers any retailer with the ability ...
3,OpenNews,"OpenNews helps a global network of developers,...","We're helping a global network of developers, ..."
4,Biobot Analytics,Biobot Analytics analyzes city sewage to estim...,Biobot Analytics analyzes sewage to estimate o...
...,...,...,...
19960,Powermat Technologies,Powermat Technologies develops wireless energy...,Powermat Technologies is a developer of wirele...
19961,Properly,The Ultimate Turnover Tool for Vacation Rental...,Properly is a visual checklist tool that lets ...
19962,Bid Ops,Bid Ops accelerates business partnerships betw...,
19963,Tavolo,Tavolo is an online retailer and destination f...,Tavolo offers an online store that enables its...


In [3]:
company.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19965 entries, 0 to 19964
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   company_name               19965 non-null  object
 1   company_short_description  19965 non-null  object
 2   company_description        19237 non-null  object
dtypes: object(3)
memory usage: 468.1+ KB


In [4]:
# importing missing value in company_description with short decription
company.company_description = company.company_description.fillna(company.company_short_description)

In [5]:
company.isnull().sum()

company_name                 0
company_short_description    0
company_description          0
dtype: int64

In [6]:
company

Unnamed: 0,company_name,company_short_description,company_description
0,Codementor,Codementor is an online marketplace connecting...,Codementor provides live 1:1 help for software...
1,AgShift,AgShift is designing world's most advanced aut...,AgShift solution blends Deep Learning with Com...
2,Shipsi,Shipsi empowers any retailer with the ability ...,Shipsi empowers any retailer with the ability ...
3,OpenNews,"OpenNews helps a global network of developers,...","We're helping a global network of developers, ..."
4,Biobot Analytics,Biobot Analytics analyzes city sewage to estim...,Biobot Analytics analyzes sewage to estimate o...
...,...,...,...
19960,Powermat Technologies,Powermat Technologies develops wireless energy...,Powermat Technologies is a developer of wirele...
19961,Properly,The Ultimate Turnover Tool for Vacation Rental...,Properly is a visual checklist tool that lets ...
19962,Bid Ops,Bid Ops accelerates business partnerships betw...,Bid Ops accelerates business partnerships betw...
19963,Tavolo,Tavolo is an online retailer and destination f...,Tavolo offers an online store that enables its...


In [7]:
company = company.drop(['company_short_description'],axis=1)

In [8]:
company.head()

Unnamed: 0,company_name,company_description
0,Codementor,Codementor provides live 1:1 help for software...
1,AgShift,AgShift solution blends Deep Learning with Com...
2,Shipsi,Shipsi empowers any retailer with the ability ...
3,OpenNews,"We're helping a global network of developers, ..."
4,Biobot Analytics,Biobot Analytics analyzes sewage to estimate o...


# Text preprocessing

In [9]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
def remove_punctuation(text):
    text_nopunctuation="".join([c for c in text if c not in string.punctuation])
    return text_nopunctuation

In [11]:
company.company_description = company.company_description.apply(remove_punctuation)

In [12]:
def remove_digit(text):
    result = ''.join([i for i in text if not i.isdigit()])
    return result

In [13]:
company.company_description = company.company_description.apply(remove_digit)

In [14]:
#defining function for tokenization
import re
def tokenization(text):
    tokens = re.split('W+',text)
    return tokens
#applying function to the column
company.company_description = company.company_description.apply(lambda x: tokenization(x))

In [15]:
#importing nlp library
import nltk
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
#applying the function
company.company_description = company.company_description.apply(lambda x:remove_stopwords(x))

In [16]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
company.company_description = company.company_description.apply(lambda x:lemmatizer(x))

In [17]:
company.company_description = company.company_description.apply(lambda x: ' '.join(x))

In [18]:
company.company_description[0]

'Codementor provides live  help for software development   e’re making it easy for developers to connect with experts via screen sharing video and chat  There are two ways to get help ondemand live  expert help and longterm dedicated mentorship \n\nBuild projects faster with ondemand help for topics including Ruby Python PHP JavaScript CSSHTML iOS Swift and more Codementor helps you overcome key challenges with timely advice and speeds up your development process'

In [110]:
data = company.company_description

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
data_vectorized = vectorizer.fit_transform(company.company_description).toarray()

In [100]:
data_vectorized

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.20424558, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [21]:
vectors.shape

(19965, 5000)

In [101]:
from sklearn.decomposition import LatentDirichletAllocation
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=20,               # Number of topics
                                      max_iter=10,               
# Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          
# Random state
                                      batch_size=128,            
# n docs in each learning iter
                                      evaluate_every = -1,       
# compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               
# Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model) 

LatentDirichletAllocation(learning_method='online', n_components=20, n_jobs=-1,
                          random_state=100)


In [102]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# See model parameters
print(lda_model.get_params())

Log Likelihood:  -926836.4580800007
Perplexity:  8221.57278059275
{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'online', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 10, 'mean_change_tol': 0.001, 'n_components': 20, 'n_jobs': -1, 'perp_tol': 0.1, 'random_state': 100, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [105]:
from sklearn.model_selection import GridSearchCV
# Define Search Param
search_params = {'n_components': [10], 'learning_decay': [.9]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)

GridSearchCV(estimator=LatentDirichletAllocation(learning_method='online',
                                                 learning_offset=50.0,
                                                 max_iter=5, random_state=0),
             param_grid={'learning_decay': [0.9], 'n_components': [10]})

In [106]:
# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -190757.2490990397
Model Perplexity:  5933.266581544447


In [111]:
# Create Document — Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ['Topic' + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = ['Doc' + str(i) for i in range(len(data))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [112]:
df_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.86,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0
Doc1,0.87,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0
Doc2,0.84,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0
Doc3,0.85,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0
Doc4,0.86,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0
...,...,...,...,...,...,...,...,...,...,...,...
Doc19960,0.90,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0
Doc19961,0.83,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0
Doc19962,0.76,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0
Doc19963,0.02,0.02,0.02,0.02,0.02,0.46,0.02,0.02,0.37,0.02,5


In [113]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()



Unnamed: 0,ability,ablation,able,abuse,academic,academy,accel,accelerate,accelerated,accelerates,...,youll,young,youre,youth,youtube,youve,zealand,zero,zillow,zone
Topic0,48.057266,0.115022,37.475872,4.135742,9.893876,5.735207,9.840492,37.170576,5.971418,13.126607,...,4.530909,13.768546,6.226723,4.046574,11.17242,1.716396,4.550719,13.273277,2.815526,3.946323
Topic1,0.108891,0.106371,0.10618,0.107704,0.107368,0.106474,0.107895,0.107262,0.106466,0.105165,...,0.10736,0.107553,0.107859,0.106923,0.107817,0.106342,0.108415,0.107585,0.108327,0.106177
Topic2,0.118408,0.107628,0.108025,0.113463,0.106359,0.107275,0.108941,0.105494,0.107293,0.107131,...,0.107513,0.108616,0.107406,0.107674,0.107363,0.108079,0.107652,0.10617,0.109136,0.106901
Topic3,0.108796,0.107175,0.108232,0.107616,0.108144,0.106182,0.108737,0.106169,0.108107,0.108868,...,0.112083,0.108487,0.107601,0.107381,0.107136,0.106232,0.106054,0.107129,0.108028,0.107937
Topic4,0.106523,0.107942,0.107021,0.106156,0.106547,0.107507,0.106359,0.108625,0.106673,0.107338,...,0.107366,0.106948,0.107055,0.107506,0.106482,0.106842,0.106122,0.108124,0.107551,0.108036


In [114]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords



Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,data,company,platform,solutions,services,technology,software,provides,management,mobile,founded,business,service,products,based
Topic 1,parts,power,automation,consumers,prove,pricing,users,blocks,clients,jobs,cost,small,robotics,process,enabled
Topic 2,milk,boom,solar,pv,powered,park,elearning,organic,photovoltaic,parties,cells,electricity,box,networks,delicious
Topic 3,collective,nimble,hiring,interview,quality,venture,coatings,laboratory,company,board,tests,ideas,recruiting,app,healthcare
Topic 4,video,editing,instructors,contractors,edit,interior,best,designer,conversation,louis,predictive,award,luxury,inner,sales
Topic 5,music,friends,share,users,people,app,photos,videos,online,food,social,discover,favorite,shopping,kids
Topic 6,wine,seafood,wines,drinks,ocean,arbor,ann,delight,alternatives,photonics,michigan,makes,foods,sustainable,products
Topic 7,foods,organic,fuels,chemicals,carbon,optical,ingredients,renewable,oil,food,protein,chocolate,delicious,biomass,biofuels
Topic 8,company,medical,treatment,therapeutics,development,patients,diseases,developing,clinical,cancer,disease,drug,novel,based,develops
Topic 9,hearing,aviation,specialist,noise,broadcasting,drugs,sound,modeling,discovers,kubernetes,proteins,newly,properties,treat,emerging


In [202]:
Topics = [data[:11]]
df_topic_keywords["Topics"]=Topics
df_topic_keywords

ValueError: Length of values (1) does not match length of index (10)

In [203]:
data

0        Codementor provides live  help for software de...
1        AgShift solution blends Deep Learning with Com...
2        Shipsi empowers any retailer with the ability ...
3         ere helping a global network of developers jo...
4        Biobot Analytics analyzes sewage to estimate o...
                               ...                        
19960    Powermat Technologies is a developer of wirele...
19961    Properly is a visual checklist tool that lets ...
19962    Bid Ops accelerates business partnerships betw...
19963    Tavolo offers an online store that enables its...
19964    StackMob’s mobile platform helps developers cr...
Name: company_description, Length: 19965, dtype: object

In [156]:
# import spacy
# # Define function to predict topic for a given text document.
# nlp = spacy.load("en_core_web_sm")
def predict_topic(text):
    text_vec = vectorizer.transform(text)
    topic_probability_scores = best_lda_model.transform(text_vec)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), 1:14].values.tolist()
    
    # Step 5: Infer Topic
    infer_topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), -1]
    
    #topic_guess = df_topic_keywords.iloc[np.argmax(topic_probability_scores), Topics]
    return infer_topic, topic, topic_probability_scores

In [162]:
# Predict the topic
mytext = ["Very Useful in diabetes age 30. I need control sugar. thanks Good deal"]
infer_topic, topic, prob_scores = predict_topic(text = mytext)
print(topic)
print(infer_topic)

['company', 'platform', 'solutions', 'services', 'technology', 'software', 'provides', 'management', 'mobile', 'founded', 'business', 'service', 'products']
Update Version/Fix Crash Problem


In [176]:
def apply_predict_topic(text):
    
    text = [text]
    infer_topic, topic, prob_scores = predict_topic(text)
    return(infer_topic)
company["Topic_key_word"]= company.company_description.apply(apply_predict_topic)


In [178]:
data[2]

'Shipsi empowers any retailer with the ability to offer an easy “instant shipping” option on their checkout page—without worrying about logistics They use existing infrastructure and lastmile delivery networks to help customers order now AND get it now\n\nShipsi was founded in  and is headquartered in California USA'

In [29]:
industry = pd.read_excel("Industry Segments - Top 10 Keywords.xlsx")
industry

Unnamed: 0,Industry segment,Tags
0,Aerospace and defense,"security, systems, video, surveillance, servic..."
1,Agriculture and forestry,"Service, cleantech, water, agriculture, traits..."
2,Biopharmaceuticals,"Developer, treatment, drug, diseases, technolo..."
3,Business support services,"Service, platform, online, management, data, m..."
4,Communications and networking,"Service, wireless, network, data, internet, ap..."
5,Construction and civil engineering,"Service, cleantech, water, energy, waste, trea..."
6,Consumer information services,"Online, users, web, service, platform, social,..."
7,Electronics and computer hardware,"Technology, storage, energy, systems, applicat..."
8,Financial institutions and services,"Service, financial, payment, online, platform,..."
9,Food and beverage,"Food, organic, tea, beverages, ingredients, na..."


In [30]:
industry['Tags'][26]

'Transportation, management, shipping, service, containers, container, freight, online, sales, cargo'

In [31]:
def remove_space(lists):
    L = []
    for i in lists:
        L.append(i.strip())
        
    return L

In [32]:
industry['Tags'] = industry['Tags'].apply(lambda x: x.split(','))
industry.head()

Unnamed: 0,Industry segment,Tags
0,Aerospace and defense,"[security, systems, video, surveillance, s..."
1,Agriculture and forestry,"[Service, cleantech, water, agriculture, t..."
2,Biopharmaceuticals,"[Developer, treatment, drug, diseases, tec..."
3,Business support services,"[Service, platform, online, management, da..."
4,Communications and networking,"[Service, wireless, network, data, interne..."


In [33]:
industry['Tags'] = industry['Tags'].apply(remove_space)
industry.head()

Unnamed: 0,Industry segment,Tags
0,Aerospace and defense,"[security, systems, video, surveillance, servi..."
1,Agriculture and forestry,"[Service, cleantech, water, agriculture, trait..."
2,Biopharmaceuticals,"[Developer, treatment, drug, diseases, technol..."
3,Business support services,"[Service, platform, online, management, data, ..."
4,Communications and networking,"[Service, wireless, network, data, internet, a..."


In [34]:
from sklearn.cluster import KMeans

In [35]:
km = KMeans(n_clusters=27)
y_mean = km.fit_predict(vectors)

In [196]:
company['cluster_number'] = y_mean

In [201]:
comapny

Unnamed: 0,company_name,company_description,cluster_number
0,Codementor,Codementor provides live help for software de...,11
1,AgShift,AgShift solution blends Deep Learning with Com...,0
2,Shipsi,Shipsi empowers any retailer with the ability ...,0
3,OpenNews,ere helping a global network of developers jo...,0
4,Biobot Analytics,Biobot Analytics analyzes sewage to estimate o...,21
...,...,...,...
19960,Powermat Technologies,Powermat Technologies is a developer of wirele...,20
19961,Properly,Properly is a visual checklist tool that lets ...,0
19962,Bid Ops,Bid Ops accelerates business partnerships betw...,0
19963,Tavolo,Tavolo offers an online store that enables its...,13
