##### Install libraries:

In [None]:
!pip3 install transformers -U

Collecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.37.2


##### Import libraries:

In [None]:
import pandas as pd
from tqdm import tqdm
import requests

from transformers import pipeline
from textblob import TextBlob
import nltk

import gc
import torch
import time
import warnings

from nltk import tokenize

##### Import vocabularies:

In [None]:
nltk.download('brown')
nltk.download('punkt')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

##### Functions

In [None]:
def gpt_reply(txt):

  request_json = {"message": "'" + txt + "'" + " - give list of technologies, mentioned in this text (quotes, seperated by comma)", "api_key": CHAD_API_KEY}

  try:
    response = requests.post(url='https://ask.chadgpt.ru/api/public/gpt-3.5', json=request_json)
    resp_json = response.json()
    resp_msg = resp_json['response']
  except:
    resp_msg = 'Error'

  return ', '.join(list(TextBlob(resp_msg).noun_phrases))

def return_counts(txt):
  try:
    counts = len(df[df['Description'].str.contains(txt)])
  except:
    counts = 0
  return counts

def extract_technology_qa(text, pipe):

  z = [r['answer'].replace('.', '') for r in pipe_qa({'question': 'List of technologies, mentioned in this text', 'context': text}, top_k=3)]
  z = ', '.join(list(frozenset(list(TextBlob(', '.join(z)).noun_phrases))))

  return z

def extract_technology_phi(text, pipe):

  prompt = 'What technologies are mentioned in this text\n\nText: ' + text + '\n\nOutput: '
  z = pipe(prompt, temperature=.2, num_beams=5, max_new_tokens=50)[0]['generated_text'].replace(prompt, '')
  z = ', '.join(list(frozenset(list(TextBlob(z).noun_phrases))))

  return z

##### Parameters:

In [None]:
CHAD_API_KEY = 'chad-1eb4a0fe9eb54db4840e4fe4cfe18b9ansmtfcom'

##### Supress warnings:

In [None]:
warnings.filterwarnings('ignore')

##### Filname:

In [None]:
filename = 'bio.csv'

##### Show number of companies:

In [None]:
df = pd.read_csv(filename)
df = df[df['Operating Status']=='Active']
print('Companies number:', len(df))

Companies number: 843


##### Test GPT:

In [None]:
gpt_reply('StartupX implements Machine Learning in our new product MarkZ for Quantum Computing laboratories')

'machine learning, quantum computing'

##### Test Q&A:

In [None]:
pipe_qa = pipeline('question-answering', model='deepset/roberta-base-squad2', tokenizer='deepset/roberta-base-squad2', trust_remote_code=True, device='cuda:0')
print(extract_technology_qa('StartupX implements Machine Learning in our new product for Quantum Computing laboratories, MarkZ', pipe_qa))

del pipe_qa
torch.cuda.empty_cache()
gc.collect()

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

machine learning, startupx, quantum computing, new product


24

##### Test PHI:

In [None]:
pipe_phi = pipeline('text-generation', model='microsoft/phi-2', tokenizer='microsoft/phi-2', trust_remote_code=True, device='cuda:0', pad_token_id=50256)
print(extract_technology_phi('StartupX implements Machine Learning in our new product for Quantum Computing laboratories, MarkZ', pipe_phi))

del pipe_phi
torch.cuda.empty_cache()
gc.collect()

config.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

configuration_phi.py:   0%|          | 0.00/9.26k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi.py:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


machine learning, quantum computing


36

##### Extract technologies by GPT:

In [None]:
%%time

df = pd.read_csv(filename)
df = df[df['Operating Status']=='Active']

df['Founded Date'] = pd.to_datetime(df['Founded Date'], format='%Y-%m-%d')
df = df.sort_values(by='Founded Date', ascending=False)

df['Description'] = df['Description'] + ' ' + df['Full Description'] + ' ' + df['Industries']
df = df[df['Description'].notna()]

techs_gpt = []
for i in tqdm(list(df['Description'])):
  techs_gpt.append(gpt_reply(i))

techs_gpt = list(frozenset(', '.join(techs_gpt).replace('"', '').replace('.', '').split(', ')))

100%|██████████| 638/638 [12:29<00:00,  1.18s/it]

CPU times: user 45.2 s, sys: 1.17 s, total: 46.4 s
Wall time: 12min 29s





##### Extract technologies by squad2:

In [None]:
df = pd.read_csv(filename)
df = df[df['Operating Status']=='Active']

df['Founded Date'] = pd.to_datetime(df['Founded Date'], format='%Y-%m-%d')
df = df.sort_values(by='Founded Date', ascending=False)

df['Description'] = df['Description'] + ' ' + df['Full Description'] + ' ' + df['Industries']
df = df[df['Description'].notna()]

pipe_qa = pipeline('question-answering', model='deepset/roberta-base-squad2', tokenizer='deepset/roberta-base-squad2', trust_remote_code=True, device='cuda:0')

techs_squad = []
for i in tqdm(list(df['Description'])):
  techs_squad.append(extract_technology_qa(i, pipe_qa))

techs_squad = list(frozenset(', '.join(techs_squad).replace('"', '').replace('.', '').split(', ')))

del pipe_qa
torch.cuda.empty_cache()
gc.collect()

100%|██████████| 638/638 [00:10<00:00, 59.17it/s]


0

##### Extract technologies by phi2:

In [44]:
%%time

df = pd.read_csv(filename)
df = df[df['Operating Status']=='Active']

df['Founded Date'] = pd.to_datetime(df['Founded Date'], format='%Y-%m-%d')
df = df.sort_values(by='Founded Date', ascending=False)

df['Description'] = df['Description'] + df['Industries']
df = df[df['Description'].notna()]

pipe_phi = pipeline('text-generation', model='microsoft/phi-2', tokenizer='microsoft/phi-2', trust_remote_code=True, device='cuda:0', pad_token_id=50256)

techs_phi = []
for i in tqdm(list(df['Description'])):

  techs_phi.append(extract_technology_phi(i, pipe_phi))
  torch.cuda.empty_cache()
  gc.collect()

techs_phi = list(frozenset(', '.join(techs_phi).replace('"', '').replace('.', '').split(', ')))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 843/843 [40:29<00:00,  2.88s/it]

CPU times: user 39min 58s, sys: 51.5 s, total: 40min 50s
Wall time: 40min 35s





##### Merge lists of technologies:

In [46]:
techs = list(frozenset(techs_gpt + techs_squad + techs_phi))
df_techs = pd.DataFrame(techs, columns=['Techs'])

##### Lower case descriptions:

In [47]:
df = pd.read_csv(filename)
df = df[df['Operating Status']=='Active']

df['Founded Date'] = pd.to_datetime(df['Founded Date'], format='%Y-%m-%d')
df = df.sort_values(by='Founded Date', ascending=False)

df['Description'] = df['Description'] + ' ' + df['Full Description'] + ' ' + df['Industries']
df = df[df['Description'].notna()]

df['Description'] = df['Description'].str.lower()

##### Find technology occurences:

In [48]:
tqdm.pandas()
df_techs['Counts'] = df_techs['Techs'].progress_apply(lambda x: return_counts(x))
df_techs = df_techs.sort_values(by='Counts', ascending=False)

100%|██████████| 3224/3224 [00:04<00:00, 652.70it/s]


##### Show dataframe:

In [49]:
pd.set_option("display.max_rows", None)
df_techs

Unnamed: 0,Techs,Counts
0,,638
606,re,621
169,ca,612
1583,it,524
1447,ent,492
1521,care,468
1129,vi,446
1820,health care,434
1951,ct,407
207,iv,401


##### Save to xlsx:

In [50]:
df_techs.to_excel('trends.xlsx')

##### Show noun phrases:

In [51]:
df['Nouns'] = df['Description'].progress_apply(lambda x: ', '.join(list(TextBlob(x).noun_phrases)))
nouns = ', '.join(list(df['Nouns'])).split(', ')
nouns = list(frozenset(nouns))

df_nouns = pd.DataFrame(nouns, columns=['Nouns'])
df_nouns['Counts'] = df_nouns['Nouns'].progress_apply(lambda x: return_counts(x))
df_nouns = df_nouns.sort_values(by='Counts', ascending=False)

df_nouns

100%|██████████| 638/638 [00:01<00:00, 511.56it/s]
100%|██████████| 5351/5351 [00:08<00:00, 642.41it/s]


Unnamed: 0,Nouns,Counts
2204,|hub hosts 1000+ business partners,638
3384,health care,434
1631,artificial intelligence,96
2259,information technology,74
5272,mental health,52
3450,medical device,50
1176,personal health,46
997,machine learning,45
665,life science,42
2910,health diagnostics,41


##### Save to xls:

In [52]:
df_nouns.to_excel('nouns.xlsx')