## Code for visualizing skills as a graph

In [None]:
import requests
import itertools
import time

# Scrape the vacancy search results (JSON) from the hh.ru API, page by page.
ses = requests.Session()
ses.headers = {'HH-User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0"}

# The first request is only used to learn how many result pages exist.
url = 'https://api.hh.ru/vacancies?text=python&per_page=100'
res = ses.get(url)
# Fail with a clear HTTPError here instead of a KeyError on 'pages' below.
res.raise_for_status()
page_count = res.json()['pages']

res_all = []  # one parsed JSON document per result page
for p in range(page_count):
    time.sleep(1)  # throttle: pause before every page request
    print(f'scraping page {p}')
    url = f'https://api.hh.ru/vacancies?text=python&per_page=100&page={p}'
    res = ses.get(url)
    # Stop on a failed request rather than storing an error payload that
    # would only break later, when the pages are parsed.
    res.raise_for_status()
    res_all.append(res.json())


In [None]:
# Parse the vacancy ids, fetch every vacancy and collect its key-skill tags.
tags_list = []  # one list of tag strings per vacancy that has any skills

for page_res_json in res_all:
    # Iterate over the items that were actually returned: a page may hold
    # fewer than 'per_page' entries, so indexing by range(per_page) can
    # raise IndexError.
    for item in page_res_json['items']:
        vac_id = item['id']
        vac_res = ses.get(f'https://api.hh.ru/vacancies/{vac_id}')
        key_skills = vac_res.json()["key_skills"]  # parse the body only once

        if key_skills:  # at least one skill present
            print(vac_id)
            # Each skill is a dict; keep all of its values as tags.
            tags = [v for v_dict in key_skills for v in v_dict.values()]
            print(' '.join(tags))
            tags_list.append(tags)
            print()

        time.sleep(0.1)  # not to overload the server

In [None]:
# Show the page index reported by the most recent search response.
last_page_payload = res.json()
print(last_page_payload['page'])

In [None]:
from collections import Counter

# Flatten the per-vacancy tag lists and count every tag once, instead of
# calling list.count() for each element (which is quadratic).
flattened_list = [tag for line in tags_list for tag in line]
tag_counts = Counter(flattened_list)

# Keep only the tags that occur more than 10 times.
flattened_list = [tag for tag in flattened_list if tag_counts[tag] > 10]
frequent_tags = set(flattened_list)

# Occurrence count of every frequent tag.
words_count = {tag: tag_counts[tag] for tag in frequent_tags}
print(words_count)


# Count how often two frequent tags appear in the same vacancy.
# NOTE: itertools.product yields ordered pairs, so every pair is counted in
# both directions, (a, b) and (b, a), and each tag also pairs with itself.
# Pairs that involve the 'Python' tag are left out.
formatted_tags = {}
for line in tags_list:
    for pair in itertools.product(line, repeat=2):
        tag1, tag2 = pair
        if tag1 not in frequent_tags or tag2 not in frequent_tags:
            continue
        if 'Python' in pair:
            continue
        formatted_tags[pair] = formatted_tags.get(pair, 0) + 1

for k, v in formatted_tags.items():
    print(k, v)

In [None]:
import matplotlib.pyplot as plt
import networkx as nx


# Build an undirected graph whose edges are the co-occurring tag pairs.
G = nx.Graph()
G.add_edges_from(list(formatted_tags))
pos = nx.spring_layout(G, k=0.5, iterations=200)

# Edge width scales with the pair count, node size with the tag count.
e_widths = [count / 3 for count in formatted_tags.values()]
n_widths = [words_count[node] * 10 for node in G.nodes()]

# Keep the figure in `f`: it is used later to save the picture to disk.
f = plt.figure(figsize=(32, 32))

# Nodes and edges are drawn in one fixed colour each, so no colormap is
# passed: `node_cmap` is not a parameter of draw_networkx_nodes, and a
# colormap has no effect when the colour is a single colour string.
nx.draw_networkx_nodes(G, pos, node_color='#A0CBE2', node_size=n_widths)
nx.draw_networkx_edges(G, pos, edge_color='#C0CBD2', edgelist=list(formatted_tags), width=e_widths)
nx.draw_networkx_labels(G, pos)

plt.show()


In [None]:
import pickle
# Persist the tag co-occurrence counts so they can be reloaded later.
# The file handle is deliberately not called `f`: that name holds the
# matplotlib figure created by plt.figure(...) and is still needed by the
# later f.savefig(...) call.
with open('formatted_tags.pkl', 'wb') as pkl_file:
    pickle.dump(formatted_tags, pkl_file)

In [None]:
# Save the rendered figure to disk as a PNG image.
# NOTE: `f` must still refer to the matplotlib figure from the plotting cell.
png_path = "tags_graph.png"
f.savefig(png_path, format="PNG")