In [1]:
import networkx as nx
from ipysigma import Sigma

# Importing a gexf graph
g = nx.read_gexf('dataset.gexf')

In [2]:
# Displaying the graph with a size mapped on degree and
# a color mapped on a categorical attribute of the nodes
Sigma(g, node_size=g.degree, node_color='category')

Sigma(nx.DiGraph with 2,085 nodes and 5,409 edges)

In [6]:
g.nodes['cytoscape']

{'tag': 'Tool',
 'URL': 'https://en.wikipedia.org/wiki/Cytoscape',
 'cluster': 'Graph theory',
 'score': '0.00006909602204225056',
 'label': 'Cytoscape'}

# DATOS ONET (Abilities, Skills, Knowledge)

In [52]:
import os 
import pandas as pd 

## Definimos rutas donde se encuentran las correspondencias scian-ciiu
FILE_PATH = os.getcwd()
DATA_PATH = os.path.join(FILE_PATH, "data")
OUTPUT_PATH = os.path.join(FILE_PATH, "output")

ONET_DATA_PATH = os.path.join(FILE_PATH, "data", "onet", "db_28_2_text")
SKILLS_ONET_DATA_PATH = os.path.join(ONET_DATA_PATH, "Skills.txt")
KNOWLEDGE_ONET_DATA_PATH = os.path.join(ONET_DATA_PATH, "Knowledge.txt")
ABILITIES_ONET_DATA_PATH = os.path.join(ONET_DATA_PATH, "Abilities.txt")
EDUCATION_ONET_DATA_PATH = os.path.join(ONET_DATA_PATH, "Education, Training, and Experience.txt")


In [55]:
# Cargamos datos
skills = pd.read_table(SKILLS_ONET_DATA_PATH)
knowledge = pd.read_table(KNOWLEDGE_ONET_DATA_PATH)
abilities = pd.read_table(ABILITIES_ONET_DATA_PATH)
education = pd.read_table(EDUCATION_ONET_DATA_PATH)

# Filtramos datos
skills = skills[(skills["Domain Source"]=="Analyst") & (skills["Scale ID"]=="IM")]
abilities = abilities[(abilities["Domain Source"]=="Analyst") & (abilities["Scale ID"]=="IM")]
knowledge = knowledge[(knowledge["Domain Source"]=="Incumbent") & (knowledge["Scale ID"]=="IM")]
education = education[(education["Domain Source"]=="Incumbent") & (education["Scale ID"] =="RL") & (education["Category"] >=6)]

In [70]:
# Agrupamos valores
skills["onet_code"] = skills["O*NET-SOC Code"].apply(lambda x : x.split(".")[0])
abilities["onet_code"] = abilities["O*NET-SOC Code"].apply(lambda x : x.split(".")[0])
knowledge["onet_code"] = knowledge["O*NET-SOC Code"].apply(lambda x : x.split(".")[0])
education["onet_code"] = education["O*NET-SOC Code"].apply(lambda x : x.split(".")[0])

education["Element ID"] = education["Element ID"] + education["Category"].astype(int).astype(str)

knowledge_grouped = knowledge.groupby(["onet_code", "Element ID"]).agg({"Data Value" : "mean"}).reset_index()
abilities_grouped = abilities.groupby(["onet_code", "Element ID"]).agg({"Data Value" : "mean"}).reset_index()
skills_grouped = skills.groupby(["onet_code", "Element ID"]).agg({"Data Value" : "mean"}).reset_index()
education_grouped = education.groupby(["onet_code", "Element ID"]).agg({"Data Value" : "mean"}).reset_index()

skills_grouped["element_id"] = skills_grouped["Element ID"]+"_skills" 
abilities_grouped["element_id"] = abilities_grouped["Element ID"]+"_abilities" 
knowledge_grouped["element_id"] = knowledge_grouped["Element ID"]+"_knowledge"
education_grouped["element_id"] = education_grouped["Element ID"]+"_education"

In [121]:
## Concatenamos los datos
features_occupations = pd.concat([skills_grouped, abilities_grouped, knowledge_grouped, education_grouped], ignore_index=True)
#features_occupations = pd.concat([skills_grouped,education_grouped], ignore_index=True)

features_occupations

Unnamed: 0,onet_code,Element ID,Data Value,element_id
0,11-1011,2.A.1.a,4.060,2.A.1.a_skills
1,11-1011,2.A.1.b,4.000,2.A.1.b_skills
2,11-1011,2.A.1.c,4.120,2.A.1.c_skills
3,11-1011,2.A.1.d,4.125,2.A.1.d_skills
4,11-1011,2.A.1.e,3.065,2.A.1.e_skills
...,...,...,...,...
89944,53-7121,2.D.112,0.000,2.D.112_knowledge
89945,53-7121,2.D.16,1.760,2.D.16_knowledge
89946,53-7121,2.D.17,0.000,2.D.17_knowledge
89947,53-7121,2.D.18,0.000,2.D.18_knowledge


In [122]:
## Agregamos informacion del SOC code
soc_structure = pd.read_excel("data/soc_structure_2018.xlsx", skiprows=7)
soc_structure.columns = ["Major Group", "Minor Group", "Broad Group", "Detailed Occupation", "Description"]
soc_structure

Unnamed: 0,Major Group,Minor Group,Broad Group,Detailed Occupation,Description
0,11-0000,,,,Management Occupations
1,,11-1000,,,Top Executives
2,,,11-1010,,Chief Executives
3,,,,11-1011,Chief Executives
4,,,11-1020,,General and Operations Managers
...,...,...,...,...,...
1442,,,,55-3014,Artillery and Missile Crew Members
1443,,,,55-3015,Command and Control Center Specialists
1444,,,,55-3016,Infantry
1445,,,,55-3018,Special Forces


In [123]:
soc_detailed = {i:j for i,j in soc_structure.dropna(subset="Detailed Occupation")[["Detailed Occupation", "Description"]].to_records(index = False)}
soc_detailed

{'11-1011': 'Chief Executives',
 '11-1021': 'General and Operations Managers',
 '11-1031': 'Legislators',
 '11-2011': 'Advertising and Promotions Managers',
 '11-2021': 'Marketing Managers',
 '11-2022': 'Sales Managers',
 '11-2032': 'Public Relations Managers',
 '11-2033': 'Fundraising Managers',
 '11-3012': 'Administrative Services Managers',
 '11-3013': 'Facilities Managers',
 '11-3021': 'Computer and Information Systems Managers',
 '11-3031': 'Financial Managers',
 '11-3051': 'Industrial Production Managers',
 '11-3061': 'Purchasing Managers',
 '11-3071': 'Transportation, Storage, and Distribution Managers',
 '11-3111': 'Compensation and Benefits Managers',
 '11-3121': 'Human Resources Managers',
 '11-3131': 'Training and Development Managers',
 '11-9013': 'Farmers, Ranchers, and Other Agricultural Managers',
 '11-9021': 'Construction Managers',
 '11-9031': 'Education and Childcare Administrators, Preschool and Daycare',
 '11-9032': 'Education Administrators, Kindergarten through Se

In [124]:
features_occupations["onet_code_description"] = features_occupations["onet_code"].replace(soc_detailed)
features_occupations

Unnamed: 0,onet_code,Element ID,Data Value,element_id,onet_code_description
0,11-1011,2.A.1.a,4.060,2.A.1.a_skills,Chief Executives
1,11-1011,2.A.1.b,4.000,2.A.1.b_skills,Chief Executives
2,11-1011,2.A.1.c,4.120,2.A.1.c_skills,Chief Executives
3,11-1011,2.A.1.d,4.125,2.A.1.d_skills,Chief Executives
4,11-1011,2.A.1.e,3.065,2.A.1.e_skills,Chief Executives
...,...,...,...,...,...
89944,53-7121,2.D.112,0.000,2.D.112_knowledge,"Tank Car, Truck, and Ship Loaders"
89945,53-7121,2.D.16,1.760,2.D.16_knowledge,"Tank Car, Truck, and Ship Loaders"
89946,53-7121,2.D.17,0.000,2.D.17_knowledge,"Tank Car, Truck, and Ship Loaders"
89947,53-7121,2.D.18,0.000,2.D.18_knowledge,"Tank Car, Truck, and Ship Loaders"


In [125]:
### Calculamos la matriz de proximidad
# https://github.com/cid-harvard/py-ecomplexity
# pip install ecomplexity
from ecomplexity import ecomplexity
from ecomplexity import proximity

features_occupations["year"] = 2024

trade_cols = {'time':'year', 'loc':'element_id', 'prod':'onet_code', 'val':'Data Value'}
prox_df = proximity(features_occupations, trade_cols,rca_mcp_threshold = 1.5)
prox_df

2024


Unnamed: 0,year,onet_code_1,onet_code_2,proximity
1,2024,11-1011,11-1021,0.500000
2,2024,11-1011,11-2011,0.333333
3,2024,11-1011,11-2021,0.500000
4,2024,11-1011,11-2022,0.000000
5,2024,11-1011,11-3012,0.500000
...,...,...,...,...
558003,2024,53-7121,53-7065,0.266667
558004,2024,53-7121,53-7071,0.333333
558005,2024,53-7121,53-7072,0.133333
558006,2024,53-7121,53-7073,0.333333


In [126]:
## Creamos link si la proximidad es mayor a 0.8
prox_df_link = prox_df.query("proximity>0.55")

In [127]:
## Creamos la gráfica
import networkx as nx
G_onet = nx.from_pandas_edgelist(prox_df_link, "onet_code_1", "onet_code_2")
                         

In [128]:
Sigma(G_onet, node_metrics=["louvain"], node_color="louvain")

Sigma(nx.Graph with 677 nodes and 6,514 edges)

# REPETIMOS EL EJERCICIO PERO CON LOS 238 GRUPOS

In [106]:
df_recodificacion = pd.read_csv(os.path.join(OUTPUT_PATH, "recodificacion_ciiu-rev-4_scian_2018.csv"))
df_recodificacion

Unnamed: 0,codigo,codigo_nuevo,clasificador,nombre_ocupacion
0,2321,1,sinco,Profesores universitarios y de enseñanza superior
1,2715,1,sinco,Instructores y capacitadores en oficios y para...
2,25-1011,1,onet,"Business Teachers, Postsecondary"
3,25-1022,1,onet,"Mathematical Science Teachers, Postsecondary"
4,25-1021,1,onet,"Computer Science Teachers, Postsecondary"
...,...,...,...,...
1175,53-6021,236,onet,Parking Attendants
1176,47-4041,237,onet,Hazardous Materials Removal Workers
1177,9663,237,sinco,Recolectores de otros materiales
1178,9731,238,sinco,Lecturistas de medidores


In [146]:
## Obtenemos los keywords del conjunto de ocupaciones de cada grupo
## https://maartengr.github.io/KeyBERT/
#!pip install keybert
#!pip3 install rake-nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
from rake_nltk import Rake

# Create a Rake instance
r = Rake()

# Text from which keywords will be extracted
text = "RAKE (Rapid Automatic Keyword Extraction) is a keyword extraction algorithm that automatically identifies relevant keywords and phrases in a text document."

# Extract keywords from the text
r.extract_keywords_from_text(text)

# Get the ranked keywords
keywords = r.get_ranked_phrases_with_scores()

# Print the extracted keywords and their scores
for score, kw in keywords:
    print("Keyword:", kw, "Score:", score)

Keyword: automatically identifies relevant keywords Score: 16.0
Keyword: rapid automatic keyword extraction Score: 15.0
Keyword: keyword extraction algorithm Score: 10.0
Keyword: text document Score: 4.0
Keyword: rake Score: 1.0
Keyword: phrases Score: 1.0


[nltk_data] Downloading package stopwords to /home/milo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/milo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/milo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [80]:
from keybert import KeyBERT

doc = """
         Supervised learning is the machine learning task of learning a function that
         maps an input to an output based on example input-output pairs. It infers a
         function from labeled training data consisting of a set of training examples.
         In supervised learning, each example is a pair consisting of an input object
         (typically a vector) and a desired output value (also called the supervisory signal).
         A supervised learning algorithm analyzes the training data and produces an inferred function,
         which can be used for mapping new examples. An optimal scenario will allow for the
         algorithm to correctly determine the class labels for unseen instances. This requires
         the learning algorithm to generalize from the training data to unseen situations in a
         'reasonable' way (see inductive bias).
      """
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(doc)

In [81]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None)

[('supervised', 0.6676),
 ('labeled', 0.4896),
 ('learning', 0.4813),
 ('training', 0.4134),
 ('labels', 0.3947)]

In [82]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1,2), stop_words=None)

[('supervised learning', 0.6779),
 ('supervised', 0.6676),
 ('signal supervised', 0.6152),
 ('in supervised', 0.6124),
 ('labeled training', 0.6013)]

In [83]:
cw_keywords = {}

for i in df_recodificacion.codigo_nuevo.unique():
    doc = ",".join(df_recodificacion.query(f"clasificador=='onet' and codigo_nuevo=={i}").nombre_ocupacion)
    keywords = kw_model.extract_keywords(doc)
    ocupacion_agregada = kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None)[0][0]

    cw_keywords[i] = ocupacion_agregada
    

In [107]:
onet_soc_codes = df_recodificacion[df_recodificacion.codigo.apply(lambda x : "-" in x)].codigo.unique()
features_occupations = features_occupations[features_occupations.onet_code.isin(onet_soc_codes)]

In [108]:
features_occupations["onet_code_reclasificado"] = features_occupations["onet_code"].replace({i:j for i,j in df_recodificacion.query("clasificador=='onet'")[["codigo", "codigo_nuevo"]].to_records(index = False)})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_occupations["onet_code_reclasificado"] = features_occupations["onet_code"].replace({i:j for i,j in df_recodificacion.query("clasificador=='onet'")[["codigo", "codigo_nuevo"]].to_records(index = False)})


In [109]:
features_occupations_recod =features_occupations.groupby(["onet_code_reclasificado", "element_id"]).agg({"Data Value" : "mean"}).reset_index()
features_occupations_recod

Unnamed: 0,onet_code_reclasificado,element_id,Data Value
0,1,2.A.1.a_skills,4.098571
1,1,2.A.1.b_skills,4.020857
2,1,2.A.1.c_skills,3.992286
3,1,2.A.1.d_skills,4.257143
4,1,2.A.1.e_skills,2.656571
...,...,...,...
9053,238,2.D.112_knowledge,0.000000
9054,238,2.D.16_knowledge,2.390000
9055,238,2.D.17_knowledge,0.000000
9056,238,2.D.18_knowledge,0.000000


In [110]:
### Calculamos la matriz de proximidad
# https://github.com/cid-harvard/py-ecomplexity
# pip install ecomplexity
features_occupations_recod["year"] = 2024
trade_cols = {'time':'year', 'loc':'element_id', 'prod':'onet_code_reclasificado', 'val':'Data Value'}
prox_df_recod = proximity(features_occupations_recod, trade_cols,rca_mcp_threshold = 1.2)
prox_df_recod

2024


Unnamed: 0,year,onet_code_reclasificado_1,onet_code_reclasificado_2,proximity
1,2024,1,2,0.000000
2,2024,1,3,0.000000
3,2024,1,4,0.000000
4,2024,1,5,0.000000
5,2024,1,6,0.000000
...,...,...,...,...
47955,2024,238,232,0.476190
47956,2024,238,233,0.571429
47957,2024,238,235,0.666667
47958,2024,238,236,0.590909


In [111]:
## Cambiamos códigos por nombres
prox_df_recod["onet_code_reclasificado_1"] = prox_df_recod["onet_code_reclasificado_1"].replace(cw_keywords)
prox_df_recod["onet_code_reclasificado_2"] = prox_df_recod["onet_code_reclasificado_2"].replace(cw_keywords)
prox_df_recod

Unnamed: 0,year,onet_code_reclasificado_1,onet_code_reclasificado_2,proximity
1,2024,postsecondary teachers,farmworkers and,0.000000
2,2024,postsecondary teachers,meat cutters,0.000000
3,2024,postsecondary teachers,service attendants,0.000000
4,2024,postsecondary teachers,technicians except,0.000000
5,2024,postsecondary teachers,avionics technicians,0.000000
...,...,...,...,...
47955,2024,meter readers,ship engineers,0.476190
47956,2024,meter readers,dredge operators,0.571429
47957,2024,meter readers,passenger attendants,0.666667
47958,2024,meter readers,parking attendants,0.590909


In [112]:
## Creamos link si la proximidad es mayor a 0.5
prox_df_link_recod = prox_df_recod.query("proximity>0.6")
prox_df_link_recod

Unnamed: 0,year,onet_code_reclasificado_1,onet_code_reclasificado_2,proximity
35,2024,postsecondary teachers,education administrators,0.800000
65,2024,postsecondary teachers,engineering managers,0.800000
68,2024,postsecondary teachers,music directors,0.666667
89,2024,postsecondary teachers,medical scientists,0.800000
223,2024,farmworkers and,technicians except,0.700000
...,...,...,...,...
47947,2024,meter readers,carpenters,0.714286
47949,2024,meter readers,process workers,0.714286
47952,2024,meter readers,cleaning workers,0.629630
47954,2024,meter readers,marine oilers,0.666667


In [113]:
## Creamos la gráfica
G_onet_recod = nx.from_pandas_edgelist(prox_df_link_recod, "onet_code_reclasificado_1", "onet_code_reclasificado_2", ["proximity"])

In [114]:
Sigma(G_onet_recod, node_metrics=["louvain"], node_color="louvain", node_size=nx.eigenvector_centrality(G_onet_recod))

Sigma(nx.Graph with 199 nodes and 2,517 edges)

# Calculemos la complejidad de las ocupaciones

In [115]:
trade_cols = {'time':'year', 'loc':'onet_code_reclasificado', 'prod':'element_id', 'val':'Data Value'}

cdata = ecomplexity(features_occupations_recod, trade_cols,rca_mcp_threshold = 1.2)

2024


In [116]:
cdata["onet_descripcion"] = cdata["onet_code_reclasificado"].replace(cw_keywords)
#{i:j for i,j in cdata.sort_values("eci")[["eci", "onet_descripcion"]].to_records(index = False)}

In [117]:
features_occupations_recod["year"] = 2024
trade_cols = {'time':'year', 'loc':'element_id', 'prod':'onet_code_reclasificado', 'val':'Data Value'}
prox_df_recod = proximity(features_occupations_recod, trade_cols,rca_mcp_threshold = 1.2)

## Creamos link si la proximidad es mayor a 0.5
prox_df_link_recod = prox_df_recod.query("proximity>0.6")
prox_df_link_recod["occupation_complexity"] = prox_df_link_recod["onet_code_reclasificado_1"].replace({i:j for i,j in cdata.sort_values("eci")[["onet_code_reclasificado", "eci"]].to_records(index = False)})
prox_df_link_recod

2024


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prox_df_link_recod["occupation_complexity"] = prox_df_link_recod["onet_code_reclasificado_1"].replace({i:j for i,j in cdata.sort_values("eci")[["onet_code_reclasificado", "eci"]].to_records(index = False)})


Unnamed: 0,year,onet_code_reclasificado_1,onet_code_reclasificado_2,proximity,occupation_complexity
35,2024,1,38,0.800000,-1.560448
65,2024,1,68,0.800000,-1.560448
68,2024,1,71,0.666667,-1.560448
89,2024,1,93,0.800000,-1.560448
223,2024,2,5,0.700000,0.678278
...,...,...,...,...,...
47947,2024,238,224,0.714286,0.753532
47949,2024,238,226,0.714286,0.753532
47952,2024,238,229,0.629630,0.753532
47954,2024,238,231,0.666667,0.753532


In [118]:
prox_df_link_recod["onet_code_reclasificado_1"] = prox_df_recod["onet_code_reclasificado_1"].replace(cw_keywords)
prox_df_link_recod["onet_code_reclasificado_2"] = prox_df_recod["onet_code_reclasificado_2"].replace(cw_keywords)
prox_df_link_recod

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prox_df_link_recod["onet_code_reclasificado_1"] = prox_df_recod["onet_code_reclasificado_1"].replace(cw_keywords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prox_df_link_recod["onet_code_reclasificado_2"] = prox_df_recod["onet_code_reclasificado_2"].replace(cw_keywords)


Unnamed: 0,year,onet_code_reclasificado_1,onet_code_reclasificado_2,proximity,occupation_complexity
35,2024,postsecondary teachers,education administrators,0.800000,-1.560448
65,2024,postsecondary teachers,engineering managers,0.800000,-1.560448
68,2024,postsecondary teachers,music directors,0.666667,-1.560448
89,2024,postsecondary teachers,medical scientists,0.800000,-1.560448
223,2024,farmworkers and,technicians except,0.700000,0.678278
...,...,...,...,...,...
47947,2024,meter readers,carpenters,0.714286,0.753532
47949,2024,meter readers,process workers,0.714286,0.753532
47952,2024,meter readers,cleaning workers,0.629630,0.753532
47954,2024,meter readers,marine oilers,0.666667,0.753532


In [119]:
## Creamos la gráfica
G_onet_recod = nx.from_pandas_edgelist(prox_df_link_recod, "onet_code_reclasificado_1", "onet_code_reclasificado_2", ["proximity", "occupation_complexity"])

In [120]:
node_size = {i:j for i,j in prox_df_link_recod[["onet_code_reclasificado_1", "occupation_complexity"]].to_records(index=False)}
edge_size = {i:j for i,j in prox_df_link_recod[["onet_code_reclasificado_1", "proximity"]].to_records(index=False)}
Sigma(G_onet_recod, node_metrics=["louvain"], node_color="louvain", 
      node_size= node_size, 
      node_border_size = node_size , 
      node_border_color = "louvain")

Sigma(nx.Graph with 199 nodes and 2,517 edges)

In [None]:
G_onet_recod.