# Pipeline de Extração de Conhecimento
### Utilizando spaCy para NER + TF-IDF para Keywords

## Importação de bibliotecas

In [2]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import json


## Carregamento do Dataset

In [3]:
df = pd.read_csv(r"C:\Users\gonsa\OneDrive\Desktop\chatbot\etapa_i\dataset_processado.csv")
df.head()



Unnamed: 0,section,text,clean_text,tokens,tokens_nostop
0,Welcome Message,Welcome to ICVS!,welcome to icvs,"['welcome', 'icvs']","['welcome', 'icvs']"
1,Welcome Message,We’re excited to have you join our community o...,were excited to have you join our community of...,"['were', 'excited', 'have', 'you', 'join', 'ou...","['excited', 'join', 'community', 'researcher',..."
2,Welcome Message,"Together, we strive to conduct research of exc...",together we strive to conduct research of exce...,"['together', 'strive', 'conduct', 'research', ...","['together', 'strive', 'conduct', 'research', ..."
3,Welcome Message,Welcome aboard!,welcome aboard,"['welcome', 'aboard']","['welcome', 'aboard']"
4,Introduction,This guidebook was designed to facilitate your...,this guidebook was designed to facilitate your...,"['this', 'guidebook', 'was', 'designed', 'faci...","['guidebook', 'designed', 'facilitate', 'arriv..."


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     --------------------------------------- 1.8/400.7 MB 12.8 MB/s eta 0:00:32
     --------------------------------------- 4.5/400.7 MB 12.3 MB/s eta 0:00:33
      -------------------------------------- 6.3/400.7 MB 11.3 MB/s eta 0:00:35
      -------------------------------------- 8.9/400.7 MB 11.3 MB/s eta 0:00:35
      ------------------------------------- 10.2/400.7 MB 10.4 MB/s eta 0:00:38
     - ------------------------------------- 11.3/400.7 MB 9.5 MB/s eta 0:00:42
     - ------------------------------------- 12.8/400.7 MB 9.0 MB/s eta 0:00:44
     - ------------------------------------- 13.9/400.7 MB 8.5 MB/s eta 0:00:46
     - ------------------------------------- 14.7/400.7 MB 8.1 MB/s eta 0:00:48
     - -----------------------

## Carregamento Modelo spaCy

In [4]:
!python -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [5]:
nlp = spacy.load("en_core_web_lg")


## Função de NER (Named Entity Recognition)

In [6]:
def extract_entities(text):
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        entities.append({"text": ent.text, "label": ent.label_})
    return entities

df["entities"] = df["text"].apply(extract_entities)
df[["text", "entities"]].head()


Unnamed: 0,text,entities
0,Welcome to ICVS!,"[{'text': 'ICVS', 'label': 'ORG'}]"
1,We’re excited to have you join our community o...,"[{'text': 'first', 'label': 'ORDINAL'}]"
2,"Together, we strive to conduct research of exc...","[{'text': 'Portugal', 'label': 'GPE'}]"
3,Welcome aboard!,[]
4,This guidebook was designed to facilitate your...,[]


## Extração de Keywords com TF-IDF

In [7]:
#treinar TF-IDF
tfidf = TfidfVectorizer(stop_words="english", max_features=2000)
tfidf_matrix = tfidf.fit_transform(df["text"])
feature_names = tfidf.get_feature_names_out()


In [8]:
# Função para extrair keywords de cada documento
import numpy as np

def get_keywords(row_index, top_k=5):
    row = tfidf_matrix[row_index].toarray().flatten()
    top_indices = row.argsort()[-top_k:][::-1]
    return [feature_names[i] for i in top_indices]

df["keywords"] = [get_keywords(i) for i in range(len(df))]
df[["text", "keywords"]].head()


Unnamed: 0,text,keywords
0,Welcome to ICVS!,"[welcome, icvs, 2013, 2010, 2005]"
1,We’re excited to have you join our community o...,"[new, feel, overwhelming, spaces, join]"
2,"Together, we strive to conduct research of exc...","[meaningfully, strive, energy, careers, innova..."
3,Welcome aboard!,"[aboard, welcome, 2013, 2010, 2005]"
4,This guidebook was designed to facilitate your...,"[facilitate, clarify, carry, tasks, initial]"


## Criar Estrutura em JSON (Conhecimento)

In [9]:
{
  "section": "...",
  "text": "...",
  "entities": [...],
  "keywords": [...]
}


{'section': '...',
 'text': '...',
 'entities': [Ellipsis],
 'keywords': [Ellipsis]}

In [10]:
knowledge_base = df.to_dict(orient="records")

with open(r"C:\Users\gonsa\OneDrive\Desktop\chatbot\icvs_Ext_Con.json", "w", encoding="utf-8") as f:
    json.dump(knowledge_base, f, indent=2, ensure_ascii=False)

print("JSON criado com sucesso!")


JSON criado com sucesso!


## Visualização Rápida

In [28]:
from collections import Counter

all_entities = []

for ents in df["entities"]:
    for e in ents:
        all_entities.append(e["label"])

Counter(all_entities).most_common(10)


[('ORG', 154),
 ('PERSON', 25),
 ('DATE', 16),
 ('CARDINAL', 16),
 ('GPE', 8),
 ('NORP', 7),
 ('WORK_OF_ART', 6),
 ('TIME', 5),
 ('ORDINAL', 4),
 ('LAW', 3)]

## Exemplo Real (para Relatório)

In [29]:
example_index = 44 

print("Texto:")
print(df.loc[example_index, "text"])
print("\nEntities:")
print(df.loc[example_index, "entities"])
print("\nKeywords:")
print(df.loc[example_index, "keywords"])


Texto:
 Onboarding Form →  Safety Materials →  Review →  Exam →  Letter of Commitment →  Access Card

Entities:
[{'text': 'Safety Materials', 'label': 'ORG'}, {'text': 'Letter of Commitment', 'label': 'WORK_OF_ART'}, {'text': 'Access Card', 'label': 'PRODUCT'}]

Keywords:
['exam', 'letter', 'review', 'materials', 'commitment']
