In [3]:
# !pip install pandas

In [3]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

# Path of the single text file that is loaded and chunked below
# (despite the name, this points at a file, not a directory).
input_dir = Path("input_dir/github.txt")

# Directory that receives the generated CSV files.
output_dir = Path("output_dir")

In [2]:
# Load the source text file into LangChain document objects.
docs = TextLoader(input_dir).load()

# Chunking configuration: target chunks of about 1024 characters (length is
# measured with len) with a 50-character overlap between neighbouring chunks.
# Separators are treated as literal strings, not regular expressions.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

# `pages` holds the resulting chunks and is consumed by the cells below.
pages = splitter.split_documents(docs)
print("Number of chunks = ", len(pages))

Number of chunks =  40


## Creating Dataframe from chunks

In [3]:
# Project helper that turns the list of chunks into a dataframe.
from helpers.df_helpers import documents2Dataframe
# One row per chunk; the displayed output shows text, source and chunk_id columns.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(40, 3)


Unnamed: 0,text,source,chunk_id
0,# Alaska: Google Summer of Code (GSoC) \n<img ...,input_dir\github.txt,902292e89f814a7daa128be1540458fe
1,"We represent the 49th state, Alaska. Anchorage...",input_dir\github.txt,f854d57405064d5a8d4aa3f6f2314d1e
2,**Please first refer to the [contributor guide...,input_dir\github.txt,2a800036ce784063b9b0aeed28746e3d
3,**Overview:** The rapidly warming Arctic is le...,input_dir\github.txt,88d8c61fddbc4d14b8f562aa688f7f94
4,**Current Status:** An initial model and pipel...,input_dir\github.txt,624525f075e845c789de613f599036fc


## Extract Concepts

In [5]:
from helpers.df_helpers import df2ConceptsList
from helpers.df_helpers import concepts2Df

# Extract a flat list of concept records from the chunk dataframe.
# NOTE(review): the captured output logs "buggy response" entries containing
# malformed / truncated JSON, so the helper presumably parses model-generated
# JSON -- confirm how chunks with unparseable responses are handled (skipped?).
concepts_list = df2ConceptsList(df)

  from .autonotebook import tqdm as notebook_tqdm




ERROR ### Here is the buggy response:  [
       {
           "entity": "concore",
           "importance": 4,
           "category": concept
       },
       {
           "entity": "osparc-control",
           "importance": 4,
           "category": concept
       },
       {
           "entity": "ZeroMQ",
           "importance": 3,
           "category": concept
       },
       {
           "entity": "file-sharing",
           "importance": 3,
           "category": concept
       },
       {
           "entity": "message queue",
           "importance": 3,
           "category": concept
       },
       {
           "entity": "local machine",
           "importance": 2,
           "category": concept
       },
       {
           "entity": "networked machine",
           "importance": 2,
           "category": concept
       },
       {
           "entity": "concore editor",
           "importance": 2,
           "category": concept
       },
       {
           "entity": "browse

In [6]:
# Peek at the first two extracted concept records.
concepts_list[:2]

[{'entity': 'Alaska',
  'importance': 5,
  'category': 'organisation',
  'chunk_id': '902292e89f814a7daa128be1540458fe',
  'type': 'concept'},
 {'entity': 'Google Summer of Code',
  'importance': 5,
  'category': 'event',
  'chunk_id': '902292e89f814a7daa128be1540458fe',
  'type': 'concept'}]

In [8]:
# Convert the list of concept records into a dataframe.
# NOTE(review): the displayed output shows lower-cased entity names, so the
# helper presumably normalises case -- confirm in helpers.df_helpers.
dfne = concepts2Df(concepts_list)
dfne.head()

Unnamed: 0,entity,importance,category,chunk_id,type
0,alaska,5,organisation,902292e89f814a7daa128be1540458fe,concept
1,google summer of code,5,event,902292e89f814a7daa128be1540458fe,concept
2,mentor,3,occupation,902292e89f814a7daa128be1540458fe,concept
3,open-source,3,concept,902292e89f814a7daa128be1540458fe,concept
4,software,3,concept,902292e89f814a7daa128be1540458fe,concept


In [10]:
# Create the output directory (including missing parents). `exist_ok=True`
# replaces the separate exists-check, which could race with another process
# creating the directory between the check and the makedirs call.
output_dir.mkdir(parents=True, exist_ok=True)

# Persist both tables as pipe-separated files, without the index column.
dfne.to_csv(output_dir/"concepts.csv", sep="|", index=False)
df.to_csv(output_dir/"chunks.csv", sep="|", index=False)

## Named Entities from concepts

In [1]:
from transformers import pipeline

# Token-classification (NER) pipeline. aggregation_strategy="simple" makes the
# pipeline return grouped entities, which is where the 'word' and
# 'entity_group' keys read in row2NamedEntities come from.
# NOTE(review): the "mn-" prefix in the model name suggests it may not be
# trained for English text -- confirm this is the intended model; an
# alternative is kept commented out below.
ner = pipeline("token-classification", model="2rtl3/mn-xlm-roberta-base-named-entity", aggregation_strategy="simple")
# ner = pipeline("token-classification", model="dslim/bert-large-NER", aggregation_strategy="simple")

def row2NamedEntities(row, ner_pipeline=None):
    """Run named-entity recognition on one row's ``entity`` text.

    Args:
        row: Mapping-like row (for example a pandas Series) providing the
            ``entity`` text to analyse and the ``chunk_id`` it came from.
        ner_pipeline: Optional callable that takes the text and returns an
            iterable of dicts with ``word`` and ``entity_group`` keys.
            Defaults to the notebook-level ``ner`` pipeline.

    Returns:
        A list with one dict per recognised entity, with the keys ``entity``,
        ``category``, ``chunk_id`` and ``type`` (always ``'entity'``). The
        list is empty when the text is missing/blank or nothing is recognised.
    """
    if ner_pipeline is None:
        ner_pipeline = ner

    text = row['entity']
    # Values read back from CSV can be float NaN (missing) instead of text;
    # the pipeline only accepts strings, so there is nothing to extract.
    if not isinstance(text, str) or not text.strip():
        return []

    metadata = {'chunk_id': row['chunk_id'], 'type': 'entity'}
    # The key is 'category' (previously misspelled 'catetory'), matching the
    # column name used for concepts and by downstream grouping.
    return [
        {'entity': result['word'], 'category': result['entity_group'], **metadata}
        for result in ner_pipeline(text)
    ]



def dfText2DfNE(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Build a dataframe of named entities from a parsed-concepts dataframe.

    Args:
        dataframe: Parsed data with an ``entity`` column (the text handed to
            ``row2NamedEntities``) and a ``chunk_id`` column.

    Returns:
        A dataframe with one row per named entity returned by
        ``row2NamedEntities``; rows whose ``entity`` is missing or blank are
        dropped. When no entities are found at all, an empty dataframe with
        the ``entity``, ``category``, ``chunk_id`` and ``type`` columns is
        returned instead of raising.
    """
    ## 1. Calculate named entities for each row of the dataframe.
    ## A plain loop always yields a list of lists, including for an empty
    ## input frame, so the flattening step below never sees an odd shape.
    per_row_entities = [row2NamedEntities(row) for _, row in dataframe.iterrows()]

    ## 2. Flatten the list of lists to one single list of entities.
    entities_list = [
        entity for row_entities in per_row_entities for entity in row_entities
    ]

    ## Nothing recognised: return an empty frame rather than failing on the
    ## missing 'entity' column further down.
    if not entities_list:
        return pd.DataFrame(columns=['entity', 'category', 'chunk_id', 'type'])

    ## 3. Treat empty / whitespace-only strings as missing, then remove all
    ## rows without an entity.
    entities_dataframe = pd.DataFrame(entities_list).replace(r'^\s*$', np.nan, regex=True)
    entities_dataframe = entities_dataframe.dropna(subset=['entity'])

    ## Optional: count the number of occurrences per chunk id.
    # entities_dataframe = entities_dataframe.groupby(['entity', 'category', 'chunk_id']).size().reset_index(name='count')

    return entities_dataframe

In [4]:
# Reload the concepts written by the save step (pipe-separated), using the
# same output_dir the file was written to instead of a duplicated literal path.
df_concepts = pd.read_csv(output_dir / "concepts.csv", sep="|")

# Run named-entity extraction over every concept row.
dfc_split = dfText2DfNE(df_concepts)