In [2]:
from datasets import load_dataset, get_dataset_config_names
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# List the configuration (subset) names available for the "xtreme" dataset.
ner_data_subset = get_dataset_config_names("xtreme")

In [4]:
# Keep only the configurations whose name begins with "PAN" (the PAN-X subsets),
# then display the resulting list.
panx_subests = list(filter(lambda config_name: config_name.startswith("PAN"), ner_data_subset))
panx_subests

['PAN-X.af',
 'PAN-X.ar',
 'PAN-X.bg',
 'PAN-X.bn',
 'PAN-X.de',
 'PAN-X.el',
 'PAN-X.en',
 'PAN-X.es',
 'PAN-X.et',
 'PAN-X.eu',
 'PAN-X.fa',
 'PAN-X.fi',
 'PAN-X.fr',
 'PAN-X.he',
 'PAN-X.hi',
 'PAN-X.hu',
 'PAN-X.id',
 'PAN-X.it',
 'PAN-X.ja',
 'PAN-X.jv',
 'PAN-X.ka',
 'PAN-X.kk',
 'PAN-X.ko',
 'PAN-X.ml',
 'PAN-X.mr',
 'PAN-X.ms',
 'PAN-X.my',
 'PAN-X.nl',
 'PAN-X.pt',
 'PAN-X.ru',
 'PAN-X.sw',
 'PAN-X.ta',
 'PAN-X.te',
 'PAN-X.th',
 'PAN-X.tl',
 'PAN-X.tr',
 'PAN-X.ur',
 'PAN-X.vi',
 'PAN-X.yo',
 'PAN-X.zh']

In [5]:
# Load the English PAN-X configuration of XTREME (train / validation / test splits).
en = load_dataset("xtreme", name="PAN-X.en")

In [6]:
# Inspect the loaded splits, their feature columns and row counts.
en

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [8]:
# First training example, transposed so each field is a row and each token a column.
pd.DataFrame(en["train"][0]).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
tokens,R.H.,Saunders,(,St.,Lawrence,River,),(,968,MW,)
ner_tags,3,4,0,3,4,4,0,0,0,0,0
langs,en,en,en,en,en,en,en,en,en,en,en


In [13]:
# String names of the NER labels; the integer `ner_tags` values are ids into this list.
en["train"].features["ner_tags"].feature.names


['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [10]:
# Materialise the whole training split as a DataFrame for a quick overview
# (pandas truncates the display; append .head() if only a preview is needed).
pd.DataFrame(en["train"])

Unnamed: 0,tokens,ner_tags,langs
0,"[R.H., Saunders, (, St., Lawrence, River, ), (...","[3, 4, 0, 3, 4, 4, 0, 0, 0, 0, 0]","[en, en, en, en, en, en, en, en, en, en, en]"
1,"[;, ', '', Anders, Lindström, '', ']","[0, 0, 0, 1, 2, 0, 0]","[en, en, en, en, en, en, en]"
2,"[Karl, Ove, Knausgård, (, born, 1968, )]","[1, 2, 2, 0, 0, 0, 0]","[en, en, en, en, en, en, en]"
3,"[Atlantic, City, ,, New, Jersey]","[5, 6, 6, 6, 6]","[en, en, en, en, en]"
4,"[Her, daughter, from, the, second, marriage, w...","[0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, ...","[en, en, en, en, en, en, en, en, en, en, en, e..."
...,...,...,...
19995,"[Cicely, Courtneidge, ,, Ernest, Truex]","[1, 2, 0, 1, 2]","[en, en, en, en, en]"
19996,"[Aracaju, ,, Sergipe, ,, Brazil]","[5, 0, 5, 0, 5]","[en, en, en, en, en]"
19997,"[Louisville, in, the, American, Civil, War]","[5, 6, 6, 6, 6, 6]","[en, en, en, en, en, en]"
19998,"[16, (, David, Nugent, )]","[0, 0, 1, 2, 0]","[en, en, en, en, en]"


In [14]:
# Second training example in the same transposed (field-per-row) view.
pd.DataFrame(en["train"][1]).T

Unnamed: 0,0,1,2,3,4,5,6
tokens,;,','',Anders,Lindström,'','
ner_tags,0,0,0,1,2,0,0
langs,en,en,en,en,en,en,en


In [19]:
# Raw dict for the second training example: tokens, ner_tags and langs lists.
en["train"][1]

{'tokens': [';', "'", "''", 'Anders', 'Lindström', "''", "'"],
 'ner_tags': [0, 0, 0, 1, 2, 0, 0],
 'langs': ['en', 'en', 'en', 'en', 'en', 'en', 'en']}

In [18]:
# Rebuild a readable string by joining the example's tokens with single spaces.
" ".join(en["train"][1]["tokens"])

"; ' '' Anders Lindström '' '"

In [20]:
# Integer NER tags of the same example (one tag per token).
en["train"][1]["ner_tags"]

[0, 0, 0, 1, 2, 0, 0]

In [21]:
# Label names again, for decoding the tag ids shown above (same call as the earlier cell).
en["train"].features["ner_tags"].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [None]:
from ner.entity.config_entity import DataIngestionConfig
from ner.exception import CustomException
from ner.logger import logging
from ner.constant import *
import sys,os
from ner.utils import read_yaml_file

class Configuration:
    """Builds configuration entities from the project's YAML config file.

    The YAML file is parsed once in ``__init__`` and kept in
    ``self.config_info``; the ``get_*`` methods turn sections of that mapping
    into config entity objects.
    """

    def __init__(self, config_file_path = CONFIG_FILE_PATH) -> None:
        """Read the YAML configuration file into ``self.config_info``.

        Args:
            config_file_path: Path of the YAML file to read. Defaults to
                ``CONFIG_FILE_PATH``.

        Raises:
            CustomException: If reading the file fails for any reason.
        """
        try:
            logging.info("Reading yaml file.....")
            self.config_info = read_yaml_file(file_path=config_file_path)
        except Exception as e:
            raise CustomException(e,sys) from e

    def get_data_ingestion_config(self)->DataIngestionConfig:
        """Assemble the data-ingestion settings.

        Returns:
            DataIngestionConfig: Holds the dataset name, the subset name and
            the data path (``ROOT_DIR/<artifacts dir>/<data store>``).

        Raises:
            CustomException: If a required key is missing or any other error
                occurs while building the config.
        """
        try:
            # All ingestion settings, including the data store folder name,
            # live under the data-ingestion section of the YAML file.
            ingestion_info = self.config_info[DATA_INGESTION_KEY]

            artifacts_dir = os.path.join(ROOT_DIR,self.config_info[ARTIFACTS_DIR_KEY])

            dataset_name = ingestion_info[DATASET_NAME]
            subset_name = ingestion_info[SUBSET_NAME]
            # Fix: DATA_STORE_KEY is nested inside the ingestion section, not a
            # top-level key, so looking it up on self.config_info raised KeyError.
            data_store = os.path.join(artifacts_dir,ingestion_info[DATA_STORE_KEY])

            data_ingestion_config = DataIngestionConfig(
                dataset_name = dataset_name,
                subset_name=subset_name,
                data_path=data_store
            )

            return data_ingestion_config

        except Exception as e:
            raise CustomException(e,sys) from e
            

: 

In [1]:
import os


In [2]:
# Check which directory the kernel is currently running from.
os.getcwd()

'd:\\projects\\NER-Project\\research'

In [8]:
# Step up from the notebook folder ("research") to the project root —
# presumably so the `ner` package and config paths resolve; confirm.
# Guarded so that re-running this cell does not climb another level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("..")

In [9]:
# Confirm the working directory after the change.
os.getcwd()

'd:\\projects\\NER-Project'

In [11]:
from ner.entity.config_entity import DataIngestionConfig
from ner.exception import CustomException
from ner.logger import logging
from ner.constant import *
from from_root import from_root
import sys,os
from ner.utils import read_yaml_file

In [12]:
# Path of the YAML config file to read (CONFIG_FILE_PATH comes from the imports above).
config_file_path =  CONFIG_FILE_PATH

In [13]:
# Parse the YAML config file into a Python mapping.
config_info = read_yaml_file(file_path=config_file_path)

In [14]:
# Show the parsed configuration.
config_info

{'artifacts': 'artifacts',
 'data_ingestion_config': {'data_store': 'data',
  'dataset_name': 'xtreme',
  'subset_name': 'PAN-X.en'}}

In [16]:
# The data store folder name is nested under the data-ingestion section, not at the top level.
config_info[DATA_INGESTION_KEY][DATA_STORE_KEY]

'data'

In [17]:
# Same nested lookup as the previous cell (duplicate check).
config_info[DATA_INGESTION_KEY][DATA_STORE_KEY]

'data'