In [3]:
import spacy
import os
from collections import defaultdict
import pandas as pd

nlp = spacy.load("en_core_web_sm")

In [4]:

def extract_named_entities(folder_path):
    entity_counts = defaultdict(lambda: defaultdict(int))

    for category in os.listdir(folder_path):
        category_path = os.path.join(folder_path, category)

        if os.path.isdir(category_path):
            print(f"Processing category: {category}")

            for file_name in os.listdir(category_path):
                file_path = os.path.join(category_path, file_name)

                if file_name.endswith('.txt'):
                    try:
                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                            text = file.read()

                            doc = nlp(text)

                            for ent in doc.ents:
                                entity_key = f"{ent.text}_{ent.label_}"
                                entity_counts[entity_key][category] += 1
                    except Exception as e:
                        print(f"Error processing file {file_name}: {e}")

    return entity_counts

In [5]:


bbc_dataset_path = "bbc"

entity_data = extract_named_entities(bbc_dataset_path)

data = []
categories = sorted(os.listdir(bbc_dataset_path))
for entity, counts in entity_data.items():
    row = {"word": entity}
    for category in categories:
        row[category] = counts.get(category, 0)
    data.append(row)

entity_df = pd.DataFrame(data).fillna(0)

entity_df.to_csv("bbc_named_entities.csv", index=False)

entity_df.head()


Processing category: entertainment
Processing category: business
Processing category: sport
Processing category: politics
Processing category: tech


Unnamed: 0,word,.DS_Store,business,entertainment,politics,sport,tech
0,US_GPE,0,750,315,86,49,297
1,British_NORP,0,17,162,152,83,23
2,Atlantic_LOC,0,1,8,3,1,0
3,"1,300_MONEY",0,0,1,0,0,0
4,680_MONEY,0,0,1,1,0,0
