In [1]:
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from tqdm.auto import tqdm
import pandas as pd

In [69]:
def make_docs(file_path):
    train_data = pd.read_csv(file_path)
    train_data.dropna(axis=0, how='any', inplace=True)
    train_data['Num_words_text'] = train_data['content'].apply(lambda x: len(str(x).split()))
    mask = train_data['Num_words_text'] > 2
    train_data = train_data[mask]
    print(train_data['pred_ind'].value_counts())

    data = []
    for text, label in zip(train_data['content'].tolist(), train_data['pred_ind'].tolist()):
        label_dict = {
            '10': 1 if str(label) == '10' else 0,
            '13': 1 if str(label) == '13' else 0,
            '8': 1 if str(label) == '8' else 0,
            '12': 1 if str(label) == '12' else 0
        }
        data.append((text, label_dict))
    
    print(data[1])
    
    docs = []
    nlp = spacy.load("en_core_web_trf")

    label_mapping = {
        '10': {'10': 1, '13': 0, '8': 0, '12': 0},
        '8': {'10': 0, '13': 0, '8': 1, '12': 0},
        '13': {'10': 0, '13': 1, '8': 0, '12': 0},
        '12': {'10': 0, '13': 0, '8': 0, '12': 1},
    }

    for text, label_dict in tqdm(data, total=len(data)):
        doc = nlp(text)
        label = label_dict.get('pred_ind')
        if label:
            doc.cats = label_mapping.get(label, {'10': 0, '13': 0, '8': 0, '12': 0})
        else:
            doc.cats = {'10': 0, '13': 0, '8': 0, '12': 0}
        docs.append(doc)

    return docs, train_data


In [72]:
train_docs,train_data = make_docs('test_textcat_project/textcat_data/training_data.csv')
# then we save it in .spacy verion on disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("test_textcat_project/textcat_data/textcat_train.spacy")

test_docs,test_data = make_docs('test_textcat_project/textcat_data/test_data.csv')
# then we save it in .spacy verion on disc
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk("test_textcat_project/textcat_data/textcat_test.spacy")

8     20
12    16
10    10
13     4
Name: pred_ind, dtype: int64
('Emulsifiers and stabilisers are used in ice cream manufacturing. Palsgaard inaugurated the new emulsifier plant in Malaysia in August 2013. Image courtesy of Palsgaard. The new plant, fully owned and operated by Palsgaard, is located in Johor, Malaysia. Image courtesy of TUBS. Palsgaard, a Danish company, inaugurated a new emulsifier manufacturing plant in Malaysia in August 2013. The new plant is located in Nusajaya city in the state of Johor. The new plant is fully owned and operated by Palsgaard. Palsgaardâ€™s emulsifiers are used in industrial bakery and cake mixes, fine foods, chocolates and confectionery, sauces, ketchups, margarine and shortenings, dairy, ice creams and meat products. The company invested $8.2m in a Regional Application Centre in Singapore for dairy, ice cream, soya, bakery and confectionery applications. It is also planning to make further investments in Asia to grow the market for its products.

  0%|          | 0/50 [00:00<?, ?it/s]

8     3
10    3
13    3
12    2
Name: pred_ind, dtype: int64
('Mining and exploration firm Korab Resources has secured all permits required to develop a new phosphate mine in Australia.  Korab claims that the new phosphate mine at the Geolsec project has some of the best logistics and basic infrastructure of any rock phosphate project in Australia.  The mine is situated 2km from Batchelor town, about 70km south from the port of Darwin, which is the capital of the Northern Territory. Korab noted that it had started negotiations with users of rock phosphate, fertilisers and soil improvement products as well as with distributors and contractors providing aerial application services of fertilisers and soil improvement products in the Northern Territory, Western Australia and Eastern States.  The start-up costs of the mine are low because of its geology and location and the company anticipates that it will cost less than $300,000 to begin production of the direct application phosphate ferti

  0%|          | 0/11 [00:00<?, ?it/s]

In [21]:
#! python -m spacy init fill-config ./test_textcat_project/base_config_multi.cfg ./test_textcat_project/textcat_config_multi.cfg

In [73]:
! python -m spacy train test_textcat_project/textcat_config.cfg --output ./test_textcat_project --paths.train test_textcat_project/textcat_data/textcat_train.spacy --paths.dev test_textcat_project/textcat_data/textcat_test.spacy

ℹ Saving to output directory: test_textcat_project
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['textcat']
ℹ Initial learn rate: 0.001
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.06       25.00    0.25
  4     200         12.50       25.00    0.25
  9     400         12.50       25.00    0.25
 14     600         12.50       25.00    0.25
 19     800         12.50       25.00    0.25
 24    1000         12.50       25.00    0.25
 28    1200         12.50       25.00    0.25
 33    1400         12.50       25.00    0.25
 38    1600         12.50       25.00    0.25
✔ Saved pipeline to output directory
test_textcat_project\model-last


[2023-07-03 16:43:33,747] [INFO] Set up nlp object from config
[2023-07-03 16:43:33,762] [INFO] Pipeline: ['textcat']
[2023-07-03 16:43:33,766] [INFO] Created vocabulary
[2023-07-03 16:43:33,767] [INFO] Finished initializing nlp object
[2023-07-03 16:43:34,316] [INFO] Initialized pipeline components: ['textcat']


In [74]:
nlp_textcat = spacy.load('test_textcat_project/model-best')
test_texts = test_data['content'].tolist()
test_cats = test_data['pred_ind'].tolist()

doc2 = nlp_textcat(test_texts[0])
print("Text: "+ test_texts[0])
print("Orig Cat: " + str(test_cats[0]))
print(" Predicted Cats:") 
print(doc2.cats)
print("=========================================================")
doc2 = nlp_textcat(test_texts[5])
print("Text: "+ test_texts[5])
print(" Orig Cat:"+ str(test_cats[5]))
print(" Predicted Cats:") 
print(doc2.cats)

Text: Kingsgate Consolidated has completed a definitive feasibility study (DFS) at its Nueva Esperanza silver-gold heap leach project in Chile, which concluded that the development would require a capital cost of $140m.  The Nueva Esperanza project consists of three deposits, Arqueros, Teterita and Chimberos, located in northern Chile.  Kingsgate said that the DFS is based on a three million tonnes (Mt) a year heap leach operation with on-site power generation. The company intends to bring the project into production and create an operating base to identify additional areas of mineralisation within the current exploration license areas. The DFS found that the project is estimated to produce about 7.5 million ounces (Moz) a year of silver equivalent, based on the current reserves of 17.1Mt grading at 97g per tonne of silver and 0.27g per tonne of gold.  Based on the anticipated metallurgical recovery factors from test work to date, the projectâ€™s planned production is expected to be 40