In [1]:
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from tqdm.auto import tqdm
import pandas as pd
import os

In [2]:
def make_docs(file_path):
    
    # Pre-processing/cleaning up file
    
    train_data = pd.read_csv(file_path)
    train_data.dropna(axis=0, how='any', inplace=True)
    train_data['Num_words_text'] = train_data['content'].apply(lambda x:len(str(x).split()))
    mask = train_data['Num_words_text'] > 2
    train_data = train_data[mask]
    print(train_data['pred_ind'].value_counts())
    
    data = tuple(zip(train_data['content'].tolist(), train_data['pred_ind'].tolist()))
    print(data[1])
    
    # Data Mapping
    
    labels_mapping = {
        1: '01',
        2: '02',
        3: '03',
        4: '04',
        5: '05',
        6: '06',
        7: '07',
        8: '08',
        9: '09',
        10: '10',
        12: '12',
        13: '13',
        99: '99'
    }
    
    docs = []
    nlp = spacy.load("en_core_web_md") # Choose your model
    
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total=len(data)):
        doc.cats.clear()
        if label in labels_mapping:
            doc.cats[labels_mapping[label]] = 1
        else:
            doc.cats['Other'] = 1
        docs.append(doc)
        
    return docs, train_data

In [3]:
'''
# Load and process each training chunk separately
train_chunk_paths = [
    'test_textcat_project/textcat_data/training_data_chunk1.csv',
    'test_textcat_project/textcat_data/training_data_chunk2.csv',
    'test_textcat_project/textcat_data/training_data_chunk3.csv'
]
train_chunk_output_dir = 'test_textcat_project/textcat_data/train_chunks'
os.makedirs(train_chunk_output_dir, exist_ok=True)

for i, chunk_path in enumerate(train_chunk_paths):
    chunk_docs, _ = make_docs(chunk_path)

    # Save the training chunk as a separate .spacy file
    train_chunk_doc_bin = DocBin(docs=chunk_docs)
    chunk_output_path = os.path.join(train_chunk_output_dir, f'textcat_train_{i}.spacy')
    train_chunk_doc_bin.to_disk(chunk_output_path)
'''

# Load and process the train data
train_docs, train_data = make_docs('test_textcat_project/textcat_data/training_data.csv')

# Save the train documents as a .spacy file
train_doc_bin = DocBin(docs=train_docs)
train_doc_bin.to_disk("test_textcat_project/textcat_data/textcat_train.spacy")

# Load and process the test data
test_docs, test_data = make_docs('test_textcat_project/textcat_data/test_data.csv')

# Save the test documents as a .spacy file
test_doc_bin = DocBin(docs=test_docs)
test_doc_bin.to_disk("test_textcat_project/textcat_data/textcat_test.spacy")

3     4381
9     3548
10    3478
8     3431
1     3353
6     3320
5     3258
12    3199
13    2885
7     2743
4     2686
2     2462
99     830
Name: pred_ind, dtype: int64
('As President Biden met with Saudi Arabian leaders to discuss his request to increase oil production and lower U.S. gasoline prices on Friday, global prices rose while those back in the states dropped.    Global benchmark Brent crude for September delivery rose $2.06 and closed back up over $100 at $101.16 a barrel on ICE Futures Europe.    In the U.S., West Texas Intermediate crude for August delivery dropped $1.81 to $97.59 a barrel on the New York Mercantile Exchange.    U.S. prices had risen earlier in the day but finished with the loss as analysts contend the concern over an anticipated interest rate hike by the Federal Reserve in two week lessened.    There remains worry the world economy is heading into a recession as steep interest rates are not only expected in the U.S. but in Canada, New Zealand, Chile, So

  0%|          | 0/39574 [00:00<?, ?it/s]

3     785
10    623
9     612
5     595
1     577
8     576
6     576
13    550
12    545
4     489
7     476
2     433
99    143
Name: pred_ind, dtype: int64
('By Christina L. Meyers and Jeffrey Collins, Associated Press COLUMBIA, South Carolina Ã¢â‚¬â€\x9d South Carolinaâ€™s state-owned utility paid $9 million in performance bonuses to executives of a private utility for two nuclear reactors that were never finished, according to the public utility and emails turned over to state and federal investigators. SCANA Corp. even billed taxpayer-supported Santee Cooper $3.2 million for bonuses in August, a month after the utilities abandoned 10 years of construction and planning for the reactors, according to the emails released by Gov. Henry Mc Masterâ€™s office on Wednesday. Santee Cooper refused to pay, utility spokeswoman Mollie Gore said. â€œI will not approve this invoice,â€\x9d Senior Vice President for Nuclear Energy Michael Crosby wrote in one email. â€œI may get over-ridden â€¦ bu

  0%|          | 0/6980 [00:00<?, ?it/s]

In [5]:
#! python -m spacy init fill-config ./test_textcat_project/base_config_multi.cfg ./test_textcat_project/textcat_config_multi.cfg

In [4]:
! python -m spacy train test_textcat_project/textcat_config.cfg --output ./test_textcat_project --paths.train test_textcat_project/textcat_data/textcat_train.spacy --paths.dev test_textcat_project/textcat_data/textcat_test.spacy

ℹ Saving to output directory: test_textcat_project
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['textcat']
ℹ Initial learn rate: 0.001
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.07        1.17    0.01
  0     200         13.54       33.12    0.33
  0     400         11.37       33.39    0.33
  0     600         10.79       44.68    0.45
  0     800          9.89       53.77    0.54
  0    1000          8.41       54.62    0.55
  0    1200          6.80       59.24    0.59
  0    1400         10.06       60.30    0.60
  0    1600          7.69       60.77    0.61
  0    1800          7.96       62.87    0.63
  0    2000          8.21       64.68    0.65
  0    2200          8.00       64.90    0.65
  0    2400          8.30       62.64    0.63
  0    2600          6.50       69.37    0.69
  0    2800          7.96       64.84    0.65
  0    3000          7.92       69.09    0.69
  0    3200          8.0

[2023-07-10 14:46:58,394] [INFO] Set up nlp object from config
[2023-07-10 14:46:58,408] [INFO] Pipeline: ['textcat']
[2023-07-10 14:46:58,411] [INFO] Created vocabulary
[2023-07-10 14:46:58,412] [INFO] Finished initializing nlp object
[2023-07-10 14:50:26,794] [INFO] Initialized pipeline components: ['textcat']


In [8]:
nlp_textcat = spacy.load('test_textcat_project/model-best')
test_texts = test_data['content'].tolist()
test_cats = test_data['pred_ind'].tolist()
test_src = test_data['src']

# Output's scores from highest to lowest

def sorted_cats(doc):
    sorted_cats = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)
    for cat, value in sorted_cats:
        print(f"{cat}: {value}")

article_index = 6201 # Index of the Article from test.csv
doc2 = nlp_textcat(test_texts[article_index])
print("Text: " + test_texts[article_index])
print("Orig Cat: " + str(test_cats[article_index]))
print("Predicted Cats:")
sorted_cats(doc2)


Text: Drilling at Mt Dimer has been extremely limited, with only four drill-holes completed in the last 25 years.     () (FRA: U9V) has appointed Ausdrill to undertake a reverse circulation (RC) drilling campaign at the Mt Dimer Mining Lease in Western Australia, to test below and along strike of the existing open pit, clearing the way for drilling to start by mid-February 2021.        This maiden RC drilling campaign is designed to take two to three weeks, comprising 15 to 25 holes for up to 3,300 metres.        Work at Mt Dimer hasbeen extremely limited, with only four drill-holes completed in the last 25 years.      Drilling will focus on confirming and extending gold mineralisation, targeting priority areas around historical intercepts comprising: 5 metres at 10.64 g / t gold;   6 metres at 13.3 g / t; and   19 metres at 3.4 g / t.      TSCs newly appointed CEO Simon Phillips said: With a drilling contractor now officially appointed for the Mt Dimer Mining Lease, our logistics team

In [6]:
def sorted_cats(doc):
    sorted_cats = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)
    return sorted_cats

# Generate predicted labels and scores
predicted_labels = []
predicted_scores = []

for text in test_texts:
    doc = nlp_textcat(text)
    sorted_categories = sorted_cats(doc)
    labels = []
    scores = []
    for i, (label, score) in enumerate(sorted_categories):
        labels.append(label)
        scores.append(score)
    labels.extend([None] * (13 - len(labels)))
    scores.extend([None] * (13 - len(scores)))
    predicted_labels.append(labels)
    predicted_scores.append(scores)

# Create a new DataFrame with predicted labels and scores
results_data = test_data.copy()
results_data[['predicted_label_{}'.format(i) for i in range(1, 14)]] = pd.DataFrame(predicted_labels)
results_data[['predicted_score_{}'.format(i) for i in range(1, 14)]] = pd.DataFrame(predicted_scores)

# Remove the "Num_words_text" column
results_data.drop("Num_words_text", axis=1, inplace=True)

# Save the results to a new CSV file, with separate columns for label and score
results_data.to_csv('test_textcat_project/textcat_data/textcat_results.csv', index=False)