In [2]:
import stanza
import nltk
from nltk.corpus import stopwords

stanza.download('en')
nltk.download('punkt')
nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 9.82MB/s]                    
2024-04-11 19:06:45 INFO: Downloaded file to /Users/omarmhawash/stanza_resources/resources.json
2024-04-11 19:06:45 INFO: Downloading default packages for language: en (English) ...
2024-04-11 19:06:46 INFO: File exists: /Users/omarmhawash/stanza_resources/en/default.zip
2024-04-11 19:06:50 INFO: Finished downloading models and saved to /Users/omarmhawash/stanza_resources
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/omarmhawash/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/omarmhawash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#* pipeline for lemmatization and tokenization 
#? required: ['lemma', 'tokenize', 'mwt']
processors = 'tokenize, lemma, mwt'

tkn = stanza.Pipeline(lang='en', processors=processors, use_gpu=False)

#* set of stop words
stop_words = set(stopwords.words('english'))

2024-04-11 19:06:51 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 7.59MB/s]                    
2024-04-11 19:06:51 INFO: Downloaded file to /Users/omarmhawash/stanza_resources/resources.json
2024-04-11 19:06:51 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| lemma     | combined_nocharlm |

2024-04-11 19:06:51 INFO: Using device: cpu
2024-04-11 19:06:51 INFO: Loading: tokenize
2024-04-11 19:06:52 INFO: Loading: mwt
2024-04-11 19:06:52 INFO: Loading: lemma
2024-04-11 19:06:52 INFO: Done loading processors!


#### 2. Preparing dataset:
**a.** formatting `classes` as a dict of text files.

In [4]:
import os

documents = os.listdir('data/training')
classes = {}

for i in range(len(documents)):
  if documents[i] != '.DS_Store':
    classes[documents[i]] = {
      'files' : os.listdir(f'data/training/{documents[i]}'),
    }

# example output
t_class = 'tin'
print(f't_class',classes[t_class])

t_class {'files': ['0007261', '0000552', '0005647', '0001212', '0006258', '0009266', '0004855', '0005514', '0000413', '0000220', '0005308', '0005901', '0005895', '0002871', '0005261', '0005894', '0007882', '0008291']}


#### 2. Preparing dataset:
**b.** creating a `train` dataframe. Also `texts` for building the vocabulary set.

In [5]:
import re

train = []
texts = []

for c in classes:
  for idx, f in enumerate(classes[c]['files']):
    with open(f'data/training/{c}/{f}', 'r') as file:
      filedata = file.read()
      mod_data = filedata.replace('\n', ' ').lower() #? lowercase here
      split_mod_data = re.split('[ ,.\'\"><]+', mod_data)
      filtered_data = ' '.join([w for w in split_mod_data if (not w in stop_words) and (len(w) > 1)])
      texts.append(filtered_data)
      train.append({'class': c, 'text': filtered_data })
    if idx > 1000:
      break

train[0] #? sample output

{'class': 'tin',
 'text': 'thai tin exports fall february bangkok march 27 thailand exported 120 tonnes tin metal february 816 tonnes previous month 140 tonnes year ago mineral resources department said said major buyers last month britain japan netherlands west germany'}

In [6]:
# #* creating train dataframe
import pandas as pd
train_df = pd.DataFrame(train)

# print(train_df['class'].value_counts())
train_df['text'].head(2) #? sample output

0    thai tin exports fall february bangkok march 2...
1    zaire accepts tin-export quota atpc says kuala...
Name: text, dtype: object

#### 3.	Tokenization and Vocabulary set extraction

In [7]:
# #* formatting to 'stanza' documents
in_docs = [stanza.Document([], text=d) for d in texts]

#* tokenization pipeline
tkn_docs = tkn(in_docs) #? time consuming...~ 4-6mins

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x111ccda10>>
Traceback (most recent call last):
  File "/Users/omarmhawash/Library/Python/3.11/lib/python/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

new_lemmas = [[token.lemma_ for token in nlp(d)] for d in texts]
new_flat_lemmas = [lemma for sublist in new_lemmas for lemma in sublist]

new_flat_lemmas[:10] #? sample output

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

#* taking out tokens and flattening the sentences
lemmas = [[word.lemma for word in sentence.words] for doc in tkn_docs for sentence in doc.sentences] 
flat_lemma = [item for sublist in lemmas for item in sublist]

#* creating a count vectorizer
cn_vec = CountVectorizer()
cn = cn_vec.fit_transform(flat_lemma)

#? counting lemmas then finding the total words and vocab size
all_words = len(flat_lemma)
all_vocab = len(cn_vec.get_feature_names_out())

print(f'All words: {all_words}, All vocab: {all_vocab}')

All words: 1202487, All vocab: 25123


In [None]:
# print last 100 words
print(flat_lemma[:100])

['thai', 'tin', 'export', 'fall', 'february', 'bangkok', 'march', '27', 'Thailand', 'exported', '120', 'tonne', 'tin', 'metal', 'february', '816', 'tonne', 'previous', 'month', '140', 'tonne', 'year', 'ago', 'mineral', 'resource', 'department', 'say', 'say', 'major', 'buyer', 'last', 'month', 'britain', 'japan', 'netherlands', 'west', 'germany', 'zaire', 'accept', 'tin', '-', 'export', 'quota', 'atpc', 'say', 'kuala', 'lumpur', 'march', 'zaire', 'agree', 'limit', 'tin', 'export', '736', 'tonne', '12', 'month', 'march', 'line', 'association', 'tin', 'produce', 'country', '(', 'atpc', ')', 'plan', 'curb', 'export', 'atpc', 'say', 'atpc', 'executive', 'director', 'victor', 'siaahan', 'tell', 'reuter', 'receive', 'telex', 'zaire', 'indicate', 'willingess', 'take', 'part', 'plan', 'limit', 'total', 'atpc', 'export', '96', '000', 'tonne', 'year', 'march', 'siaahan', 'say', 'zaire', 'expect', 'produce']


#### 4. Building estimating model using naive bayes classifier method

In [None]:
# #* finding priors by:
#? (class count according to total classes)
#? then normalizing (0-1 scale)
priors = train_df['class'].value_counts(normalize=True)

#* finding the total count of each word across each class
vec = CountVectorizer()

def class_words(c):
  '''
  returns a dataframe of word counts for each class

  c: dataframe. columns: [`class`, `text`]
  '''
  counts = vec.fit_transform(c['text'])
  word_count = counts.sum(axis=0)
  return pd.DataFrame(word_count, columns=vec.get_feature_names_out())

df_word_count = train_df.groupby('class').apply(class_words) #? takes about 30 seconds

  df_word_count = train_df.groupby('class').apply(class_words) #? takes about 30 seconds


In [None]:
# #* creating a dictionary of words and their probabilities
scale = 1 #? scale-up numbers factor

word_prob_cache = {}

def word_prob(word, c):
  if (word, c) in word_prob_cache:
    return word_prob_cache[(word, c)]

  try:
    word_sum = df_word_count[word][c].sum()
  except:
    word_sum = 0
  try:
    all_sum = df_word_count[word].sum()
  except:
    all_sum = all_words
  res = (word_sum + 1) * scale / (all_sum + all_vocab)

  word_prob_cache[(word, c)] = res
  return res

def prop_sentence(sentence, c):
  '''
  returns the probability of a sentence given a class
  
  sentence: str. sentence to find probability
  c: str. class to find probability
  '''
  words = sentence.split(' ')
  prob = 1
  for word in words:
    prob *= word_prob(word, c)
  return prob * priors[c]

def max_class(sentence)-> str:
  '''
  returns the class with the highest probability given a sentence
  
  sentence: str. sentence to find probability
  '''
  mod_data = sentence.replace('\n', ' ').lower() #? lowercase here
  split_mod_data = re.split('[ ,.\'\"><]+', mod_data)
  filtered_data = ' '.join([w for w in split_mod_data if (not w in stop_words) and (len(w) > 1)])
  new_tokens = tkn(filtered_data)
  new_sent = ' '.join([word.lemma for word in new_tokens.sentences[0].words])
  probs = {}
  for c in train_df['class'].unique():
    probs[c] = prop_sentence(new_sent, c)
  max_class = max(probs, key=probs.get)
  return max_class

#? sample output
sentence = 'The president is a great leader'
max_class(sentence)

'acq'

#### 5. Model evaluation: Macro-averaged mean score

In [None]:
 #* new test data
all_test = os.listdir('data/test')

test_classes = {}
for i in range(len(all_test)):
  if all_test[i] != '.DS_Store':
    test_classes[all_test[i]] = {
      'files' : os.listdir(f'data/test/{all_test[i]}'),
    }

all_test = []
for c in test_classes:
  for idx, f in enumerate(test_classes[c]['files']):
    with open(f'data/test/{c}/{f}', 'r', encoding='latin') as file:
      filedata = file.read()
      mod_data = filedata.replace('\n', ' ').lower()#? lowercase here
      split_mod_data = re.split('[ ,.\'\"><]+', mod_data)
      filtered_data = ' '.join([w for w in split_mod_data if (not w in stop_words) and (len(w) > 1)])
      all_test.append({'class': c, 'text': filtered_data })

all_test_df = pd.DataFrame(all_test)

In [None]:
#* testing the model using small test_df
# stest_df = all_test_df.sample(500) #? takes about a minute to run
stest_df = all_test_df #? takes about a minute to run

In [None]:
#! really long time to run
stest_df['pred'] = stest_df['text'].apply(max_class)

In [None]:
# using naive bayes from sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

#* creating a count vectorizer
new_cn_vec = CountVectorizer()
new_cn = new_cn_vec.fit_transform(train_df['text'])

#* splitting the data
X_train, X_test, y_train, y_test = train_test_split(new_cn, train_df['class'], test_size=0.001, random_state=42)

#* getting test data
X_test = new_cn_vec.transform(stest_df['text'])
y_test = stest_df['class']

#* fitting the model
nb = MultinomialNB()
nb.fit(X_train, y_train)

#* predicting the test data
y_pred = nb.predict(X_test)

#* accuracy and f1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Accuracy: {accuracy*100}%, F1: {f1*100}%')

Accuracy: 68.53876739562624%, F1: 13.396248702592306%


In [None]:
from sklearn.metrics import f1_score, accuracy_score

f1 = f1_score(stest_df['class'], stest_df['pred'], average='macro')
acc = accuracy_score(stest_df['class'], stest_df['pred'])

print(f'Mean Accuracy: {acc * 100}%')
print(f'F1 Score: {f1 * 100}%')
stest_df.sample(10)

Mean Accuracy: 46.29721669980119%
F1 Score: 3.774639744725782%


Unnamed: 0,class,text,pred
330,soybean,supply/demand detailed usda washington april a...,grain
2099,grain,china daily says vermin eat 7-12 pct grain sto...,unknown
3264,acq,gulf western gw ups interest network new york ...,acq
12,l-cattle,west virginia free two major cattle diseases w...,unknown
3922,crude,coastal raises crude oil postings 50 cts/bbl y...,crude
1400,earn,altus bank alts 3rd qtr net mobile ala oct 19 ...,earn
3531,acq,chase corp makes offer entregrowth wellington ...,tin
3214,acq,gander gndr buys western wear retailer wilmot ...,acq
1065,earn,dynamics research corp drco 1st qtr march 21 w...,earn
800,earn,westport bancorp webat 3rd qtr net westport co...,earn


score p1:
- 72.2
- 20.96

7.	EXTRA: think of new features that can be included into the Naïve Bayes Classifier, which contribute to improve the system performance.