#### 1. setting up packages and loading data 

In [52]:
import stanza
import nltk
from nltk.corpus import stopwords
import spacy
import gensim.downloader as api


stanza.download('en')
nltk.download('punkt')
nltk.download('stopwords')

word2vec = api.load('word2vec-google-news-300') #? 3M vocab - 1.66GB
fastText = api.load('fasttext-wiki-news-subwords-300') #? 1M vocab - 0.96GB 
glove = api.load('glove-wiki-gigaword-300') #? 400K vocab - 0.37GB

spacy_nlp = spacy.load('en_core_web_sm')
stanza_pipe = stanza.Pipeline(lang='en', processors='tokenize, lemma, mwt', use_gpu=False)
stop_words = set(stopwords.words('english'))

info = []
test_path,train_path = 'data/testing/', 'data/training/'

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-12 14:19:06 INFO: Downloaded file to C:\Users\omarm\stanza_resources\resources.json
2024-04-12 14:19:06 INFO: Downloading default packages for language: en (English) ...
2024-04-12 14:19:07 INFO: File exists: C:\Users\omarm\stanza_resources\en\default.zip
2024-04-12 14:19:09 INFO: Finished downloading models and saved to C:\Users\omarm\stanza_resources
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omarm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omarm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2024-04-12 14:22:38 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-04-12 14:22:39 INFO: Downloaded file to C:\Users\omarm\stanza_resources\resources.json
2024-04-12 14:22:39 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| lemma     | combined_nocharlm |

2024-04-12 14:22:39 INFO: Using device: cpu
2024-04-12 14:22:39 INFO: Loading: tokenize
2024-04-12 14:22:39 INFO: Loading: mwt
2024-04-12 14:22:39 INFO: Loading: lemma
2024-04-12 14:22:39 INFO: Done loading processors!


#### 2. Prepare Data for Modeling

In [9]:
import os, re, time
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()

def get_time():
  return time.ctime().split(' ')[3]

def debug_msg(str):
  print(f"{len(info)+1}- {get_time()} {str}")
  info.append(f"{len(info)+1}- {get_time()} {str}")

def get_classes(train_data, loc) -> dict[str, dict]:
  class_results = {}
  for i in range(len(train_data)):
    if train_data[i] != '.DS_Store':
      class_results[train_data[i]] = {
        'files' : os.listdir(f'{loc}{train_data[i]}'),
      }
  # debug_msg(f"number of {loc} classes: {len(class_results)}")
  return class_results

def filter_text(data) -> str:
  mod_data = data.replace('\n', ' ').lower()
  split_mod_data = re.split('[ ,.\'\"><]+', mod_data)
  filtered_data = ' '.join([w for w in split_mod_data if (not w in stop_words) and (len(w) > 1)])
  return filtered_data

def get_docs(classes, loc, limit_ratio=1):
  data = []
  for c in classes:
    if limit_ratio != 1:
      class_limit = len(classes[c]['files']) * limit_ratio
    for idx, f in enumerate(classes[c]['files']):
      with open(f'{loc}{c}/{f}', 'r', encoding='latin-1') as file:
        filtered_data = filter_text(file.read())
        data.append({'class': c, 'text': filtered_data })
      if limit_ratio != 1 and idx > class_limit:
        break
  if limit_ratio != 1:
    debug_msg(f"using {limit_ratio*100}% of {loc} data. number of documents: {len(data)}")
  else:
    debug_msg(f"using all of {loc} data. number of documents: {len(data)}")
  return data

def stanza_tokenizer(data) -> dict[list[str], dict]:
  debug_msg(f"START: tokenizing text using stanza...")

  stanza_docs = [stanza.Document([], text=d) for d in [t['text'] for t in data]]
  tkn_docs = stanza_pipe(stanza_docs) #? time consuming...
  tokens = [[word.lemma for word in sentence.words] for doc in tkn_docs for sentence in doc.sentences] 
  flat_tokens = [item for sublist in tokens for item in sublist]
  count_vec = vec.fit(flat_tokens).vocabulary_

  debug_msg(f"COMPLETED: tokenizing text using stanza ({len(flat_tokens)} tokens), vocab size: {len(count_vec)} tokens")
  return { 'tokens': flat_tokens, 'counts_vec': count_vec }

def spacy_tokenizer(data) -> dict[list[str], dict]:
  debug_msg(f"START: tokenizing text using spacy...")

  spacy_docs = [spacy_nlp(t['text']) for t in tqdm(data)] #? time consuming...
  tokens = [[token.text for token in doc] for doc in spacy_docs]
  flat_tokens = [item for sublist in tokens for item in sublist]
  count_vec = vec.fit(flat_tokens).vocabulary_

  debug_msg(f"COMPLETED: tokenizing text using spacy ({len(flat_tokens)} tokens), vocab size: {len(count_vec)} tokens")
  return {'tokens': flat_tokens, 'counts_vec': count_vec }

def get_lemma_docs(data) -> list[dict]:
  lemma_docs = []
  for d in tqdm(data):
    f_doc = filter_text(d['text'])
    lemmas = spacy_nlp(f_doc)
    lemma_doc = ' '.join([token.lemma_ for token in lemmas])
    lemma_docs.append({'class': d['class'], 'text': lemma_doc})

  debug_msg(f"COMPLETED: lemmatizing text")
  return lemma_docs

def docs_from_path(path, limit_per) -> pd.DataFrame:
  data_path = os.listdir(path)
  classes = get_classes(data_path, loc=path)
  docs = get_docs(classes, loc=path, limit_ratio=limit_per)
  # debug_msg(f"START: lemmatizing docs text...") 
  # lemma_docs = get_lemma_docs(docs) #* wasn't helpful

  return docs

#### Setting training and testing data

In [10]:
train = docs_from_path(train_path, limit_per=1)
test = docs_from_path(test_path, limit_per=1)

1- 13:27:08 using all of data/training/ data. number of documents: 11413
2- 13:27:09 using all of data/testing/ data. number of documents: 4024


#### Model 1: Naive Bayes [from scratch]

In [18]:
from sklearn.metrics import f1_score, accuracy_score

def naive_bayes(train_docs, test_docs):
  debug_msg(f"START: training naive bayes model...")
  train_df = pd.DataFrame(train_docs)
  tokens_data = spacy_tokenizer(train_docs) #* spacy tokenizer
  # tokens_data = stanza_tokenizer(train_docs) #* stanza tokenizer
  tokens_size, vocab_size = len(tokens_data['tokens']), len(tokens_data['counts_vec'])
  word_prob_cache = {}
  
  def class_words(c):
    counts = vec.fit_transform(c['text'])
    word_count = counts.sum(axis=0)
    df = pd.DataFrame(word_count, columns=vec.get_feature_names_out())
    return df

  def word_prob(word, c):
    if (word, c) in word_prob_cache:
      return word_prob_cache[(word, c)]
    try:
      word_sum = df_word_count[word][c].sum()
    except:
      word_sum = 0
    try:
      all_sum = df_word_count[word].sum()
    except:
      all_sum = tokens_size
    res = (word_sum + 1) / (all_sum + vocab_size)

    word_prob_cache[(word, c)] = res
    return res

  def prop_sentence(sentence, c):
    words = sentence.split(' ')
    prob = 1
    for word in words:
      prob *= word_prob(word, c)
    return prob * priors[c]

  def max_class(doc)-> str:
    new_doc = filter_text(doc)
    # tokens = spacy_nlp(filtered_doc) #* spacy tokenizer -1
    # new_doc = ' '.join([token.text for token in tokens]) #* spacy tokenizer -2
    
    # new_tokens = stanza_pipe(filtered_doc) #* stanza tokenizer -1
    # new_doc = ' '.join([word.lemma for word in new_tokens.sentences[0].words]) #* stanza tokenizer -2

    probs = {}
    for c in train_df['class'].unique():
      probs[c] = prop_sentence(new_doc, c)
    max_class = max(probs, key=probs.get)
    return max_class

  debug_msg(f"Preparing Naive Bayes Parameters...")
  priors = train_df['class'].value_counts(normalize=True)
  df_word_count = train_df.groupby('class').apply(class_words)
  
  debug_msg(f"Applying naive bayes model on Test Data...")
  result_df = pd.DataFrame(test_docs)
  tqdm.pandas()
  result_df['predicted'] = result_df['text'].progress_apply(max_class)
  
  debug_msg(f"COMPLETED: testing naive bayes model...")
  f1 = round(f1_score(result_df['class'], result_df['predicted'], average='macro')*100, 2)
  acc = round(accuracy_score(result_df['class'], result_df['predicted'])*100, 2)
  return { 'model': 'Manual NB', 'F1 Score': f1, 'Avg Accuracy': acc }

In [12]:
# print(f"sample class files: {all_classes['tin']}")
# print(f"train sample: {train[0]}")
# print(f"test sample: {test[0]}")
# print(f"stanza train sample: {len(stanza_data)}")
# print(f"spacy tokens: {spacy_data['tokens'][:10]},\nvocab: {spacy_data['vocab']}")

In [26]:
scratch_nb = naive_bayes(train, test)
scratch_nb

3- 02:48:58 START: training naive bayes model...
4- 02:48:58 START: tokenizing text using spacy...


100%|██████████| 11413/11413 [02:49<00:00, 67.40it/s]


5- 02:51:50 COMPLETED: tokenizing text using spacy (1414367 tokens), vocab size: 29578 tokens
6- 02:51:51 Preparing Naive Bayes Parameters...
7- 02:52:18 Applying naive bayes model on Test Data...


100%|██████████| 4024/4024 [04:27<00:00, 15.05it/s]


8- 02:56:45 COMPLETED: testing naive bayes model...


{'model': 'Manual NB', 'F1 Score': 3.63, 'Avg Accuracy': 48.56}

#### Model 1: Naive bayes [sklearn] using CountVectorizer

In [34]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

train_df = pd.DataFrame(train)
test_df = pd.DataFrame(test)

#* splitting the data
vec_fit = vec.fit_transform(train_df['text'])
X_train, X_test, y_train, y_test = train_test_split(vec_fit, train_df['class'], test_size=0.001, random_state=42)
X_test = vec.transform(test_df['text'])
y_test = test_df['class']

#* fitting the model
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

accuracy = round(accuracy_score(y_test, y_pred) *100, 2)
f1 = round(f1_score(y_test, y_pred, average='macro') *100, 2)

nb_results = { 'model': 'Sklearn NB', 'F1 Score': f1, 'Avg Accuracy': accuracy }
nb_results

{'model': 'Sklearn NB', 'F1 Score': 13.09, 'Avg Accuracy': 68.32}

#### Model 1: Naive bayes [sklearn] using TF-IDF

In [35]:
# trying tf-idf with naive bayes
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
vec_fit = tfidf.fit_transform(train_df['text'])

X_train, X_test, y_train, y_test = train_test_split(vec_fit, train_df['class'], test_size=0.001, random_state=1234)
X_test = tfidf.transform(test_df['text'])
y_test = test_df['class']

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

accuracy = round(accuracy_score(y_test, y_pred)*100, 2)
f1 = round(f1_score(y_test, y_pred, average='macro')*100, 2)

tfidf_nb_results = { 'model': 'TF-IDF NB', 'F1 Score': f1, 'Avg Accuracy': accuracy }
tfidf_nb_results

{'model': 'TF-IDF NB', 'F1 Score': 3.97, 'Avg Accuracy': 53.23}

#### ???

In [48]:
train_df = pd.DataFrame(train)
test_df = pd.DataFrame(test)

In [55]:
from nltk.util import ngrams
from nltk import word_tokenize
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

#* returns a sequence of n-gram strings from string as list
def gen_ngrams(text, grams=2) -> list:
  n_grams = ngrams(word_tokenize(text), grams)
  n_grams_list = [' '.join(g) for g in n_grams]
  return n_grams_list

#* returns a list of lemmatized tokens
def ngrams_prep(text, n=1) -> list:
  words = [w for w in word_tokenize(text.lower())]
  fn_grams = gen_ngrams(' '.join(words), n)
  # lemmas = [spacy_nlp(w)[0].lemma_ for w in fn_grams]
  return fn_grams

def docs_to_vecs(docs) -> list:
  docs_grams = [ngrams_prep(doc, n=1) for doc in docs]
  docs_vecs = []

  debug_msg(f"Preparing word vectors for {len(docs_grams)} docs...")
  for doc in tqdm(docs):
    words_vec = []
    for word in doc:
      if word in fastText:
        words_vec.append(fastText[word]) #* adding train word vector to the list

    if len(words_vec) == 0: #* if no word vector found, add a zero vector
      words_vec.append(np.zeros(300))
    docs_vecs.append(np.mean(words_vec, axis=0)) #* combine all word_vecs using mean between them
  return docs_vecs

X_train = docs_to_vecs(train_df['text'])
X_test = docs_to_vecs(test_df['text'])

y_train = train_df['class']
y_test = test_df['class']

classifier = LogisticRegression()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

f1_score = round(f1_score(y_test, y_pred, average='macro')*100, 2)
accuracy = round(accuracy_score(y_test, y_pred)*100, 2)

logistic_results = { 'model': 'Logistic Regression', 'F1 Score': f1_score, 'Avg Accuracy': accuracy }
logistic_results

4- 14:54:57 Preparing word vectors for 11413 docs...


100%|██████████| 11413/11413 [00:08<00:00, 1364.21it/s]


5- 14:55:06 Preparing word vectors for 4024 docs...


100%|██████████| 4024/4024 [00:02<00:00, 1368.53it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'model': 'Logistic Regression', 'F1 Score': 1.57, 'Avg Accuracy': 39.49}

In [16]:
spacy = {
  'model': 'Manual NB', 'F1 Score': 3.57, 'Avg Accuracy': 50.67, 'time': '07:32', 'tokens': 29533,
}

spacy_exclamation = { 
  'model': 'Manual NB', 'F1 Score': 3.63, 'Avg Accuracy': 48.56
}

stanza = {
  'model': 'Manual NB', 'F1 Score': 3.48, 'Avg Accuracy': 49.65,'time': '10:09','tokens': 25123,
}

sklearn = {
  'model': 'Sklearn NB', 'F1 Score': 13.37, 'Avg Accuracy': 68.59, 'time': '00:01'
}

# applied lemmas to use text
sklearn_lemma_docs = {
  'model': 'Sklearn NB', 'F1 Score': 13.09, 'Avg Accuracy': 68.32
}

tfidf = {
  'model': 'TF-IDF NB', 'F1 Score': 4.08, 'Avg Accuracy': 54.10
}
# all these need re-running
{'model': 'Logistic Regression - Glove', 'F1 Score': 2.98, 'Avg Accuracy': 43.34}

{'model': 'Logistic Regression - w2v ', 'F1 Score': 1.91, 'Avg Accuracy': 40.95}

{'model': 'Logistic Regression - fasttext', 'F1 Score': 1.57, 'Avg Accuracy': 39.49}