In [2]:
import os
import numpy as np
import gensim.downloader as api
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
import os

In [3]:
train_data = './training'
test_data = './test'

# Data preprocessing

In [4]:
def get_stopwords():
    """ Return a set of English stopwords. """
    return set(stopwords.words('english'))

In [5]:
def tokenize(text):
    """
    Tokenize the given text.

    Parameters:
    - text (str): The text to tokenize.

    Returns:
    - tokens (list): A list of tokens.
    """
    return word_tokenize(text)

In [6]:
def initialize_stemmer():
    """ Return an initialized Porter Stemmer. """
    return PorterStemmer()

In [7]:
def filter_and_stem(tokens, stop_words, stemmer):
    """ 
    Filter non-alphabetic tokens, remove stopwords, and apply stemming.
    
    Parameters:
    - tokens (list): List of tokens.
    - stop_words (set): Set of stopwords.
    - stemmer (PorterStemmer): Initialized stemmer.

    Returns:
    - list: Filtered and stemmed tokens.
    """
    return [stemmer.stem(token.lower()) for token in tokens if token.isalpha() and token.lower() not in stop_words]

In [8]:
def read_file(file_path):
    """ Read and return the content of the file at the given path. """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        return file.read()

In [9]:
def process_text(text, stop_words, stemmer):
    """
    Process the input text, tokenize, filter stopwords and apply stemming.

    Parameters:
    - text (str): Text to process.
    - stop_words (set): Set of stopwords.
    - stemmer (PorterStemmer): Initialized stemmer.

    Returns:
    - tuple: (raw tokens as string, filtered tokens as string)
    """
    tokens = tokenize(text)
    raw_tokens = ' '.join(tokens)
    filtered_tokens = ' '.join(filter_and_stem(tokens, stop_words, stemmer))
    return raw_tokens, filtered_tokens

In [10]:
def process_data(data_path):
    """
    Process data from a given data path by reading files and processing text.

    Parameters:
    - data_path (str): The path to the data directory.

    Returns:
    - tuple of lists: (raw_data_tokens, filtered_data_tokens, data_labels)
    """
    stop_words = get_stopwords()
    stemmer = initialize_stemmer()
    raw_data_tokens = []
    filtered_data_tokens = []
    data_labels = []

    for label in os.listdir(data_path):
        label_path = os.path.join(data_path, label)
        if os.path.isdir(label_path):
            for filename in os.listdir(label_path):
                file_path = os.path.join(label_path, filename)
                text = read_file(file_path)
                raw_tokens, filtered_tokens = process_text(text, stop_words, stemmer)
                raw_data_tokens.append(raw_tokens)
                filtered_data_tokens.append(filtered_tokens)
                data_labels.append(label)

    return raw_data_tokens, filtered_data_tokens, data_labels

* Split data and filterd it

In [11]:
train_raw_tokens, train_filtered_tokens, train_cls = process_data(train_data)
test_raw_tokens, test_filtered_tokens, test_cls = process_data(test_data)

In [12]:
print('Length Of Train tokens: ', len(train_filtered_tokens))
print('Length Of Train classes: ', len(train_cls),'\n')
print('Length Of Test tokens: ', len(test_filtered_tokens))
print('Length Of Test classes: ', len(test_cls))

Length Of Train tokens:  11413
Length Of Train classes:  11413 

Length Of Test tokens:  4024
Length Of Test classes:  4024


In [13]:
train_raw_tokens[:2]

["COMPUTER TERMINAL SYSTEMS < CPML > COMPLETES SALE COMMACK , N.Y. , Feb 26 - Computer Terminal Systems Inc said it has completed the sale of 200,000 shares of its common stock , and warrants to acquire an additional one mln shares , to < Sedio N.V. > of Lugano , Switzerland for 50,000 dlrs . The company said the warrants are exercisable for five years at a purchase price of .125 dlrs per share . Computer Terminal said Sedio also has the right to buy additional shares and increase its total holdings up to 40 pct of the Computer Terminal 's outstanding common stock under certain circumstances involving change of control at the company . The company said if the conditions occur the warrants would be exercisable at a price equal to 75 pct of its common stock's market price at the time , not to exceed 1.50 dlrs per share . Computer Terminal also said it sold the technolgy rights to its Dot Matrix impact technology , including any future improvements , to < Woodco Inc > of Houston , Tex . f

In [14]:
train_filtered_tokens[:2]

['comput termin system cpml complet sale commack feb comput termin system inc said complet sale share common stock warrant acquir addit one mln share sedio lugano switzerland dlr compani said warrant exercis five year purchas price dlr per share comput termin said sedio also right buy addit share increas total hold pct comput termin outstand common stock certain circumst involv chang control compani compani said condit occur warrant would exercis price equal pct common market price time exceed dlr per share comput termin also said sold technolgi right dot matrix impact technolog includ futur improv woodco inc houston tex dlr said would continu exclus worldwid license technolog woodco compani said move part reorgan plan would help pay current oper cost ensur product deliveri comput termin make comput gener label form tag ticket printer termin',
 'ohio mattress omt may lower qtr net cleveland feb ohio mattress co said first quarter end februari profit may mln dlr ct share earn first quar

In [15]:
test_raw_tokens[:2]

["SUMITOMO BANK AIMS AT QUICK RECOVERY FROM MERGER < AUTHOR > By Linda Sieg , Reuters < /AUTHOR > TOKYO , April 8 - Sumitomo Bank Ltd < SUMI.T > is certain to lose its status as Japan 's most profitable bank as a result of its merger with the Heiwa Sogo Bank , financial analysts said . Osaka-based Sumitomo , with desposits of around 23.9 trillion yen , merged with Heiwa Sogo , a small , struggling bank with an estimated 1.29 billion dlrs in unrecoverable loans , in October . But despite the link-up , Sumitomo President Koh Komatsu told Reuters he is confident his bank can quickly regain its position . `` We 'll be back in position in first place within three years , '' Komatsu said in an interview . He said that while the merger will initially reduce Sumitomo 's profitability and efficiency , it will vastly expand Sumitomo 's branch network in the Tokyo metropolitan area where it has been relatively weak . But financial analysts are divided on whether and how quickly the gamble will pa

In [16]:
test_filtered_tokens[:2]

['sumitomo bank aim quick recoveri merger author linda sieg reuter tokyo april sumitomo bank ltd certain lose statu japan profit bank result merger heiwa sogo bank financi analyst said sumitomo desposit around trillion yen merg heiwa sogo small struggl bank estim billion dlr unrecover loan octob despit sumitomo presid koh komatsu told reuter confid bank quickli regain posit back posit first place within three year komatsu said interview said merger initi reduc sumitomo profit effici vastli expand sumitomo branch network tokyo metropolitan area rel weak financi analyst divid whether quickli gambl pay said sumitomo may paid much heiwa sogo view smaller bank larg debt other argu merger cost effect creat compar branch network scratch analyst agre bank aggress expand oversea enter lucr secur busi gear domest competit question wisdom move made bold move put everyth place larg hand said kleinwort benson ltd financi analyst simon smithson among sumitomo problem limit place move enter secur bus

# Label Encoding

* `LabelEncoder()` it's a process to converting categorical labels to numeerical values to use it in (SVM, NB, RF) algorithms.

In [17]:
# Initialize a LabelEncoder
label_encoder = LabelEncoder()

# Encode training labels
y_train = label_encoder.fit_transform(train_cls)

# Encode testing labels
y_test = label_encoder.transform(test_cls)

# Loading Word Embedding Models

* `gensim` Loading word embedding models `Word2Vec` , `Glove` from this library by api.

In [18]:
glove_model = api.load('glove-wiki-gigaword-300')

In [19]:
word2vec_model = api.load('word2vec-google-news-300')

In [20]:
def embed_text_average(texts, model):
    """
    Averages word vectors from the provided model for each text in a list.
    
    Parameters:
    texts (list of str): List of text documents.
    model (gensim models): Word embedding model with 'vector_size' and 'key_to_index'.
    
    Returns:
    numpy.ndarray: Array of averaged vectors. Returns zero vector for texts without known words.
    """
    
    vectors = []
    for text in texts:
        vector = np.zeros(model.vector_size)
        for word in text.split():
            if word in model.key_to_index:
                vector += model[word]
        vectors.append(vector / len(text.split()))
    return np.array(vectors)

# Classifier Evaluation - Word2Vec

* Final Result: The top one of results it's `Naive Bayes` with `0.22` F1-Score.

* Stepes I try to get greatest value with this algorithms:
    - RandomForest: 
         * First time with n_estimators = 100 and max_depth= 10. its give: `0.098`
        * Second time with n_estimators = 500 and max_depth = 20. it's given: `0.152`
        * Third time with n_estimators = 1000 and max_depth = 50. it's given: `0.158`

    - Support Vector Machine (SVM):
        * First time with kernal = `linear` its give: `0.1621`
        * Second time with kernal = `sigmoid` its give: `0.1154`
        * Third time with kernal = `rbf` its give: `0.18384`
        * Forth time with kernal = `rbf` but with `Standard Sacler` its give: `0.2139`
    - Naive Bayes:
        * `0.22`

In [21]:
X_train = embed_text_average(train_filtered_tokens, word2vec_model)

In [22]:
X_test = embed_text_average(test_filtered_tokens, word2vec_model)

In [23]:
# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=1000, max_depth=50, random_state=42, n_jobs=-1)

In [24]:
# Train the RandomForest model
rf_model.fit(X_train, y_train)

In [25]:
# Predict on the test set
y_pred = rf_model.predict(X_test)

In [26]:
rf_f1 = f1_score(y_test, y_pred, average='macro')
print(f"Random Forest - Word2vec: {rf_f1}")

Random Forest - Word2vec: 0.1521464581963259


* First time with n_estimators = 100 and max_depth= 10. its give:` 0.098`
* Second time with n_estimators = 500 and max_depth = 20. it's given: `0.152`
* Third time with n_estimators = 1000 and max_depth = 50. it's given: `0.158`

In [27]:
nb_model = GaussianNB()

In [28]:
nb_model.fit(X_train, y_train)

In [29]:
y_pred = nb_model.predict(X_test)

In [30]:
nb_f1 = f1_score(y_test, y_pred, average='macro')
print(f"Naive Bayes - Word2vec: {nb_f1}")

Naive Bayes - Word2vec: 0.22256349906474424


In [31]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
# Initialize the SVM classifier
# Common kernels are 'linear', 'poly', 'rbf', 'sigmoid'
svm_model = SVC(kernel='rbf', random_state=42)

# Train the model on your training data
svm_model.fit(X_train_scaled, y_train)

In [33]:
# Predict on the test set
y_pred = svm_model.predict(X_test_scaled)

In [34]:
SVM_f1 = f1_score(y_test, y_pred, average='macro')
print(f"SVM - Word2vec: {SVM_f1}")

SVM - Word2vec: 0.21393366665721245


* First time with kernal = `linear` its give: `0.1621`
* Second time with kernal = `sigmoid` its give: `0.1154`
* Third time with kernal = `rbf` its give: `0.18384`
* Forth time with kernal = `rbf` but with `Standard Sacler` its give: `0.2139`

# Classifier Evaluation - Glove

* Final Result: The top one of results it's `SVM` with `0.3146` F1-Score.

* Stepes I try to get greatest value with this algorithms:
    - RandomForest: 
        * First time with n_estimators = 100 and max_depth= 10. its give:` 0.090`
        * Second time with n_estimators = 500 and max_depth = 20. it's given: `0.172`
        * Third time with n_estimators = 800 and max_depth = 20. it's given: `0.161`
        * Third time with n_estimators = 1000 and max_depth = 30. it's given: `0.15889`

    - Support Vector Machine (`SVM`):
        * First time with kernal = `linear` its give: `0.2680`
        * Second time with kernal = `linear` but with `StandardScaler` its give: `0.3146`
        * Third time with kernal = `sigmoid` its give: `0.1236`
        * Forth time with kernal = `sigmoid`  but with `StandardScaler` its give: `0.1845`
        * Fifth time with kernal = `rbf` its give: `0.1863`
        * Sixth time with kernal = `rbf` but with `Standard Sacler` its give: `0.2208`
    - Naive Bayes:
        * For first time Naive Bayes: `0.262`
        * For Second time with `StandardScaler` Naive Bayes: `0.256`

In [35]:
X_train = embed_text_average(train_filtered_tokens, glove_model)
X_test = embed_text_average(test_filtered_tokens, glove_model)

In [36]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [37]:
# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=1000, max_depth=30, random_state=42, n_jobs=-1)

In [38]:
# Train the RandomForest model
rf_model.fit(X_train, y_train)

In [39]:
# Predict on the test set
y_pred = rf_model.predict(X_test)

In [40]:
rf_f1 = f1_score(y_test, y_pred, average='macro')
print(f"Random Forest - Glove: {rf_f1}")

Random Forest - Glove: 0.15889871563937982


* First time with n_estimators = 100 and max_depth= 10. its give: `0.090`
* Second time with n_estimators = 500 and max_depth = 20. it's given: `0.172`
* Third time with n_estimators = 800 and max_depth = 20. it's given: `0.161`
* Third time with n_estimators = 1000 and max_depth = 30. it's given: `0.15889`

In [41]:
# Initialize the SVM classifier
# Common kernels are 'linear', 'poly', 'rbf', 'sigmoid'
svm_model = SVC(kernel='rbf', random_state=42)

# Train the model on your training data
svm_model.fit(X_train_scaled, y_train)

In [42]:
# Predict on the test set
y_pred = svm_model.predict(X_test_scaled)

* First time with kernal = `linear` its give: `0.2680`
* Second time with kernal = `linear` but with `StandardScaler` its give: `0.3146`
* Third time with kernal = `sigmoid` its give: `0.1236`
* Forth time with kernal = `sigmoid`  but with `StandardScaler` its give: `0.1845`
* Fifth time with kernal = `rbf` its give: `0.1863`
* Sixth time with kernal = `rbf` but with `Standard Sacler` its give: `0.2208`

In [43]:
nb_model = GaussianNB()

In [44]:
nb_model.fit(X_train_scaled, y_train)

In [45]:
y_pred = nb_model.predict(X_test_scaled)

In [46]:
nb_f1 = f1_score(y_test, y_pred, average='macro')
print(f"Naive Bayes - Glove: {nb_f1}")

Naive Bayes - Glove: 0.2563162721118958


* For first time Naive Bayes: `0.262`
* For Second time with `StandardScaler` Naive Bayes: `0.256`

# TF-IDF Approach

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
# vectorizer = TfidfVectorizer()

# # Fit and transform the training data to compute TF-IDF
# X_train_tfidf = vectorizer.fit_transform(train_filtered_tokens)

# # Transform the testing data to TF-IDF
# X_test_tfidf = vectorizer.transform(test_filtered_tokens)

vectorizer = TfidfVectorizer(max_df=0.5, min_df=3, max_features=1000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(train_filtered_tokens)  
X_test_tfidf = vectorizer.transform(test_filtered_tokens)       

In [48]:
# Initialize Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_train_tfidf, y_train)

In [49]:
# Predict on the test set
y_pred_nb = nb_classifier.predict(X_test_tfidf)

In [50]:
# Calculate F1 Score
f1_nb = f1_score(y_test, y_pred_nb, average='macro')
print(f"Naive Bayes F1 Score: {f1_nb}")

Naive Bayes F1 Score: 0.12096221539785486


In [51]:
# Naive Bayes F1 Score: 0.04291959056715726
# Naive Bayes F1 Score: 0.12096221539785486 with bi-gram
# Naive Bayes F1 Score: 0.11715903377777519 with trigram

In [52]:
# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=1000, max_depth=20, random_state=42, n_jobs=-1)

In [53]:
# Train the RandomForest model
rf_model.fit(X_train_tfidf, y_train)

In [54]:
# Predict on the test set
y_pred = rf_model.predict(X_test_tfidf)

In [55]:
rf_f1 = f1_score(y_test, y_pred, average='macro')
print(f"Random Forest - TF-IDF: {rf_f1}")

Random Forest - TF-IDF: 0.12321199017150293


In [56]:
# Random Forest - TF-IDF: 0.0414273867832357
# Random Forest - TF-IDF: 0.12435263570666374
# Random Forest - TF-IDF: 0.12478155480786686
# Random Forest - TF-IDF: 0.12321199017150293

In [57]:
# Initialize the SVM classifier
svm_model = SVC(kernel='rbf', random_state=42)

# Train the model on your training data
svm_model.fit(X_train_tfidf, y_train)

In [58]:
# Predict on the test set
y_pred = svm_model.predict(X_test_tfidf)

In [59]:
SVM_f1 = f1_score(y_test, y_pred, average='macro')
print(f"SVM - TF-IDF: {SVM_f1}")

SVM - TF-IDF: 0.2360962733631947


In [60]:
# SVM - TF-IDF: 0.25990501069688515 - Linear
# SVM - TF-IDF: 0.2485707503239959 - Sigmoid
# SVM - TF-IDF: 0.2360962733631947 - rbf

# LSTM

In [61]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
encoded_train_labels = label_encoder.fit_transform(train_cls)
y_train = to_categorical(encoded_train_labels)

encoded_test_labels = label_encoder.transform(test_cls)
y_test = to_categorical(encoded_test_labels)

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_filtered_tokens)

X_train_sequences = tokenizer.texts_to_sequences(train_filtered_tokens)
X_test_sequences = tokenizer.texts_to_sequences(test_filtered_tokens)

X_train_padded = pad_sequences(X_train_sequences, maxlen=100)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100)

model = Sequential([
    Embedding(input_dim=5000, output_dim=128),
    SpatialDropout1D(0.3),
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    Dense(units=len(label_encoder.classes_), activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train_padded, y_train, batch_size=64, epochs=5, validation_split=0.1)

predictions = model.predict(X_test_padded)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(y_test, axis=1)

f1_macro = f1_score(true_classes, predicted_classes, average='macro')
print(f"Macro-averaged F1-Score: {f1_macro}")


Epoch 1/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 129ms/step - accuracy: 0.2678 - loss: 3.2749 - val_accuracy: 8.7566e-04 - val_loss: 3.9405
Epoch 2/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 142ms/step - accuracy: 0.4216 - loss: 2.2259 - val_accuracy: 0.3888 - val_loss: 3.3542
Epoch 3/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 130ms/step - accuracy: 0.5419 - loss: 1.8368 - val_accuracy: 0.4536 - val_loss: 3.4536
Epoch 4/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 122ms/step - accuracy: 0.6161 - loss: 1.5433 - val_accuracy: 0.4834 - val_loss: 3.7412
Epoch 5/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 123ms/step - accuracy: 0.6438 - loss: 1.3653 - val_accuracy: 0.4912 - val_loss: 3.8267
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step
Macro-averaged F1-Score: 0.06100943701994586
