# 1. BoW

In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

df = pd.read_csv('/kaggle/input/mda-reports/MDA_dataset_10K.csv', index_col=0)
df.dropna(inplace=True)

lr = LogisticRegression(max_iter=1000, n_jobs=-1)

### 1.1. Unigrams

In [15]:
cv = CountVectorizer(stop_words='english', min_df=5)

X = df['MDA'].str[22500:24500]
y = df['target_3']
mask = X.str.len() > 0
X = X[mask]
y = y[mask]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train_counts = cv.fit_transform(X_train)
X_test_counts = cv.transform(X_test)

lr.fit(X_train_counts, y_train)

y_train_pred = lr.predict(X_train_counts)
y_test_pred = lr.predict(X_test_counts)

print("Accuracy (train):")
print(classification_report(y_train, y_train_pred))
print("Accuracy (test):")
print(classification_report(y_test, y_test_pred))

Accuracy (train):
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       354
         1.0       1.00      1.00      1.00       359

    accuracy                           1.00       713
   macro avg       1.00      1.00      1.00       713
weighted avg       1.00      1.00      1.00       713

Accuracy (test):
              precision    recall  f1-score   support

         0.0       0.57      0.57      0.57       114
         1.0       0.60      0.60      0.60       124

    accuracy                           0.59       238
   macro avg       0.59      0.59      0.59       238
weighted avg       0.59      0.59      0.59       238



### 1.2. Bigrams

In [16]:
cv = CountVectorizer(stop_words='english', min_df=5, ngram_range=(2, 2))

X = df['MDA'].str[45500:48500]
y = df['target_3']
mask = X.str.len() > 0
X = X[mask]
y = y[mask]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train_counts = cv.fit_transform(X_train)
X_test_counts = cv.transform(X_test)

lr.fit(X_train_counts, y_train)

y_train_pred = lr.predict(X_train_counts)
y_test_pred = lr.predict(X_test_counts)

print("Accuracy (train):")
print(classification_report(y_train, y_train_pred))
print("Accuracy (test):")
print(classification_report(y_test, y_test_pred))

Accuracy (train):
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       337
         1.0       1.00      0.99      1.00       351

    accuracy                           1.00       688
   macro avg       1.00      1.00      1.00       688
weighted avg       1.00      1.00      1.00       688

Accuracy (test):
              precision    recall  f1-score   support

         0.0       0.61      0.59      0.60       121
         1.0       0.56      0.59      0.57       109

    accuracy                           0.59       230
   macro avg       0.59      0.59      0.59       230
weighted avg       0.59      0.59      0.59       230



### 1.3. Trigrams

In [17]:
cv = CountVectorizer(stop_words='english', min_df=5, ngram_range=(3, 3))

X = df['MDA'].str[56500:59500]
y = df['target_3']
mask = X.str.len() > 0
X = X[mask]
y = y[mask]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train_counts = cv.fit_transform(X_train)
X_test_counts = cv.transform(X_test)

lr.fit(X_train_counts, y_train)

y_train_pred = lr.predict(X_train_counts)
y_test_pred = lr.predict(X_test_counts)

print("Accuracy (train):")
print(classification_report(y_train, y_train_pred))
print("Accuracy (test):")
print(classification_report(y_test, y_test_pred))

Accuracy (train):
              precision    recall  f1-score   support

         0.0       0.84      0.93      0.88       322
         1.0       0.92      0.84      0.88       345

    accuracy                           0.88       667
   macro avg       0.88      0.88      0.88       667
weighted avg       0.89      0.88      0.88       667

Accuracy (test):
              precision    recall  f1-score   support

         0.0       0.62      0.66      0.64       122
         1.0       0.55      0.51      0.53       101

    accuracy                           0.59       223
   macro avg       0.59      0.59      0.59       223
weighted avg       0.59      0.59      0.59       223



# 2. TD-IDF

In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

df = pd.read_csv('/kaggle/input/mda-reports/MDA_dataset_10K.csv', index_col=0)
df.dropna(inplace=True)

### 2.1. Unigrams

In [19]:
X = df['MDA'].str[500:3500]
y = df['target_3']
mask = X.str.len() > 0
X = X[mask]
y = y[mask]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=1.0, ngram_range=(1, 1))
tfidf.fit(X_train)
X_train = tfidf.transform(X_train)
X_test = tfidf.transform(X_test)

lr = LogisticRegression(max_iter=1000, penalty='l2')
lr.fit(X_train, y_train)

y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

print("Accuracy (train):")
print(classification_report(y_train, y_train_pred))
print("Accuracy (test):")
print(classification_report(y_test, y_test_pred))

Accuracy (train):
              precision    recall  f1-score   support

         0.0       0.80      0.79      0.79       368
         1.0       0.79      0.80      0.80       368

    accuracy                           0.79       736
   macro avg       0.79      0.79      0.79       736
weighted avg       0.79      0.79      0.79       736

Accuracy (test):
              precision    recall  f1-score   support

         0.0       0.57      0.58      0.58       118
         1.0       0.61      0.59      0.60       128

    accuracy                           0.59       246
   macro avg       0.59      0.59      0.59       246
weighted avg       0.59      0.59      0.59       246



### 2.1. Bigramms

In [9]:
X = df['MDA'].str[60500:66500]
y = df['target_3']
mask = X.str.len() > 0
X = X[mask]
y = y[mask]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=1.0, ngram_range=(1, 2))
tfidf.fit(X_train)
X_train = tfidf.transform(X_train)
X_test = tfidf.transform(X_test)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

print("Accuracy (train):")
print(classification_report(y_train, y_train_pred))
print("Accuracy (test):")
print(classification_report(y_test, y_test_pred))

Accuracy (train):
              precision    recall  f1-score   support

         0.0       0.88      0.87      0.87       324
         1.0       0.87      0.88      0.88       329

    accuracy                           0.87       653
   macro avg       0.87      0.87      0.87       653
weighted avg       0.87      0.87      0.87       653

Accuracy (test):
              precision    recall  f1-score   support

         0.0       0.59      0.54      0.57       114
         1.0       0.54      0.59      0.56       104

    accuracy                           0.56       218
   macro avg       0.57      0.57      0.56       218
weighted avg       0.57      0.56      0.56       218



### 2.3. Trigrams

In [10]:
X = df['MDA'].str[500:4500]
y = df['target_3']
mask = X.str.len() > 0
X = X[mask]
y = y[mask]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=1.0, ngram_range=(1, 3))
tfidf.fit(X_train)
X_train = tfidf.transform(X_train)
X_test = tfidf.transform(X_test)

lr = LogisticRegression(max_iter=1000, penalty='l2', n_jobs=-1)
lr.fit(X_train, y_train)

y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

print("Accuracy (train):")
print(classification_report(y_train, y_train_pred))
print("Accuracy (test):")
print(classification_report(y_test, y_test_pred))

Accuracy (train):
              precision    recall  f1-score   support

         0.0       0.79      0.81      0.80       368
         1.0       0.80      0.79      0.80       368

    accuracy                           0.80       736
   macro avg       0.80      0.80      0.80       736
weighted avg       0.80      0.80      0.80       736

Accuracy (test):
              precision    recall  f1-score   support

         0.0       0.57      0.63      0.60       118
         1.0       0.62      0.56      0.59       128

    accuracy                           0.59       246
   macro avg       0.59      0.59      0.59       246
weighted avg       0.60      0.59      0.59       246



# 3. DistilBERT

In [2]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
import os
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
import warnings

warnings.filterwarnings('ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df10q = pd.read_csv('/kaggle/input/mda-reports/MDA_dataset_10K.csv', index_col=0)
df10q.dropna(inplace=True)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model = torch.nn.DataParallel(model).to(device)

texts = df10q['MDA'].tolist()
texts = [text[42080:47080] for text in texts]

labels = df10q['target_10_index'].tolist()

batch_size = 80
embeddings = []
all_labels = []

for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
    batch_texts = texts[i:i + batch_size]
    batch_labels = labels[i:i + batch_size]

    tokens = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**tokens)

    batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

    embeddings.extend(batch_embeddings)
    all_labels.extend(batch_labels)

    del tokens, outputs
    torch.cuda.empty_cache()

X = np.array(embeddings)
y = np.array(all_labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train, y_train)

pred_labels = lr_clf.predict(X_test)
y_train_pred = lr_clf.predict(X_train)

print("Accuracy (train):")
print(classification_report(y_train, y_train_pred))
print("Accuracy (test):")
print(classification_report(y_test, pred_labels))

Processing batches:   0%|          | 0/15 [00:00<?, ?it/s]

Accuracy (train):
              precision    recall  f1-score   support

         0.0       0.76      0.54      0.63       402
         1.0       0.67      0.85      0.75       448

    accuracy                           0.70       850
   macro avg       0.71      0.69      0.69       850
weighted avg       0.71      0.70      0.69       850

Accuracy (test):
              precision    recall  f1-score   support

         0.0       0.61      0.46      0.52       136
         1.0       0.59      0.73      0.65       148

    accuracy                           0.60       284
   macro avg       0.60      0.59      0.59       284
weighted avg       0.60      0.60      0.59       284



# 4. GPT-2

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from transformers import GPT2Tokenizer, GPT2Model
import torch
from tqdm.notebook import tqdm 
import warnings

warnings.filterwarnings('ignore')


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df10q = pd.read_csv('/kaggle/input/mda-reports/MDA_dataset_10K.csv', index_col=0)
df10q.dropna(inplace=True)  

tokenizer = GPT2Tokenizer.from_pretrained('gpt2', clean_up_tokenization_spaces=True)
model = GPT2Model.from_pretrained('gpt2')
model = torch.nn.DataParallel(model, device_ids=[0, 1]).to(device)

tokenizer.pad_token = tokenizer.eos_token

texts = df10q['MDA'].tolist()
texts = [text[7500:10500] for text in texts] 
labels = df10q['target_1_index'].tolist()


embeddings = []
all_labels = []

batch_size = 80
for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
    batch_texts = texts[i:i + batch_size]
    batch_labels = labels[i:i + batch_size]

    tokens = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**tokens)

    batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

    embeddings.extend(batch_embeddings)
    all_labels.extend(batch_labels)

    del tokens, outputs
    torch.cuda.empty_cache()

X = np.array(embeddings)
y = np.array(all_labels)

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25, random_state=42)
logistic_model = LogisticRegression(max_iter=3000, penalty='l1', solver='liblinear', C=3.0)  
logistic_model.fit(X_train, y_train)

y_test_pred = logistic_model.predict(X_test)
y_train_pred = logistic_model.predict(X_train)

print("Accuracy (train):")
print(classification_report(y_train, y_train_pred))
print("Accuracy (test):")
print(classification_report(y_test, y_test_pred))

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Processing batches:   0%|          | 0/15 [00:00<?, ?it/s]

Accuracy (train):
              precision    recall  f1-score   support

         0.0       0.76      0.73      0.74       439
         1.0       0.72      0.75      0.74       411

    accuracy                           0.74       850
   macro avg       0.74      0.74      0.74       850
weighted avg       0.74      0.74      0.74       850

Accuracy (test):
              precision    recall  f1-score   support

         0.0       0.58      0.57      0.57       139
         1.0       0.59      0.61      0.60       145

    accuracy                           0.59       284
   macro avg       0.59      0.59      0.59       284
weighted avg       0.59      0.59      0.59       284



# 4. Finbert

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import os
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
from sklearn.svm import SVC
import warnings

warnings.filterwarnings('ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
df = pd.read_csv('/kaggle/input/mda-reports/MDA_dataset_10K.csv', index_col=0)
df.dropna(inplace=True)

tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertModel.from_pretrained('ProsusAI/finbert')
model = torch.nn.DataParallel(model, device_ids = [0,1]).to(device)

texts = df['MDA'].tolist()
texts = [text[46000:51000] for text in texts]

labels = df['target_10_index'].tolist()

batch_size = 80
embeddings = []
all_labels = []

for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
    batch_texts = texts[i:i + batch_size]
    batch_labels = labels[i:i + batch_size]

    tokens = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = model(**tokens)

    batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

    embeddings.extend(batch_embeddings)
    all_labels.extend(batch_labels)

    del tokens, outputs
    torch.cuda.empty_cache()

X = np.array(embeddings)
y = np.array(all_labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

lr = LogisticRegression(max_iter=3000, penalty='l2')  
lr.fit(X_train, y_train)

y_test_pred=lr.predict(X_test)
y_train_pred=lr.predict(X_train)

print("Accuracy (train):")
print(classification_report(y_train, y_train_pred))
print("Accuracy (test):")
print(classification_report(y_test, y_test_pred))

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Processing batches:   0%|          | 0/15 [00:00<?, ?it/s]

Accuracy (train):
              precision    recall  f1-score   support

         0.0       0.80      0.60      0.69       402
         1.0       0.71      0.86      0.78       448

    accuracy                           0.74       850
   macro avg       0.75      0.73      0.73       850
weighted avg       0.75      0.74      0.73       850

Accuracy (test):
              precision    recall  f1-score   support

         0.0       0.59      0.47      0.52       136
         1.0       0.59      0.70      0.64       148

    accuracy                           0.59       284
   macro avg       0.59      0.59      0.58       284
weighted avg       0.59      0.59      0.59       284

