# 1. DistilBERT

In [None]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
import os
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
import warnings

# Silence library warnings so the cell output only shows the progress bar
# and the final classification report.
warnings.filterwarnings('ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the MD&A dataset and drop every row that has a missing value.
df10q = pd.read_csv('/kaggle/input/mda-reports/MDA_dataset_10K.csv', index_col=0)
df10q.dropna(inplace=True)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model = torch.nn.DataParallel(model).to(device)

# Use a fixed 1500-character window of each document. Documents shorter than
# the start offset produce an empty string here.
texts = df10q['MDA'].tolist()
texts = [text[63448:64948] for text in texts]

labels = df10q['target_3'].tolist()

batch_size = 80
embeddings = []
all_labels = []

for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
    batch_texts = texts[i:i + batch_size]
    batch_labels = labels[i:i + batch_size]

    tokens = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**tokens)

    # Mean-pool over real tokens only. A plain .mean(dim=1) would also average
    # the hidden states of padding positions, making each embedding depend on
    # how much padding its batch happened to need.
    hidden = outputs.last_hidden_state
    mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # avoid 0/0 for rows with no tokens
    batch_embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()

    embeddings.extend(batch_embeddings)
    all_labels.extend(batch_labels)

    # Free GPU memory before the next batch.
    del tokens, outputs, hidden, mask, token_counts
    torch.cuda.empty_cache()

X = np.array(embeddings)
y = np.array(all_labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit a logistic-regression classifier on the frozen embeddings.
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train, y_train)

pred_labels = lr_clf.predict(X_test)

print(classification_report(y_test, pred_labels))

# 2. GPT-2

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from transformers import GPT2Tokenizer, GPT2Model
import torch
from tqdm.notebook import tqdm 
import warnings

# Silence library warnings so the cell output only shows the progress bar
# and the final classification report.
warnings.filterwarnings('ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the MD&A dataset and drop every row that has a missing value.
df10q = pd.read_csv('/kaggle/input/mda-reports/MDA_dataset_10K.csv', index_col=0)
df10q.dropna(inplace=True)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2', clean_up_tokenization_spaces=True)
model = GPT2Model.from_pretrained('gpt2')
# Let DataParallel pick up whatever GPUs are visible instead of hard-coding
# device_ids=[0, 1], which raises on a machine with a single GPU.
model = torch.nn.DataParallel(model).to(device)

# GPT-2 ships without a padding token; reuse EOS so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Use a fixed 3000-character window of each document. Documents shorter than
# the start offset produce an empty string here.
texts = df10q['MDA'].tolist()
texts = [text[54570:57570] for text in texts]
labels = df10q['target_10'].tolist()

embeddings = []
all_labels = []

batch_size = 80
for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
    batch_texts = texts[i:i + batch_size]
    batch_labels = labels[i:i + batch_size]

    tokens = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**tokens)

    # Mean-pool over real tokens only. Because the pad token is EOS, a plain
    # .mean(dim=1) would blend EOS hidden states into every shorter sequence.
    hidden = outputs.last_hidden_state
    mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # avoid 0/0 for rows with no tokens
    batch_embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()

    embeddings.extend(batch_embeddings)
    all_labels.extend(batch_labels)

    # Free GPU memory before the next batch.
    del tokens, outputs, hidden, mask, token_counts
    torch.cuda.empty_cache()

X = np.array(embeddings)
y = np.array(all_labels)

# Split on y (the labels collected alongside the embeddings) so features and
# targets are guaranteed to come from the same pass over the data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
logistic_model = LogisticRegression(max_iter=3000, penalty='l2')
logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))

# 3. FinBERT

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import os
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
from sklearn.svm import SVC
import warnings

# Silence library warnings so the cell output only shows the progress bar
# and the final classification report.
warnings.filterwarnings('ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the MD&A dataset and drop every row that has a missing value.
df10q = pd.read_csv('/kaggle/input/mda-reports/MDA_dataset_10K.csv', index_col=0)
df10q.dropna(inplace=True)

tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertModel.from_pretrained('yiyanghkust/finbert-tone')
model = torch.nn.DataParallel(model).to(device)

# Use a fixed 1500-character window of each document. Documents shorter than
# the start offset produce an empty string here.
texts = df10q['MDA'].tolist()
texts = [text[64906:66406] for text in texts]

labels = df10q['target_3'].tolist()

batch_size = 80
embeddings = []
all_labels = []

for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
    batch_texts = texts[i:i + batch_size]
    batch_labels = labels[i:i + batch_size]

    tokens = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = model(**tokens)

    # Mean-pool over real tokens only. A plain .mean(dim=1) would also average
    # the hidden states of padding positions, making each embedding depend on
    # how much padding its batch happened to need.
    hidden = outputs.last_hidden_state
    mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # avoid 0/0 for rows with no tokens
    batch_embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()

    embeddings.extend(batch_embeddings)
    all_labels.extend(batch_labels)

    # Free GPU memory before the next batch.
    del tokens, outputs, hidden, mask, token_counts
    torch.cuda.empty_cache()

X = np.array(embeddings)
y = np.array(all_labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit a support-vector classifier (default settings) on the frozen embeddings.
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Report the SVC predictions made in this cell. The previous code printed
# `pred_labels`, a variable that only exists if the DistilBERT cell ran first
# and that holds a different model's predictions.
print(classification_report(y_test, y_pred))