In [None]:
# Install the core stack used below: PyTorch, torchtext, spaCy and Hugging Face transformers.
!pip install torch torchtext spacy transformers
# Download the small English spaCy pipeline that is loaded later with spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm
# NOTE(review): streamlit is not imported by any visible cell — confirm it is still required.
!pip install streamlit

# Upgrade transformers (with its torch extra) and accelerate to their latest releases.
# NOTE(review): nothing here is version-pinned, so a fresh run may install newer
# releases than the ones that produced the saved outputs.
!pip install transformers[torch] accelerate -U

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split
import spacy
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score, confusion_matrix
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BartForSequenceClassification, BartTokenizer, Trainer, TrainingArguments

# Fetch the NLTK resources this notebook relies on: the punkt tokenizer models,
# WordNet (for the lemmatizer) and the stopword lists.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# Shared text-processing objects used by preprocess_text():
# a WordNet lemmatizer, the small English spaCy pipeline, and the English stopword set.
lemmatizer = WordNetLemmatizer()
nlp = spacy.load('en_core_web_sm')
stopwords = set(nltk.corpus.stopwords.words('english'))

# Identify problematic rows
def identify_problematic_rows(file_path):
    """Return the 0-based indices of CSV rows that do not match the header's width.

    The first row of the file is treated as the header. Every later row whose
    number of fields differs from the header's (too many fields, too few, or a
    blank line) is reported. Such rows either make ``pd.DataFrame(rows,
    columns=header)`` raise or get silently padded with ``None``.

    Args:
        file_path: Path to a UTF-8 encoded CSV file.

    Returns:
        list[int]: Row indices as produced by ``enumerate(csv.reader(...))``,
        in ascending order. Empty when every row matches the header.
    """
    problematic_rows = []
    # newline='' lets the csv module do its own line-ending handling, which is
    # required for quoted fields that contain embedded newlines.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        expected_fields = None
        for i, row in enumerate(reader):
            if expected_fields is None:
                # The header defines the expected width and is never flagged itself.
                expected_fields = len(row)
                continue
            if len(row) != expected_fields:
                problematic_rows.append(i)
    return problematic_rows

def remove_problematic_rows(file_path, problematic_rows):
    """Read a CSV file and return its rows, skipping the given row indices.

    Args:
        file_path: Path to a UTF-8 encoded CSV file.
        problematic_rows: Iterable of 0-based row indices (as numbered by
            ``enumerate(csv.reader(...))``) to leave out.

    Returns:
        list[list[str]]: The remaining rows in file order, header included
        unless index 0 was listed.
    """
    # A set makes each membership test O(1) instead of scanning a list per row.
    rows_to_skip = set(problematic_rows)
    # newline='' is what the csv module expects so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        return [
            row
            for i, row in enumerate(csv.reader(file))
            if i not in rows_to_skip
        ]

# Build a DataFrame from a CSV file, leaving out flagged rows and rows with a null 'Author'
def clean_dataset(file_path):
    """Load ``file_path`` into a DataFrame after filtering out unusable rows.

    The file is scanned with ``identify_problematic_rows`` and re-read with
    ``remove_problematic_rows``. The first surviving row supplies the column
    names and the rest become the data. Rows whose 'Author' value is null are
    then dropped.

    Args:
        file_path: Path to the CSV file to load.

    Returns:
        pandas.DataFrame: One row per retained CSV record; values are the raw
        strings produced by ``csv.reader``.
    """
    flagged_indices = identify_problematic_rows(file_path)
    rows = remove_problematic_rows(file_path, flagged_indices)
    header, records = rows[0], rows[1:]
    frame = pd.DataFrame(records, columns=header)
    # NOTE(review): dropna only removes real nulls; a literal 'None' string or
    # an empty author field coming from the CSV would be kept — confirm intent.
    return frame.dropna(subset=['Author'])

# Preprocessing function
def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmas.

    The text is lower-cased and tokenized with the module-level spaCy pipeline
    ``nlp``. Tokens that are not purely alphabetic or that appear in the
    module-level ``stopwords`` set are discarded. The remaining tokens are
    lemmatized with the module-level WordNet ``lemmatizer``.

    Args:
        text: The raw document. Non-string values (e.g. ``None`` or NaN, which
            show up when a CSV row has fewer fields than the header) are
            treated as empty documents instead of raising ``AttributeError``.

    Returns:
        str: The kept lemmas joined by single spaces; ``''`` for empty or
        non-string input.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise crash on .lower() in the middle of a DataFrame.apply.
        return ''
    doc = nlp(text.lower())  # Convert text to lowercase
    tokens = [
        lemmatizer.lemmatize(token.text)
        for token in doc
        if token.is_alpha and token.text not in stopwords
    ]
    return ' '.join(tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load and clean train and test datasets
train_dataframe = clean_dataset('/content/mega_train.csv')
test_dataframe = clean_dataset('/content/mega_test.csv')

# Preprocess text data
train_dataframe['text'] = train_dataframe['text'].apply(preprocess_text)
test_dataframe['text'] = test_dataframe['text'].apply(preprocess_text)

# Create mappings for authors (ids follow the alphabetical order of the training authors)
auth_sort = sorted(train_dataframe['Author'].unique())
dictOfAuthors = {author: idx for idx, author in enumerate(auth_sort)}

# Fail fast if the test set contains an author the mapping does not know:
# .map() would silently turn those into NaN labels, which only surface much
# later as invalid class ids during evaluation.
unseen_authors = sorted(set(test_dataframe['Author']) - set(dictOfAuthors))
if unseen_authors:
    raise ValueError(
        f"Test set contains authors absent from the training set: {unseen_authors}"
    )

train_dataframe['Author_num'] = train_dataframe['Author'].map(dictOfAuthors)
test_dataframe['Author_num'] = test_dataframe['Author'].map(dictOfAuthors)

# Drop unnecessary columns
train_dataframe = train_dataframe.drop(columns=['Author', 'index'])
test_dataframe = test_dataframe.drop(columns=['Author', 'index', 'hopeful_test'])

In [None]:
# Tokenization and padding
max_length = 512
def tokenize_and_pad(texts, max_length):
    """Tokenize each text and force it to exactly ``max_length`` tokens.

    Each text is split with NLTK's ``word_tokenize`` (once per text). Longer
    token lists are truncated and shorter ones are right-padded with the
    literal token ``'<PAD>'``.

    Args:
        texts: Iterable of strings.
        max_length: Number of tokens every returned list must contain.

    Returns:
        list[list[str]]: One token list of length ``max_length`` per input text.
    """
    padded_texts = []
    for text in texts:
        tokens = word_tokenize(text)[:max_length]
        # After truncation the deficit is >= 0, so this pads short texts and
        # appends nothing to texts that already fill the budget.
        padded_texts.append(tokens + ['<PAD>'] * (max_length - len(tokens)))
    return padded_texts

train_texts = tokenize_and_pad(train_dataframe['text'].tolist(), max_length)
test_texts = tokenize_and_pad(test_dataframe['text'].tolist(), max_length)

# Build vocabulary
# Id 0 is reserved for the padding token and real words get ids 1..N.
# Assigning ids first and then overwriting '<PAD>' with 0 made '<PAD>' share
# id 0 with an arbitrary word. Sorting the words also makes the ids
# reproducible across runs (plain set iteration order is not).
vocab_words = sorted({word for text in train_texts + test_texts for word in text} - {'<PAD>'})
vocab = {'<PAD>': 0}
vocab.update({word: idx for idx, word in enumerate(vocab_words, start=1)})

def texts_to_sequences(texts, vocab):
    """Translate tokenized texts into lists of vocabulary ids.

    Args:
        texts: Iterable of token sequences.
        vocab: Mapping from token to integer id.

    Returns:
        list[list[int]]: For every input text, the ids of its tokens in order.

    Raises:
        KeyError: If a token is not present in ``vocab``.
    """
    sequences = []
    for tokens in texts:
        token_ids = []
        for token in tokens:
            token_ids.append(vocab[token])
        sequences.append(token_ids)
    return sequences

# Training split: vocabulary ids -> tensor, integer author labels -> tensor,
# then a shuffled batch iterator over the pairs.
train_sequences = texts_to_sequences(train_texts, vocab)
train_inputs = torch.tensor(train_sequences)
train_labels = torch.tensor(train_dataframe['Author_num'].values)
train_data = TensorDataset(train_inputs, train_labels)
train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)

# Test split: same conversion, iterated in file order (no shuffling).
test_sequences = texts_to_sequences(test_texts, vocab)
test_inputs = torch.tensor(test_sequences)
test_labels = torch.tensor(test_dataframe['Author_num'].values)
test_data = TensorDataset(test_inputs, test_labels)
test_loader = DataLoader(dataset=test_data, batch_size=16)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Custom Dataset class
class CustomTextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of strings.
        labels: Indexable collection of integer class ids aligned with
            ``texts`` (a 1-D tensor, array or list).
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every example is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its model inputs.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` as 1-D tensors, and
            ``'labels'`` as a ``torch.long`` scalar tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten() removes it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # as_tensor accepts both plain ints and tensor elements; calling
            # torch.tensor() on an existing tensor triggers PyTorch's
            # copy-construct UserWarning on every item.
            'labels': torch.as_tensor(label, dtype=torch.long)
        }

# Fetch the pretrained BART-base tokenizer and a sequence-classification model
# with one output class per training author, then move the model to `device`.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForSequenceClassification.from_pretrained(
    'facebook/bart-base',
    num_labels=len(auth_sort),
).to(device)

# Wrap the preprocessed texts and integer labels in tokenizing datasets.
max_len = 512  # Length each example is truncated/padded to by CustomTextDataset
train_dataset = CustomTextDataset(
    texts=train_dataframe['text'].tolist(),
    labels=train_labels,
    tokenizer=tokenizer,
    max_len=max_len,
)
test_dataset = CustomTextDataset(
    texts=test_dataframe['text'].tolist(),
    labels=test_labels,
    tokenizer=tokenizer,
    max_len=max_len,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Custom Dataset class
# NOTE(review): this cell redefines the CustomTextDataset class that an earlier
# cell already defines; the later definition is the one in effect from here on.
class CustomTextDataset(Dataset):
    """Dataset yielding tokenized inputs and a class label for each text.

    Tokenization happens lazily in ``__getitem__``, so only the raw strings
    and labels are held in memory.

    Args:
        texts: Indexable collection of strings.
        labels: Indexable collection of integer class ids, aligned with
            ``texts`` (tensor, array or list).
        tokenizer: Object with an ``encode_plus`` method returning
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Fixed length used for truncation and padding.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded example at position ``idx``.

        Returns:
            dict: 1-D ``'input_ids'`` and ``'attention_mask'`` tensors plus a
            scalar ``'labels'`` tensor of dtype ``torch.long``.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # Drop the batch axis that return_tensors='pt' adds.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # torch.tensor(<tensor>) emits a copy-construct UserWarning for
            # every item when labels is a tensor; as_tensor converts without it
            # and still handles plain Python/NumPy integers.
            'labels': torch.as_tensor(label, dtype=torch.long)
        }

# Load the pretrained tokenizer and a freshly initialised classification model.
# Re-running from_pretrained here replaces any `model` created by an earlier cell.
PRETRAINED_NAME = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(PRETRAINED_NAME)
model = BartForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=len(auth_sort))
model = model.to(device)

# Build the datasets consumed by the Trainer.
max_len = 128  # Truncation/padding length applied to every example by CustomTextDataset
train_texts_clean = train_dataframe['text'].tolist()
test_texts_clean = test_dataframe['text'].tolist()
train_dataset = CustomTextDataset(train_texts_clean, train_labels, tokenizer, max_len)
test_dataset = CustomTextDataset(test_texts_clean, test_labels, tokenizer, max_len)

# Define training arguments
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old spelling. The install cell upgrades transformers
# without pinning a version, so pick whichever keyword the installed release
# accepts instead of hard-coding one.
_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',          # checkpoints are written here
    num_train_epochs=8,  # Adjust epochs as needed
    per_device_train_batch_size=16,  # Adjust batch size as needed
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",           # checkpoint once per epoch
    **{_eval_strategy_kwarg: "epoch"},  # evaluate once per epoch
)

# Metrics reported by the Trainer at every evaluation
def compute_metrics(p):
    """Compute accuracy and macro-averaged F1 from an evaluation prediction.

    Args:
        p: Object with ``predictions`` (an array of logits, or a tuple whose
            first element is that array) and ``label_ids`` (true class ids).

    Returns:
        dict: ``{'accuracy': ..., 'f1': ...}`` where accuracy is the fraction
        of argmax predictions equal to the labels and f1 is the macro-averaged
        F1 score.
    """
    predictions = p.predictions
    if isinstance(predictions, tuple):
        # Only the first element holds the class logits.
        predictions = predictions[0]
    predicted_labels = predictions.argmax(-1)
    accuracy = (predicted_labels == p.label_ids).mean()
    macro_f1 = f1_score(p.label_ids, predicted_labels, average='macro')
    return {'accuracy': accuracy, 'f1': macro_f1}

# Wire the model, training configuration, datasets and metric function into a Trainer.
# NOTE(review): the test split is also the per-epoch evaluation set, so the
# "Validation" numbers logged during training are measured on test data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train and evaluate the model
trainer.train()
# Score the trained model on both splits. The returned dicts expose the loss
# and the compute_metrics() values under 'eval_'-prefixed keys, as read below.
# NOTE(review): no visible cell saves the trained model to the Drive directory
# that a later cell loads it from — confirm where that save happens.
train_metrics = trainer.evaluate(eval_dataset=train_dataset)
test_metrics = trainer.evaluate(eval_dataset=test_dataset)

# Report loss, accuracy and macro F1 for each split, rounded to four decimals.
print(f"Train Loss: {train_metrics['eval_loss']:.4f}, Train Accuracy: {train_metrics['eval_accuracy']:.4f}, Train F1 Score: {train_metrics['eval_f1']:.4f}")
print(f"Test Loss: {test_metrics['eval_loss']:.4f}, Test Accuracy: {test_metrics['eval_accuracy']:.4f}, Test F1 Score: {test_metrics['eval_f1']:.4f}")




Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  'labels': torch.tensor(label, dtype=torch.long)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.1386,1.901442,0.522,0.470101
2,1.2595,1.066021,0.63,0.593851
3,1.1841,0.837442,0.721,0.703109
4,0.761,0.5959,0.782,0.76519
5,0.5163,0.44637,0.846,0.844736
6,0.4161,0.349108,0.899,0.899416
7,0.2292,0.284521,0.921,0.92127
8,0.172,0.23697,0.939,0.939054


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
  'labels': torch.tensor(label, dtype=torch.long)
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
  'labels': torch.tensor(label, dtype=torch.long)
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
  'labels': torch.tensor(label, dtype=torch.long)
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
  'labels': torch.tensor(label, dtype=torch.long)
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
  'labels': torch.tensor(

  'labels': torch.tensor(label, dtype=torch.long)


Train Loss: 0.1109, Train Accuracy: 0.9715, Train F1 Score: 0.9716
Test Loss: 0.2370, Test Accuracy: 0.9390, Test F1 Score: 0.9391


In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this Colab runtime.
# NOTE(review): the stated goal is saving the model, but no visible cell writes
# anything under the mount point — confirm where the model is saved.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Authenticate this session with the Hugging Face Hub; notebook_login() renders
# an interactive login widget in the cell output.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import BartForSequenceClassification, BartTokenizer

# Paths to  model files in Google Drive
model_dir = "/content/drive/MyDrive/nlp-project/bart-model"
# Target repository on the Hugging Face Hub
repo_id = "sajid227/nlp-project-author-identifcation"

# Load tokenizer and model
tokenizer = BartTokenizer.from_pretrained(model_dir)
model = BartForSequenceClassification.from_pretrained(model_dir)

# Push to the Hugging Face model hub
# Upload the tokenizer together with the model: a repo holding only the
# weights cannot be loaded end-to-end with from_pretrained(repo_id), because
# the tokenizer files would be missing.
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1', '2': 'LABEL_2', '3': 'LABEL_3', '4': 'LABEL_4', '5': 'LABEL_5', '6': 'LABEL_6', '7': 'LABEL_7', '8': 'LABEL_8', '9': 'LABEL_9', '10': 'LABEL_10', '11': 'LABEL_11', '12': 'LABEL_12', '13': 'LABEL_13', '14': 'LABEL_14', '15': 'LABEL_15', '16': 'LABEL_16', '17': 'LABEL_17', '18': 'LABEL_18', '19': 'LABEL_19', '20': 'LABEL_20', '21': 'LABEL_21', '22': 'LABEL_22', '23': 'LABEL_23', '24': 'LABEL_24', '25': 'LABEL_25', '26': 'LABEL_26', '27': 'LABEL_27', '28': 'LABEL_28', '29': 'LABEL_29', '30': 'LABEL_30', '31': 'LABEL_31', '32': 'LABEL_32', '33': 'LABEL_33', '34': 'LABEL_34', '35': 'LABEL_35', '36': 'LABEL_36', '37': 'LABEL_37', '38': 'LABEL_38', '39': 'LABEL_39', '40': 'LABEL_40', '41': 'LABEL_41', '42': 'LABEL_42', '43': 'LABEL_43', '44': 'LABEL_44', '45': 'LABEL_45', '46': 'LABEL_46', '47': 'LABEL_47', '48': 'LABEL_48', '49': 'LABEL_49'}. The number of labels wil be overwritten to 50.


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


CommitInfo(commit_url='https://huggingface.co/sajid227/nlp-project-author-identifcation/commit/97f5ffe92990d01bb005deea569e9df1c85ca49f', commit_message='Upload BartForSequenceClassification', commit_description='', oid='97f5ffe92990d01bb005deea569e9df1c85ca49f', pr_url=None, pr_revision=None, pr_num=None)