In [1]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models (used by word_tokenize) and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Read the dataset into a DataFrame. The path points at a file uploaded to
# the Colab runtime; `url` is kept as a separate name so it is easy to change.
url = "/content/draft_50000 (1).csv"
df = pd.read_csv(url)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# English stopword list, built once as a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return `text` with English stopwords stripped out.

    The text is split with NLTK's `word_tokenize`; every token whose
    lower-cased form is in `stop_words` is dropped and the remaining tokens
    are joined back together with single spaces.

    Non-string input (for example a NaN from a missing CSV cell) yields an
    empty string instead of raising.
    """
    # Guard clause: anything that is not a string becomes an empty document.
    if not isinstance(text, str):
        return ''

    kept = []
    for token in word_tokenize(text):
        # Comparison is case-insensitive; the token itself keeps its casing.
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)



In [3]:
# Strip stopwords from every row of the 'text' column, element by element.
# Note: this overwrites the column, so the raw text is no longer available
# in `df` after this cell has run.
df['text'] = df['text'].map(remove_stopwords)



In [4]:
# BERT setup: name the checkpoint and the classes once, then instantiate
# the tokenizer and the encoder from the same pretrained weights.
pretrained_weights = 'bert-base-uncased'
tokenizer_class = ppb.BertTokenizer
model_class = ppb.BertModel

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [5]:
# Tokenize and pad sequences
def _encode(text):
    """Encode one document to token ids, truncated to at most 512 ids."""
    return tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)

tokenized = df["text"].apply(_encode)

# Right-pad every id list with 0 up to a fixed width of 512 so the rows
# stack into one rectangular array.
padded_rows = []
for ids in tokenized.values:
    padded_rows.append(ids + [0] * (512 - len(ids)))
padded = np.array(padded_rows)

# Mask is 1 wherever the id is non-zero and 0 on the padding.
# (assumes id 0 is only ever the padding token — TODO confirm for other vocabularies)
attention_mask = np.where(padded != 0, 1, 0)


In [6]:
# Cap every sequence at `max_length` columns and convert to torch tensors.
# The arrays are sliced first and converted second; torch.tensor copies the
# data, so the resulting tensors do not alias the NumPy arrays.
max_length = 512
input_ids = torch.tensor(padded[:, :max_length])
attention_mask = torch.tensor(attention_mask[:, :max_length])

In [None]:
# Get BERT embeddings
#
# Pushing every row through BERT in a single forward pass requires the
# activations for the whole dataset to be held in memory at once, which
# exhausts RAM for anything but a tiny dataset. Run the encoder over
# fixed-size mini-batches instead and stitch the per-batch results together.
batch_size = 32  # lower this if the runtime still runs out of memory

model.eval()  # make sure dropout is off so the embeddings are deterministic

cls_chunks = []
with torch.no_grad():  # inference only: no autograd graph needed
    for start in range(0, input_ids.shape[0], batch_size):
        stop = start + batch_size
        batch_output = model(
            input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        # batch_output[0] is the per-token hidden-state tensor; keep only the
        # vector at position 0 of each sequence (the [CLS] slot when special
        # tokens were added during encoding).
        cls_chunks.append(batch_output[0][:, 0, :])

# One row per input document, in the original row order.
last_hidden_states = torch.cat(cls_chunks, dim=0).numpy()


In [None]:
# Split data for training: hold out 20% of the rows for evaluation.
# The fixed random_state makes the split reproducible across runs.
labels = df["class"]
features = last_hidden_states
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Train Decision Tree classifier
# A fixed random_state is passed so the fitted tree (and therefore the report
# below) is reproducible: scikit-learn permutes candidate features at each
# split, so without a seed ties between equally good splits can be broken
# differently from run to run. The value matches the seed used for the
# train/test split.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(x_train, y_train)

# Predict and evaluate on the held-out rows; the report lists per-class
# precision, recall and F1 along with the overall averages.
pred = dt_clf.predict(x_test)
print(classification_report(y_test, pred))