# NLP Project

# Imports

In [34]:
from transformers import pipeline
import torch
import pandas as pd
from transformers import XLMRobertaTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset

# Data Preprocessing

In [35]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from urllib.parse import urlparse

def extract_label_from_url(url):
    parsed_url = urlparse(url)
    path = parsed_url.path
    label = path.split('/')[-1] or path.split('/')[-2]  # Use the last non-empty part
    return label

def scrape_headlines_selenium(url, num_headlines=100):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(url)
        print(f"Page title: {driver.title}")

        headlines = []
        while len(headlines) < num_headlines:
            # Wait for articles to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "article"))
            )

            # Find all article elements
            articles = driver.find_elements(By.TAG_NAME, "article")

            # Extract headlines from new articles
            for article in articles[len(headlines):]:
                try:
                    headline_element = article.find_element(By.CSS_SELECTOR, "h3, h2, .headline")
                    headline = headline_element.text.strip()
                    if headline not in headlines:
                        headlines.append(headline)
                        print(f"Found headline: {headline}")
                except Exception as e:
                    print(f"Error extracting headline: {e}")

            print(f"Total headlines found: {len(headlines)}")

            if len(headlines) >= num_headlines:
                break

            # Try to click 'View More' button
            try:
                view_more_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, "//*[@id='viewMoreButton']"))
                )
                driver.execute_script("arguments[0].scrollIntoView();", view_more_button)
                driver.execute_script("arguments[0].click();", view_more_button)
                print("Clicked 'View More' button")
                time.sleep(2)  # Wait for new content to load
            except (TimeoutException, NoSuchElementException, ElementClickInterceptedException) as e:
                print(f"Could not click 'View More' button: {e}")
                # Try alternative XPath if the ID-based one fails
                try:
                    view_more_button = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.XPATH, "/html/body/div[2]/div/div/div[2]/main/div[3]/div/div/div[3]/button"))
                    )
                    driver.execute_script("arguments[0].scrollIntoView();", view_more_button)
                    driver.execute_script("arguments[0].click();", view_more_button)
                    print("Clicked 'View More' button using alternative XPath")
                    time.sleep(2)
                except Exception as e:
                    print(f"Could not click 'View More' button using alternative XPath: {e}")
                    print("No more 'View More' button found or it's not clickable. Ending search.")
                    break

        return headlines[:num_headlines]

    finally:
        driver.quit()

# URL of the webpage
url = 'https://www.isolezwe.co.za/impilo-yabantu/ezezimoto'

print("Attempting to scrape with Selenium:")
headlines = scrape_headlines_selenium(url)

# Extract label from URL
label = extract_label_from_url(url)

# Create DataFrame with headlines and label
df = pd.DataFrame({
    'headlines': headlines,
    'label': [label] * len(headlines)
})

# Save the headlines to a CSV file
df.to_csv('headlines.csv', index=False)

print(f"\n{len(headlines)} Headlines saved to headlines.csv")
print(f"Label used: {label}")

# Print the first few headlines for verification
print("\nFirst few headlines:")
for headline in headlines[:5]:
    print(f"{headline} (Label: {label})")

# Print the last few headlines for verification
print("\nLast few headlines:")
for headline in headlines[-5:]:
    print(f"{headline} (Label: {label})")

Attempting to scrape with Selenium:
Page title: Ezezimoto | Impilo yabantu | Isolezwe
Found headline: Bayifuna kuleli i-Formula One
Found headline: IMahindra igubha ewu-20 ngeSUV entsha i-3XO
Found headline: Okumele ukuqaphele uma uthenga imoto esidlangaleni
Found headline: Bafuna ukushubisa umbango ngeFord Tourneo elungele umndeni nebhizinisi
Found headline: Bekushunqa uthuli kwiZwartkops Top of the Hill Challenge
Found headline: Kwengezwe injini eyongayo kadizili kwi-Audi Q3
Found headline: Kwehle izinga lokuthengwa kwezimoto ezintsha
Found headline: ULucas Radebe useyinxusa lakwaVW
Found headline: Nentatheli yeSolezwe izoba inukisa ithayi eZwartkops
Found headline: Ngeke umthinte ngesipanela oweMumbo Repairs
Found headline: Ziqalile ukudayisa kuleli ezakwaJetour yaseChina
Found headline: IHyundai idoba abathengi ngeSUV entsha i-Exter
Found headline: Balamanise ngeSUV entsha i-Emkoo kwaGAC Motor
Found headline: Amathiphu angakusiza uma uthenga imoto eyisekeni
Found headline: Ziyinyam

# Language Model

# General Fine Tuning

In [36]:
import pandas as pd
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load your Zulu headlines dataset
df = pd.read_csv('headlines.csv')

# Convert to Hugging Face dataset format
dataset = Dataset.from_pandas(df)

# Label encoding the text labels (e.g., food, cars, money)
label_encoder = LabelEncoder()
dataset = dataset.map(lambda e: {'label': label_encoder.fit_transform([e['label']])[0]})


# Step 1: Check GPU availability
print(torch.cuda.is_available())  # Should return True if the GPU is accessible
if torch.cuda.is_available():
    device = torch.device("cuda")  # Use the GPU if available
    print(torch.cuda.get_device_name(0))  # Should print the GPU name
else:
    device = torch.device("cpu")  # Use CPU if no GPU is available

model_name = 'FacebookAI/xlm-roberta-base'

# Load the XLM-RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3).to(device)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['headlines'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch (input tensors for training)
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # the instantiated model to be trained
    args=training_args,                  # training arguments
    train_dataset=tokenized_datasets,    # training dataset
    eval_dataset=tokenized_datasets,     # evaluation dataset
)

# Train the model
trainer.train()



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

True
NVIDIA RTX A1000 6GB Laptop GPU


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  0%|          | 0/39 [00:00<?, ?it/s]

{'train_runtime': 1109.6392, 'train_samples_per_second': 0.27, 'train_steps_per_second': 0.035, 'train_loss': 1.014050801595052, 'epoch': 3.0}


TrainOutput(global_step=39, training_loss=1.014050801595052, metrics={'train_runtime': 1109.6392, 'train_samples_per_second': 0.27, 'train_steps_per_second': 0.035, 'total_flos': 78934025318400.0, 'train_loss': 1.014050801595052, 'epoch': 3.0})

In [38]:
# Save the trained model
model.save_pretrained('./fine-tuned-xlm-roberta-mlm')
tokenizer.save_pretrained('./fine-tuned-xlm-roberta-mlm')

('./fine-tuned-xlm-roberta-mlm\\tokenizer_config.json',
 './fine-tuned-xlm-roberta-mlm\\special_tokens_map.json',
 './fine-tuned-xlm-roberta-mlm\\sentencepiece.bpe.model',
 './fine-tuned-xlm-roberta-mlm\\added_tokens.json',
 './fine-tuned-xlm-roberta-mlm\\tokenizer.json')

In [39]:
# Evaluate the model
results = trainer.evaluate()

# Print the evaluation results
print(results)


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.8188477158546448, 'eval_runtime': 201.3558, 'eval_samples_per_second': 0.497, 'eval_steps_per_second': 0.035, 'epoch': 3.0}


In [40]:
from transformers import pipeline

# Load the trained model and tokenizer
classifier = pipeline("text-classification", model='./fine-tuned-xlm-roberta-mlm', tokenizer='./fine-tuned-xlm-roberta-mlm')

# Predict the category of a new headline
headline = "Izikhungo zempilo zihlola izinkinga zezimali kubantu abahlala phansi kwezinkulu"
prediction = classifier(headline)

predicted_label_id = prediction[0]['label']
predicted_label_name = label_encoder.inverse_transform([int(predicted_label_id.split('_')[1])])[0]

# Display the prediction with the original label
print(f"Predicted Label: {predicted_label_name}, Confidence: {prediction[0]['score']}")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Predicted Label: ezezimoto, Confidence: 0.4014414846897125
