In [None]:
!pip install transformers datasets langchain sentence_transformers

In [None]:
!pip install -U langchain-community

In [None]:
!pip install ray[tune]

In [None]:
# Mount Google Drive at /content/drive (Colab-only) so that files stored
# under that path can be read by the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Import necessary libraries:

*   **`pandas`**: For data manipulation.
*   **`transformers`**: For using pre-trained models.
*   **`torch`**: For tensor operations.
*   **`sklearn.model_selection`**: For splitting data.
* **`ray.tune`**: For hyperparameter tuning.

In [None]:
import inspect

import pandas as pd
import torch
import transformers
from ray import tune
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline, EarlyStoppingCallback

Load the dataset from the specified CSV file into a pandas DataFrame.

In [None]:
# Location of the cleaned dataset on the mounted Google Drive.
DATA_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/cleanerData.csv'

# Read the CSV into a DataFrame and show it as the cell output.
df = pd.read_csv(DATA_CSV_PATH)
df

Rename specific columns for better readability and consistency.
* The `Section` column is renamed to `Title`.
* The `Paragraph` column is renamed to `Content`.

In [None]:
# Rename both raw columns in one pass: Section -> Title, Paragraph -> Content.
df.rename(columns={'Section': 'Title', 'Paragraph': 'Content'}, inplace=True)

Concatenate `Title` and `Content` into a new `text` column, and remove rows with missing text data.

In [None]:
# Join title and body with a single space. A missing Title or Content
# yields NaN for that row's combined text.
title_with_space = df['Title'] + ' '
df['text'] = title_with_space + df['Content']

# Drop the rows whose combined text is missing.
df.dropna(subset=['text'], inplace=True)

In [None]:
import re

Convert all text to lowercase and remove leading/trailing whitespaces.

In [None]:
# Lowercase every text and trim surrounding whitespace using the vectorized
# string accessors (rows with missing text were already dropped).
df['text'] = df['text'].str.lower().str.strip()

In [None]:
# Display the DataFrame to inspect the lowercased, stripped `text` column.
df

# Model Initialization and Pipeline Setup

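This section initializes a pre-trained BERT model, sets up a text classification pipeline, and applies the model to predict classes for the text data. Note that `bert-base-uncased` does not include a fine-tuned classification head, so the head is randomly initialized and these initial predictions are arbitrary `LABEL_<n>` values rather than meaningful categories.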

In [None]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): the number of output classes is set to the number of distinct
# `Content` values, i.e. one class per unique paragraph -- confirm this is the
# intended label space. The checkpoint presumably has no fine-tuned
# classification head, so the head created here starts untrained.
num_distinct_contents = len(df['Content'].unique())
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_distinct_contents,
)

Load the tokenizer and a pre-trained BERT model for sequence classification.

In [None]:
# Run on the first GPU when CUDA is available, otherwise on the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

# Inputs longer than 512 tokens are truncated.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=pipeline_device,
    truncation=True,
    max_length=512,
)

Create a text classification pipeline using the loaded model and tokenizer.

*   Utilize GPU if available (`device=0`), else use CPU (`device=-1`).
*   Set `truncation=True` and `max_length=512` to handle long sequences.

In [None]:
def classify_text(text):
    """Return the label of the first prediction the pipeline gives for `text`.

    Args:
        text: A single input string.

    Returns:
        The `label` field of the first result returned by `classifier`.
    """
    return classifier(text)[0]['label']

# NOTE(review): these predictions are used further down as the training
# labels; confirm that labels produced by this model are the intended targets.
df['predicted_class'] = df['text'].map(classify_text)

print(df[['text', 'predicted_class']])

List the unique labels in the predicted classes.

In [None]:
# Show the distinct label strings present in `predicted_class`.
df['predicted_class'].unique()

In [None]:
# Count how many distinct labels appear in `predicted_class`.
n_predicted_classes = len(df['predicted_class'].unique())
print(n_predicted_classes)

# Data Splitting and Tokenization

This section prepares the data for training by converting labels to numerical IDs, splitting the data into training and validation sets, and tokenizing the text data.

In [None]:
# Pull the texts and their predicted label strings out of the DataFrame.
all_texts = df['text'].tolist()
all_labels = df['predicted_class'].tolist()

# Give each distinct label a contiguous integer id, in order of first appearance.
unique_labels = pd.Series(all_labels).unique()
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))

def convert_labels_to_ids(labels):
    """Translate label strings into their integer ids via `label_mapping`.

    Args:
        labels: Iterable of label strings that are keys of `label_mapping`.

    Returns:
        A list of integer ids, in the same order as `labels`.

    Raises:
        KeyError: If a label is not present in `label_mapping`.
    """
    ids = []
    for name in labels:
        ids.append(label_mapping[name])
    return ids

all_labels_numerical = convert_labels_to_ids(all_labels)

In [None]:
# Display the DataFrame, which now includes the `predicted_class` column.
df

Split the data into training and validation sets (80/20 split).

In [None]:
# Hold out 20% of the examples for validation; the fixed random_state keeps
# the split reproducible across runs.
split_parts = train_test_split(
    all_texts,
    all_labels_numerical,
    test_size=0.2,
    random_state=42,
)
train_texts, val_texts, train_labels, val_labels = split_parts

In [None]:
import datasets

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the `text` field of a batch of examples.

    Args:
        examples: A batch mapping that contains a "text" entry.

    Returns:
        The tokenizer output for the batch, padded with `padding="max_length"`
        and truncated.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)

def _build_tokenized_dataset(texts, labels):
    """Wrap texts and labels in a `datasets.Dataset` and tokenize it in batches."""
    raw_dataset = datasets.Dataset.from_dict({"text": texts, "label": labels})
    return raw_dataset.map(tokenize_function, batched=True)

train_dataset = _build_tokenized_dataset(train_texts, train_labels)
val_dataset = _build_tokenized_dataset(val_texts, val_labels)

# Model Training

This section defines the model, training arguments, and trainer. The model is then trained on the preprocessed training data.

In [None]:
# Start from a fresh copy of the base checkpoint with one output per distinct
# label found in the data.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(unique_labels)
)

# transformers renamed the `evaluation_strategy` argument to `eval_strategy`
# and later removed the old name. Because the install cell does not pin a
# version, choose whichever keyword the installed TrainingArguments accepts so
# this cell runs on both older and newer releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=10,
    # Track validation loss (lower is better) to choose the best checkpoint.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Evaluate and save once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    **{eval_strategy_kwarg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# Conclusion

The model training process has been completed, with the following results achieved after 4 epochs:

*   **Global Steps:** 236
*   **Training Loss:** 0.4968
*   **Training Time:** 232.36 seconds
*   **Samples per Second:** 8.125
*   **Steps per Second:** 1.016
*   **Total FLOPs:** 496,775,973,273,600
* **Epoch:** 4.0

**Analysis:**

The model's training concluded with a reported training loss of approximately `0.4968`. This value is the classification loss (cross-entropy for a multi-class head) between the model's predictions and the training labels; lower is better. A loss of `0.4968`, while not extremely low, is indicative of a model that has learned to some degree.
The model processed an average of 8.125 samples per second, a moderate processing speed, and progressed at a steady 1.016 training steps per second. It took 232.36 seconds to complete the 4 training epochs.
The total FLOPs (floating-point operations, a count rather than a per-second rate) represent the computational load involved in the training process.

**Implications for the Immigration Chatbot:**

Given these results, the text classification model for the immigration chatbot has shown some promising learning, achieving a loss that suggests it has learned patterns in the training data. Training loss alone, however, does not confirm that the model works properly; validation metrics such as accuracy or F1 are needed for that.

**Conclusion:**
In conclusion, the training phase for the immigration chatbot's text classification model was successful. The model shows potential, but further evaluation and fine-tuning are required to ensure its reliability in real-world scenarios.