In [None]:
import pandas as pd
import re
from datasets import Dataset
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopword corpus and keep the English list as a set, which
# preprocess_text uses for fast membership checks.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Load the labelled dataset from the mounted Drive path. Later cells read the
# 'text' and 'label' columns of this frame.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/final_dataset.csv')  # Replace with your dataset path

# Preprocessing function
def preprocess_text(text):
    """
    Normalize a raw string before it is fed to the classifier.

    Steps, in order: lowercase; strip URL-like tokens; delete every character
    that is not an ASCII letter or whitespace (digits and punctuation are
    removed, not replaced by a space); collapse whitespace runs; drop tokens
    found in the module-level ``stop_words`` set.

    :param text: Input string.
    :return: The remaining tokens joined by single spaces ("" if none remain).
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", lowered)
    letters_only = re.sub(r"[^a-zA-Z\s]", "", without_urls)
    collapsed = re.sub(r"\s+", " ", letters_only).strip()
    kept_tokens = [token for token in collapsed.split() if token not in stop_words]
    return " ".join(kept_tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:

from sklearn.model_selection import train_test_split

# Label encoding and the cap applied to the majority ("chit-chat") class.
label_map = {'chit-chat': 0, 'topic-specific': 1}
MAX_CHITCHAT_SAMPLES = 70000

# NOTE: this cell rebinds `data` to the cleaned frame. Re-run the loading cell
# before re-running this one; mapping already-mapped labels yields only NaN.

# Handle missing values BEFORE casting text to str: astype(str) turns NaN into
# the literal string "nan", after which dropna() can no longer detect it.
data = data.dropna(subset=['text', 'label'])

# Apply preprocessing
data = data.assign(text=data['text'].astype(str).apply(preprocess_text))

# Drop duplicate texts (keeps the first occurrence)
data = data.drop_duplicates(subset='text')

# Map labels to integers. Labels missing from label_map become NaN; drop them
# and cast back to int so the label column never silently turns into floats.
data = data.assign(label=data['label'].map(label_map))
data = data.dropna(subset=['label']).astype({'label': int})

# Limit "chit-chat" data to at most MAX_CHITCHAT_SAMPLES rows. sample() raises
# when n exceeds the number of available rows, so clamp n to the pool size.
chitchat_pool = data[data['label'] == label_map['chit-chat']]
chitchat_data = chitchat_pool.sample(
    n=min(MAX_CHITCHAT_SAMPLES, len(chitchat_pool)), random_state=42
)
topic_data = data[data['label'] == label_map['topic-specific']]

# Combine the datasets and shuffle
limited_data = pd.concat([chitchat_data, topic_data]).sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train and validation datasets (stratified so both splits keep the
# same class ratio)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    limited_data['text'], limited_data['label'], test_size=0.2, stratify=limited_data['label'], random_state=42
)

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_dict({'text': train_texts.tolist(), 'label': train_labels.tolist()})
val_dataset = Dataset.from_dict({'text': val_texts.tolist(), 'label': val_labels.tolist()})

# Save the datasets for reuse
train_dataset.save_to_disk("/content/drive/MyDrive/Colab Notebooks/train_dataset")
val_dataset.save_to_disk("/content/drive/MyDrive/Colab Notebooks/val_dataset")

print("Train and validation datasets created with limited 'chit-chat' data, preprocessing, and stopword removal.")


In [None]:
import pandas as pd
import re
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch


# Load the tokenizer for the checkpoint being fine-tuned; the same
# `model_name` is reused when the classification model is loaded.
model_name = "albert-base-v2"  # Use ALBERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize datasets
def preprocess_function(examples):
    """Tokenize the 'text' entry of a batch with the module-level tokenizer.

    Sequences are truncated to at most 128 tokens and padding is enabled.

    :param examples: Mapping with a 'text' entry (a batch of strings when used
        through ``Dataset.map(..., batched=True)``).
    :return: Whatever the tokenizer call returns for that batch.
    """
    tokenizer_options = {"truncation": True, "padding": True, "max_length": 128}
    return tokenizer(examples['text'], **tokenizer_options)

# Tokenize both splits in batches; `train_dataset` and `val_dataset` are
# rebound to the tokenized versions.
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Load the model with a two-class classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Metrics calculation
def compute_metrics(eval_pred):
    """Compute accuracy and binary precision/recall/F1 for an evaluation pass.

    :param eval_pred: Pair ``(logits, labels)``.
    :return: Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted = torch.tensor(logits).argmax(dim=-1).numpy()
    prf = precision_recall_fscore_support(labels, predicted, average='binary')
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }

# Training arguments with regularization and early stopping.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` and Trainer's `tokenizer` argument to `processing_class`;
# confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,  # Regularization term
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    seed=42,
    # Mixed precision needs a CUDA device; fall back to full precision instead
    # of failing when the runtime has no GPU attached.
    fp16=torch.cuda.is_available(),
)

# Initialize Trainer with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # patience=1: stop as soon as one evaluation (one epoch here) fails to
    # improve eval_loss.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

# Train the model
trainer.train()

# Evaluate the model (load_best_model_at_end=True, so this scores the best
# checkpoint rather than the last epoch)
results = trainer.evaluate()
print("Evaluation Results:", results)

# Save the model and tokenizer side by side so they can be reloaded together
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/albert_text_classifier")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/albert_text_classifier")


Map:   0%|          | 0/113600 [00:00<?, ? examples/s]

Map:   0%|          | 0/28401 [00:00<?, ? examples/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0002,0.010371,0.998134,0.99816,0.998333,0.997986
2,0.0001,0.006189,0.998768,0.998785,0.998612,0.998958
3,0.0,0.006659,0.998803,0.998819,0.999166,0.998472


Evaluation Results: {'eval_loss': 0.006188678089529276, 'eval_accuracy': 0.9987676490264428, 'eval_f1': 0.9987850175304613, 'eval_precision': 0.9986116895737887, 'eval_recall': 0.9989584056662731, 'eval_runtime': 92.8062, 'eval_samples_per_second': 306.025, 'eval_steps_per_second': 19.137, 'epoch': 3.0}


('/content/drive/MyDrive/Colab Notebooks/albert_text_classifier/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/albert_text_classifier/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/albert_text_classifier/spiece.model',
 '/content/drive/MyDrive/Colab Notebooks/albert_text_classifier/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/albert_text_classifier/tokenizer.json')

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load the saved model and tokenizer from the directory written by
# save_pretrained. This rebinds the global `model` and `tokenizer`.
model_path = "/content/drive/MyDrive/Colab Notebooks/albert_text_classifier"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Move model to device (GPU if available); classify_text moves its inputs to
# the same `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def classify_text(text, low_confidence_threshold=0.55, max_length=128):
    """
    Classify one utterance as 'Chitchat' or 'Topic-specific'.

    The text is first cleaned with ``preprocess_text`` (defined in the
    preprocessing cell, which must have been run) because the model was
    fine-tuned on text cleaned that way; feeding raw text gives the model
    input it never saw during training.

    :param text: Raw user input.
    :param low_confidence_threshold: If the winning class probability is below
        this value, the result is forced to 'Topic-specific'.
    :param max_length: Truncation length in tokens; defaults to the 128 used
        when the training data was tokenized.
    :return: 'Chitchat' or 'Topic-specific'.
    """
    # Apply the same cleaning that was applied to the training data.
    cleaned_text = preprocess_text(text)

    # Tokenize and move the tensors to the device the model lives on.
    inputs = tokenizer(
        cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=max_length
    ).to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to probabilities for the single input row.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]

    # Most likely class and its probability.
    predicted_label = torch.argmax(probabilities).item()
    predicted_probability = probabilities[predicted_label].item()

    # Low-confidence fallback: prefer "topic-specific" when the model is unsure.
    if predicted_probability < low_confidence_threshold:
        predicted_label = 1

    # Map label to corresponding text
    label_names = {0: 'Chitchat', 1: 'Topic-specific'}
    return label_names[predicted_label]



# Interactive loop: read a line, classify it with classify_text, print the
# label. Typing 'exit' (any letter case) ends the session.
print("Chat terminal is running. Type 'exit' to stop.")
while True:
    user_input = input("You: ")

    if user_input.lower() != 'exit':
        prediction = classify_text(user_input)
        print(f"Bot: {prediction}")
        continue

    print("Exiting chat terminal...")
    break


Chat terminal is running. Type 'exit' to stop.
You: what is GHI and how it  attempts to measure and track hunger globally
Bot: Chitchat
You: Global Hunger Index GHI is a tool that attempts to measure and track hunger globally 
Bot: Topic-specific
You: GlobalGHI is a tool that attempts to measure and track hunger globally 
Bot: Chitchat
You: GHI is a tool that attempts to measure and track hunger globally
Bot: Chitchat
You: Bechuanaland National Airways was the national airline of the Bechuanaland Protectorate and was based in Francistown
Bot: Topic-specific
You: After independence Jawaharlal Nehru the first prime minister of India initiated reforms to promote higher education and science and technology in India The Indian Institute of Technology IIT conceived by a 22 member committee of scholars and entrepreneurs in order to promote technical education was inaugurated on 18 August 1951 at Kharagpur in West Bengal by the minister of education Maulana Abul Kalam Azad More IITs were soon 

In [None]:
from datasets import load_from_disk

# Step 1: Load the datasets — reload the train/validation splits previously
# written to Drive with save_to_disk.
train_dataset = load_from_disk("/content/drive/MyDrive/Colab Notebooks/train_dataset")
val_dataset = load_from_disk("/content/drive/MyDrive/Colab Notebooks/val_dataset")

from collections import Counter
import re

def extract_common_phrases(dataset, label=1, n_phrases=10, ngram_size=3):
    """
    Extract the most frequent word n-grams from the texts of one class.

    :param dataset: Iterable of records exposing 'text' and 'label' keys.
    :param label: Only records whose 'label' equals this value are used.
    :param n_phrases: Number of phrases to return.
    :param ngram_size: Number of words per phrase (default 3, i.e. trigrams).
    :return: Up to ``n_phrases`` phrases, most frequent first.
    :raises ValueError: If ``ngram_size`` is smaller than 1.
    """
    if ngram_size < 1:
        raise ValueError("ngram_size must be >= 1")

    phrase_counts = Counter()
    for item in dataset:
        if item['label'] != label:
            continue

        # Lowercase and split into word tokens (runs of \w characters).
        tokens = re.findall(r'\b\w+\b', item['text'].lower())

        # Count n-grams text by text instead of first collecting every n-gram
        # of the whole corpus in one list. Texts shorter than ngram_size give
        # an empty range and contribute nothing.
        phrase_counts.update(
            ' '.join(tokens[start:start + ngram_size])
            for start in range(len(tokens) - ngram_size + 1)
        )

    return [phrase for phrase, _ in phrase_counts.most_common(n_phrases)]

# Extract the 20 most frequent trigrams from the training texts labelled 1
# (topic-specific).
common_phrases = extract_common_phrases(train_dataset, label=1, n_phrases=20)
print("Common Phrases:", common_phrases)

from sklearn.feature_extraction.text import TfidfVectorizer

def generate_templates_tfidf(dataset, label=1, n_templates=10):
    """
    Rank the bigrams and trigrams of one class by summed TF-IDF weight.

    :param dataset: Iterable of records exposing 'text' and 'label' keys.
    :param label: Only records whose 'label' equals this value are used.
    :param n_templates: Number of phrases to return.
    :return: The ``n_templates`` highest-scoring n-grams, best first.
    """
    # Keep only the texts of the requested class.
    selected_texts = []
    for record in dataset:
        if record['label'] == label:
            selected_texts.append(record['text'])

    # Bigrams and trigrams, with scikit-learn's built-in English stop words removed.
    tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english')
    weights = tfidf.fit_transform(selected_texts)

    # Summing over documents gives one aggregate score per n-gram.
    column_totals = weights.sum(axis=0).tolist()[0]
    ranked = sorted(
        zip(tfidf.get_feature_names_out(), column_totals),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [ngram for ngram, _ in ranked[:n_templates]]

# Extract TF-IDF phrases
tfidf_phrases = generate_templates_tfidf(train_dataset, label=1, n_templates=20)
print("TF-IDF Phrases:", tfidf_phrases)

# Merge both phrase lists and drop duplicates. dict.fromkeys keeps first-seen
# order, so the template order is identical on every run; list(set(...)) can
# order strings differently from one interpreter session to the next.
topic_templates = list(dict.fromkeys(common_phrases + tfidf_phrases))
print("Combined Topic Templates:", topic_templates)

Common Phrases: ['national wildlife refuge', 'men basketball tournament', 'automatic bid ncaa', 'register historic places', 'national register historic', 'new york city', 'united states house', 'states house representatives', 'listed national register', 'new south wales', 'conference automatic bid', 'bid ncaa division', 'park national park', 'km sq mi', 'ice hockey tournament', 'world war ii', 'house representatives elections', 'men ice hockey', 'national park national', 'bid ncaa tournament']
TF-IDF Phrases: ['united states', 'national park', 'new york', 'basketball tournament', 'elections held', 'high school', 'tournament held', 'house representatives', 'nature reserve', 'ncaa division', 'men basketball', 'science fiction', 'science technology', 'took place', 'higher education', 'men basketball tournament', 'automatic bid', 'bid ncaa', 'united kingdom', 'automatic bid ncaa']
Combined Topic Templates: ['united states house', 'higher education', 'listed national register', 'national re

In [None]:
import pickle

# Persist the template embeddings (pickle) and the sentence-embedding model
# to Drive so they can be reloaded later without recomputing them.
# NOTE(review): `template_embeddings` and `sentence_model` are not defined in
# any cell visible above this one — confirm the cell that creates them still
# exists, otherwise this cell fails on a fresh kernel.
with open("/content/drive/MyDrive/Colab Notebooks/template_embeddings.pkl", "wb") as f:
    pickle.dump(template_embeddings, f)

sentence_model.save("/content/drive/MyDrive/Colab Notebooks/sentence_transformer_model")


In [None]:
import pickle
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Paths
# Locations of the fine-tuned ALBERT directory, the pickled template
# embeddings and the saved SentenceTransformer model on the mounted Drive.
save_dir = "/content/drive/MyDrive/Colab Notebooks/albert_text_classifier"
embeddings_path = "/content/drive/MyDrive/Colab Notebooks/template_embeddings.pkl"
sentence_transformer_path = "/content/drive/MyDrive/Colab Notebooks/sentence_transformer_model"

# Step 1: Load ALBERT model and tokenizer
albert_model = AutoModelForSequenceClassification.from_pretrained(save_dir)
tokenizer = AutoTokenizer.from_pretrained(save_dir)

# Step 2: Load SentenceTransformer model
sentence_model = SentenceTransformer(sentence_transformer_path)

# Step 3: Load template embeddings
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# pickles that this notebook wrote itself.
with open(embeddings_path, "rb") as f:
    template_embeddings = pickle.load(f)

# Ensemble classification function
def ensemble_classifys(text, albert_model, tokenizer, sentence_model, template_embeddings, threshold=0.7):
    """
    Ensemble model combining cosine similarity and ALBERT classification.

    The text is first compared against the topic templates; if its best cosine
    similarity exceeds ``threshold`` it is labelled 'Topic-specific' without
    consulting ALBERT. Otherwise the ALBERT classifier decides.

    :param text: Input text for classification.
    :param albert_model: Trained ALBERT model.
    :param tokenizer: Tokenizer for the ALBERT model.
    :param sentence_model: Pre-trained SentenceTransformer model.
    :param template_embeddings: Precomputed template embeddings.
    :param threshold: Cosine similarity threshold for topic-specific classification.
    :return: Predicted label ('Chitchat' or 'Topic-specific').
    """
    # Cosine similarity-based classification
    text_embedding = sentence_model.encode([text])[0]
    similarities = cosine_similarity([text_embedding], template_embeddings).flatten()
    if similarities.max() > threshold:
        return "Topic-specific"

    # ALBERT classification
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Put the input tensors on the same device as the model so the call also
    # works when the model has been moved to a GPU.
    model_device = next(albert_model.parameters()).device
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = albert_model(**inputs).logits

    probs = torch.softmax(logits, dim=-1)[0]
    return "Topic-specific" if probs[1] > probs[0] else "Chitchat"

# Example Usage
# NOTE(review): the text is passed raw here, while the interactive loops call
# preprocess_text on the input first — confirm which form the ensemble expects.
input_text = "Explain the greenhouse effect."
result = ensemble_classifys(input_text, albert_model, tokenizer, sentence_model, template_embeddings)
print(f"Classification Result: {result}")


Classification Result: Chitchat


In [None]:
# Interactive loop for the similarity + ALBERT ensemble; type 'exit' to stop.
while True:
    user_input = input("User: ")
    if user_input.lower() == 'exit':
        print("GoodBye!")
        break
    # Clean the input the same way the training text was cleaned.
    user_input = preprocess_text(user_input)
    # Classify what the user just typed. Passing the leftover `input_text`
    # example variable here would give every input the same label.
    prediction = ensemble_classifys(user_input, albert_model, tokenizer, sentence_model, template_embeddings)
    print(f"Label: {prediction}")

User: when was world war ii officially started?
Label: Chitchat
User: exit
GoodBye!


In [None]:
# Interactive loop built on `ensemble_classify`; type 'exit' to stop.
# NOTE(review): `ensemble_classify` is not defined in any cell visible in this
# part of the notebook — confirm the defining cell still exists.
while True:
    user_input = input("User: ")
    if user_input.lower() != 'exit':
        # Clean the input before handing it to the classifier.
        user_input = preprocess_text(user_input)
        prediction = ensemble_classify(user_input, model, tokenizer)
        print(f"Label: {prediction}")
        continue
    print("GoodBye!")
    break

User: when was world war ii officially started?
Label: Topic-specific
User: when is the election for united states house of reps held?
Label: Topic-specific
User: hi how are you?
Label: Chitchat
User: oh
Label: Chitchat
User: how was your day? mine was very great and really good weather too!
Label: Chitchat
User: exit
GoodBye!


In [None]:
# Preview only the first entries; displaying the whole list floods the
# notebook output and bloats the saved file.
val_phrases_flat[:20]

['professional wrestling promotion based halifax nova scotia',
 'professional wrestlers managers',
 'play color commentators',
 'interviewers referees',
 'quivira national wildlife refuge south central kansas united states',
 'rare inland marshes',
 'town stafford',
 'mostly northeastern stafford county small parts',
 'southwestern rice northwestern reno counties',
 'central flyway migration route salt',
 'refuge',
 'combine endow refuge',
 'large variety birds',
 'many birds uncommon parts',
 'even central part continent january quivira nwr cheyenne bottoms wildlife area',
 'one wonders',
 'kansas quivira',
 'nwr one places united states ramsar list',
 'wetland international importance',
 'fortune small business fsb american magazine',
 'times',
 'year',
 'inc american',
 'small business services',
 'million small business owners',
 'united states',
 'cea list laboratory integration systems technology french laboratoire int gration de syst mes et des technologies',
 'one three institu

In [None]:
# Preview only the first few per-text phrase lists; displaying the whole
# nested list floods the notebook output and bloats the saved file.
train_phrases[:3]

[['cherno sports complex football training ground',
  'asparuhovo district varna bulgaria cherno sports complex home ground cherno ii academy cherno stadium located asparuhov bridge capacity spectators stadium',
  'korabostroitel',
  'cherno sports complex',
  'one main pitch',
  'also junior training matches',
  'one training pitch synthetic grass',
  'junior team training matches'],
 ['nepal',
  'part guna group nepalese conglomerate companies',
  'operations',
  'operations simrik airlines airline',
  'operations',
  'operations',
  'back initial airline company'],
 ['batang',
  'national park malay taman negara batang',
  'national park located sri aman division',
  'malaysia located lubok',
  'kilometers east kuching',
  'park',
  'area square kilometres',
  'mi',
  'extensive tropical rainforest number rare protected animals',
  'square kilometer artificial lake',
  'created batang',
  'hydroelectric reservoir park',
  'increasingly popular locals tourists',
  'boat traditional b