In [8]:
# %%
# Cell 1: Imports and Setup
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.optim as optim
import torch.nn as nn
import motor.motor_asyncio
import asyncio
import nest_asyncio
from collections import defaultdict
import pandas as pd
from datetime import datetime
import re
from textblob import TextBlob
import psutil

nest_asyncio.apply()
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 2: MongoDB Connection
try:
    MONGO_URI = "mongodb://localhost:27017/csv_validation_db"
    client = motor.motor_asyncio.AsyncIOMotorClient(MONGO_URI)
    db = client["csv_validation_db"]
    user_feedback_collection = db["user_feedback"]
    validation_logs_collection = db["validation_logs"]
    print("MongoDB connected successfully")
except Exception as e:
    print(f"MongoDB connection failed: {e}")
    feedback_questions, feedback_labels, monthly_counts, file_ids = [], [], defaultdict(int), set(["1", "2", "3"])
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 3: Synonym Mapping
SYNONYM_MAP = {
    "file": ["files", "document", "doc", "dataset", "spreadsheet"],
    "upload": ["uploaded", "uploading", "submit", "send", "provide"],
    "validate": ["check", "verify", "validate", "inspect", "examine"],
    "error": ["errors", "mistake", "issue", "problem", "bug", "anomaly", "anomalies"],
    "report": ["pdf", "document", "summary", "analysis"],
    "history": ["log", "record", "past", "previous"],
    "how": ["hw", "hows", "method", "way"],
    "many": ["much", "number of", "count", "quantity"],
    "status": ["state", "progress", "condition", "situation"],
    "generate": ["create", "make", "produce", "build"],
    "application": ["app", "tool", "software", "system", "platform"],
    "show": ["display", "list", "present", "reveal"],
    "id": ["number", "identifier", "code"],
}
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 4: Text Preprocessing Function
def preprocess_text(text: str) -> str:
    text = text.lower().strip()
    corrections = {
        "whn": "when", "wat": "what", "wen": "when", "hw": "how",
        "pls": "please", "plz": "please", "thx": "thanks",
        "u": "you", "r": "are", "yr": "your", "wht": "what",
        "cn": "can", "whr": "where", "hws": "how's",
        "errrs": "errors", "eror": "error", "mistak": "mistake"
    }
    for wrong, right in corrections.items():
        text = re.sub(r'\b' + wrong + r'\b', right, text)
    
    blob = TextBlob(text)
    corrected_text = str(blob.correct()) if len(text) < 50 else text
    
    number_pattern = re.compile(r'\b(\d+)\b')
    numbers = number_pattern.findall(corrected_text)
    number_map = {num: f"<NUM{idx}>" for idx, num in enumerate(numbers)}
    temp_text = corrected_text
    for num in numbers:
        temp_text = re.sub(r'\b' + num + r'\b', number_map[num], temp_text)
    
    words = temp_text.split()
    processed_words = []
    for word in words:
        if word.startswith('<NUM'):
            processed_words.append(word)
            continue
        word = re.sub(r'[^\w\s]', '', word)
        if word:
            replaced = False
            for canonical, synonyms in SYNONYM_MAP.items():
                if word in synonyms or word == canonical:
                    processed_words.append(canonical)
                    replaced = True
                    break
            if not replaced:
                processed_words.append(word)
    
    processed_text = " ".join(processed_words)
    for num, placeholder in number_map.items():
        processed_text = processed_text.replace(placeholder, num)
    
    return processed_text
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 5: Fetch Training Data
async def fetch_training_data():
    feedback = await user_feedback_collection.find().to_list(100)
    feedback_questions = []
    feedback_labels = []
    
    for f in feedback:
        if f.get("intent") is not None and "I didn't get that" not in f.get("response", ""):
            question = preprocess_text(f["question"])
            intent = f["intent"]
            if isinstance(intent, (int, float)) and intent in [0, 1, 2]:
                feedback_questions.append(question)
                feedback_labels.append(int(intent))
    
    logs = await validation_logs_collection.find().to_list(100)
    monthly_counts = defaultdict(int)
    file_ids = set()
    for log in logs:
        try:
            dt = datetime.fromisoformat(log["start_time"].replace('Z', '+00:00'))
            month_year = dt.strftime("%B %Y").lower()
            monthly_counts[month_year] += 1
            file_ids.add(str(log["unique_id"]))
        except:
            continue
    
    return feedback_questions, feedback_labels, monthly_counts, file_ids

try:
    loop = asyncio.get_event_loop()
    feedback_questions, feedback_labels, monthly_counts, file_ids = loop.run_until_complete(fetch_training_data())
    print(f"MongoDB Feedback: {len(feedback_questions)} questions, Labels: {set(feedback_labels)}")
except Exception as e:
    print(f"Data fetch failed: {e}")
    feedback_questions, feedback_labels, monthly_counts, file_ids = [], [], defaultdict(int), set(["1", "2", "3"])
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 6: Initialize DistilBERT Model
device = torch.device("cpu")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
bert_model.to(device)
print(f"Model loaded on {device}")
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 7: Define Base Training Data
base_questions = [
    # Greetings (label 1)
    "hi", "hello", "hey", "good morning", "good afternoon", "good evening",
    "what's up", "howdy", "greetings", "hi there", "hello friend", "yo",
    "how are you", "how's it going", "how are things", "what's new", "sup",
    "hii", "helo", "hay", "gd mrng", "gd evng", "whats up", "how r u",
    "hola", "good day", "hey there", "hi buddy", "hello there", "what's good",
    "how you doing", "how's everything", "what's happening",
    
    # File-related (label 2)
    "how to upload a file", "can i submit a csv", "where to upload documents",
    "check my file errors", "validate my dataset", "inspect my spreadsheet",
    "how many files today", "count of uploads this week", "number of documents yesterday",
    "show file issues", "display validation problems", "get error details",
    "generate data report", "create validation summary", "export analysis pdf",
    "what's my upload status", "current validation state", "file processing progress",
    "correct my file mistakes", "fix data errors", "resolve validation issues",
    "list all uploaded files", "errors of file 6", "status of file 6", "erors of file 6",
    "file 6 status", "check status of file 6", "errors of file 1", "status of file 2",
    "file 3 errors", "check file 4 status", "validation issues file 5",
    
    # General (label 0)
    "what does this app do", "purpose of this tool", "how this system works",
    "is my data secure", "where is my information stored", "data privacy policy",
    "supported file types", "acceptable formats", "what documents can i upload",
    "how to start validation", "begin checking process", "initiate inspection",
    "where to see results", "location of reports", "how to download outputs",
    "what is this platform", "explain the software", "tool functionality",
    "how does this app function", "what can this system do", "describe the platform",
    "security of my data", "where are reports saved", "how to get started",
    "what file formats work", "how to use this tool", "app overview",
    "system requirements", "how to view validation results", "data storage info"
]
base_labels = [1]*33 + [2]*33 + [0]*30
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 8: Define File-Specific Questions
file_specific_questions = [
    # File error questions
    "what are the errors in file 6", "show me errors for file 6", "display errors in document 6",
    "list mistakes in file 6", "what went wrong with file 6", "errors for file id 6",
    "problems with file number 6", "validation issues in file 6", "error details for file 6",
    "show issues in file 6", "file 6 error report", "what errors in file 6",
    "erors of file 6", "error of file 6", "file 6 errors",
    
    # Status questions
    "status of file 6", "what is the status of file 6", "file 6 status",
    "check status of file 6", "current state of file 6", "progress of file 6",
    "validation status for file 6", "status file 6", "file 6 progress",
    
    # General error questions
    "total number of errors", "how many errors in my file", "count of validation errors",
    "number of mistakes found", "total errors in document", "how many issues were detected",
    "error count", "summarize errors", "total anomalies in files",
    
    # Specific error types
    "missing values in file 6", "duplicates in file 6", "format errors in file 6",
    "anomalies in file 6", "mandatory field errors in file 6", "data type issues in file 6",
    
    # Other file IDs
    "errors in file 1", "show errors for file 2", "errors of file 3",
    "status of file 4", "problems in file 5", "error details for file 7",
    "status of file 8", "errors in file 10", "status of file 1", "errors of file 2"
]
file_specific_labels = [2] * len(file_specific_questions)
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 9: Define Dynamic File Questions
file_questions = []
for month, count in monthly_counts.items():
    file_questions.extend([
        f"how many files in {month}", f"files uploaded in {month}",
        f"file count in {month}", f"uploads in {month}"
    ])
for file_id in list(file_ids)[:5]:
    file_questions.extend([
        f"what are the errors in file {file_id}", f"show errors for file {file_id}",
        f"errors of file {file_id}", f"status of file {file_id}",
        f"check status of file {file_id}"
    ])
file_labels = [2] * len(file_questions)
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 10: Combine and Validate Training Data
all_questions = base_questions + file_specific_questions + file_questions + feedback_questions
all_labels = base_labels + file_specific_labels + file_labels + feedback_labels

all_labels = [int(label) for label in all_labels if label in [0, 1, 2]]
if len(all_questions) != len(all_labels):
    all_questions = all_questions[:len(all_labels)]
print(f"Total training examples: {len(all_questions)}")
print(f"Greeting examples: {sum(1 for x in all_labels if x == 1)}")
print(f"File-related examples: {sum(1 for x in all_labels if x == 2)}")
print(f"General examples: {sum(1 for x in all_labels if x == 0)}")
print(f"Unique labels: {set(all_labels)}")
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 11: Balance and Augment Dataset
label_counts = np.bincount(all_labels)
max_count = 100
balanced_questions = []
balanced_labels = []

for label in range(3):
    indices = [i for i, l in enumerate(all_labels) if l == label]
    questions = [all_questions[i] for i in indices]
    labels = [all_labels[i] for i in indices]
    repeat_factor = max_count // len(questions) if len(questions) > 0 else 1
    balanced_questions.extend(questions * min(repeat_factor, 3))
    balanced_labels.extend(labels * min(repeat_factor, 3))

augmented_questions = balanced_questions.copy()
augmented_labels = balanced_labels.copy()
for q, l in zip(balanced_questions, balanced_labels):
    if l == 2 and "file" in q:
        augmented_questions.extend([q.replace("file", "document"), q.replace("file", "dataset")])
        augmented_labels.extend([l, l])
    if l == 2 and "error" in q:
        augmented_questions.extend([q.replace("error", "issue"), q.replace("error", "problem")])
        augmented_labels.extend([l, l])
    if l == 2 and "status" in q:
        augmented_questions.extend([q.replace("status", "state"), q.replace("status", "progress")])
        augmented_labels.extend([l, l])
    if l == 1:
        augmented_questions.extend([q + "!", q.replace("hi", "hey"), q.replace("hello", "hola")])
        augmented_labels.extend([l, l, l])
    if l == 0:
        augmented_questions.extend([q.replace("app", "tool"), q.replace("system", "platform")])
        augmented_labels.extend([l, l])
print(f"Augmented examples: {len(augmented_questions)}")
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 12: Prepare BERT Data
inputs = tokenizer(augmented_questions, padding=True, truncation=True, max_length=128, return_tensors="pt")
input_ids = inputs['input_ids'].numpy()
attention_mask = inputs['attention_mask'].numpy()
labels = np.array(augmented_labels)
print(f"Unique labels in dataset: {set(labels)}")
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 13: Split Data
train_idx, test_idx = train_test_split(
    np.arange(len(labels)), 
    test_size=0.2, 
    random_state=42,
    stratify=labels
)

train_dataset = {
    'input_ids': input_ids[train_idx],
    'attention_mask': attention_mask[train_idx],
    'labels': torch.tensor(labels[train_idx])
}

test_dataset = {
    'input_ids': input_ids[test_idx],
    'attention_mask': attention_mask[test_idx],
    'labels': torch.tensor(labels[test_idx])
}
print(f"Train labels: {set(train_dataset['labels'].numpy())}")
print(f"Test labels: {set(test_dataset['labels'].numpy())}")
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 14: Training Loop with Early Stopping
optimizer = optim.AdamW(bert_model.parameters(), lr=2e-5)  # Reverted to stable LR
loss_fn = nn.CrossEntropyLoss()
best_f1 = 0
patience = 5  # Increased patience
no_improve = 0

batch_size = 16  # Added batching
num_batches = len(train_dataset['input_ids']) // batch_size + 1

bert_model.train()
for epoch in range(20):  # Increased epochs
    total_loss = 0
    for i in range(0, len(train_dataset['input_ids']), batch_size):
        batch_ids = train_dataset['input_ids'][i:i+batch_size]
        batch_mask = train_dataset['attention_mask'][i:i+batch_size]
        batch_labels = train_dataset['labels'][i:i+batch_size]
        
        optimizer.zero_grad()
        outputs = bert_model(
            input_ids=torch.tensor(batch_ids).to(device),
            attention_mask=torch.tensor(batch_mask).to(device),
            labels=batch_labels.to(device)
        )
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(bert_model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        total_loss += loss.item()
    
    avg_train_loss = total_loss / num_batches
    
    bert_model.eval()
    with torch.no_grad():
        test_outputs = bert_model(
            input_ids=torch.tensor(test_dataset['input_ids']).to(device),
            attention_mask=torch.tensor(test_dataset['attention_mask']).to(device),
            labels=test_dataset['labels'].to(device)
        )
        test_loss = test_outputs.loss.item()
        predictions = torch.argmax(test_outputs.logits, dim=1)
        accuracy = accuracy_score(test_dataset['labels'], predictions)
        report = classification_report(
            test_dataset['labels'],
            predictions,
            labels=[0, 1, 2],
            target_names=["General", "Greeting", "File-Related"],
            zero_division=0,
            output_dict=True
        )
        avg_f1 = (report["General"]["f1-score"] + report["Greeting"]["f1-score"] + report["File-Related"]["f1-score"]) / 3
        
    print(f"Epoch {epoch+1}")
    print(f"Train Loss: {avg_train_loss:.4f}")
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Average F1: {avg_f1:.4f}")
    print(classification_report(
        test_dataset['labels'],
        predictions,
        labels=[0, 1, 2],
        target_names=["General", "Greeting", "File-Related"],
        zero_division=0
    ))
    
    if avg_f1 > best_f1:
        best_f1 = avg_f1
        no_improve = 0
        torch.save(bert_model.state_dict(), "best_bert_intent_model.pt")
    else:
        no_improve += 1
        if no_improve >= patience:
            print("Early stopping triggered")
            break
    
    bert_model.train()
    print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 15: Test Model Locally
bert_model.load_state_dict(torch.load("best_bert_intent_model.pt"))
bert_model.eval()
test_queries = [
    "errors of file 6", "status of file 6", "hello", "what does this app do",
    "erors of file 6", "file 6 status", "hi", "how this system works",
    "hey", "validate my dataset", "is my data secure"
]
for query in test_queries:
    inputs = tokenizer(preprocess_text(query), return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = bert_model(
            input_ids=inputs['input_ids'].to(device),
            attention_mask=inputs['attention_mask'].to(device)
        )
    intent = torch.argmax(outputs.logits, dim=1).item()
    intent_name = ["General", "Greeting", "File-Related"][intent]
    print(f"Query: {query} -> Intent: {intent_name} (Label: {intent})")
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

# %%
# Cell 16: Save Model
print("XGBoost training skipped; using DistilBERT model only.")
torch.save(bert_model.state_dict(), "bert_intent_model.pt")
print("Model saved successfully:")
print(f"- DistilBERT model: bert_intent_model.pt ({os.path.getsize('bert_intent_model.pt')/1024:.1f} KB)")
print(f"Memory usage: {psutil.Process().memory_info().rss / 1024**2:.2f} MB")

Memory usage: 1011.15 MB
MongoDB connected successfully
Memory usage: 1013.69 MB
Memory usage: 1013.70 MB
Memory usage: 1013.73 MB
MongoDB Feedback: 68 questions, Labels: {0, 1, 2}
Memory usage: 1018.11 MB


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on cpu
Memory usage: 992.72 MB
Memory usage: 992.73 MB
Memory usage: 992.73 MB
Memory usage: 992.75 MB
Total training examples: 249
Greeting examples: 38
File-related examples: 128
General examples: 84
Unique labels: {0, 1, 2}
Memory usage: 992.78 MB


IndexError: list index out of range