<a href="https://colab.research.google.com/github/SasidharTA/NLP-Gen-AI-classroom/blob/main/Assignment-4/Colab_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments
from transformers.trainer_utils import IntervalStrategy, SaveStrategy
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer
import evaluate
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
import mlflow
import json

In [4]:
# Show the notebook's current working directory.
%pwd

'/content'

In [2]:
# Make the assignment folder the working directory; later cells read
# "Data/..." and write their output files using relative paths.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
%cd /content/drive/MyDrive/NLP-Gen-AI-classroom/Assignment-4/

/content/drive/MyDrive/NLP-Gen-AI-classroom/Assignment-4


In [13]:
# Install the project's dependencies. `%pip` (instead of `!pip`) installs into
# the environment of the kernel that is running this notebook, whereas `!pip`
# uses whichever pip happens to be first on the shell's PATH.
%pip install -r /content/drive/MyDrive/NLP-Gen-AI-classroom/requirements.txt

Collecting llama_index (from -r /content/drive/MyDrive/NLP-Gen-AI-classroom/requirements.txt (line 3))
  Downloading llama_index-0.14.8-py3-none-any.whl.metadata (13 kB)
Collecting rank_bm25 (from -r /content/drive/MyDrive/NLP-Gen-AI-classroom/requirements.txt (line 5))
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting faiss-cpu (from -r /content/drive/MyDrive/NLP-Gen-AI-classroom/requirements.txt (line 7))
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting langchain-community (from -r /content/drive/MyDrive/NLP-Gen-AI-classroom/requirements.txt (line 8))
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting pymupdf (from -r /content/drive/MyDrive/NLP-Gen-AI-classroom/requirements.txt (line 9))
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting chromadb (from -r /content/drive/MyDrive/NLP-Gen-AI-classroom/requirements

In [3]:
# --- Experiment configuration ---------------------------------------------
MODEL_NAME = "google/flan-t5-base"
MAX_SEQ_LENGTH = 1024      # truncation length used when tokenizing prompts
LORA_R = 16                # passed to LoraConfig(r=...)
LORA_ALPHA = 32            # passed to LoraConfig(lora_alpha=...)
NUM_TRAIN_EPOCHS = 3
LR = 2e-4
SEED = 42

# Seed the random number generators up front so that randomly initialised
# weights and sampling-dependent steps repeat across "Restart & Run All".
np.random.seed(SEED)
torch.manual_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Initialize MLflow Tracking
# We use a local (file-based) tracking URI on the mounted Drive for Colab.
MLFLOW_TRACKING_URI = "/content/drive/MyDrive/NLP-Gen-AI-classroom/Assignment-4/mlruns"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("IT-Giant_Table-to-Insights")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Using device: cuda


  return FileStore(store_uri, store_uri)


<Experiment: artifact_location='/content/drive/MyDrive/NLP-Gen-AI-classroom/Assignment-4/mlruns/219980271548056433', creation_time=1762844457572, experiment_id='219980271548056433', last_update_time=1762844457572, lifecycle_stage='active', name='IT-Giant_Table-to-Insights', tags={}>

In [4]:
FILE_PATH = "Data/table_insights_labeled_data.xlsx"
df_raw = pd.read_excel(FILE_PATH)

# Clean Column Names and Create Group IDs
# The first column is renamed positionally; the serial-number column by name.
new_columns = ['device_log_group', *df_raw.columns[1:]]
df_raw.columns = new_columns
df_raw = df_raw.rename(columns={'serialnumber_org': 'device_serial'})

# group_id = forward-filled log group + "_" + serial number rendered as text.
log_group_filled = df_raw['device_log_group'].ffill()
serial_as_text = df_raw['device_serial'].astype(str)
df_raw['group_id'] = log_group_filled + '_' + serial_as_text

# Serialization Function (As defined previously)
def group_and_serialize_logs(group):
    """Collapse the log rows of one group into a (prompt, target insight) pair.

    Args:
        group: DataFrame holding the rows of a single group. The columns
            'insight_0', 'device_serial', 'cpu_usage', 'ram_usage' and
            'diskio_usage' are read.

    Returns:
        pd.Series with two entries:
          * 'input_prompt'   - text prompt summarising the KPI columns,
          * 'target_insight' - the first usable 'insight_0' value of the group.
        Both entries are None when the group has no usable insight (every
        value is missing, the string '0' or the number 0).
    """
    kpi_cols = ['cpu_usage', 'ram_usage', 'diskio_usage']

    insight = group['insight_0']
    # Missing labels must be excluded explicitly: NaN != '0' is True, so without
    # the notna() test a leading blank row would be picked as the "insight" and
    # the whole group would later be thrown away as unlabeled.
    is_labelled = insight.notna() & (insight != '0') & (insight != 0)
    target_row = group[is_labelled].head(1)
    if target_row.empty:
        return pd.Series({'input_prompt': None, 'target_insight': None})

    target_insight = target_row['insight_0'].iloc[0]
    agg_data = group[kpi_cols].agg(['mean', 'max', 'count']).to_dict()
    null_counts = group[kpi_cols].isnull().sum().to_dict()

    # 'count' shares a column with 'mean'/'max' in the aggregated frame and is
    # therefore returned as a float; cast so the prompt reads "2", not "2.0".
    cpu_points = int(agg_data['cpu_usage']['count'])
    diskio_points = int(agg_data['diskio_usage']['count'])

    prompt_template = f"""
    Analyze the following machine usage log data for Device: {group['device_serial'].iloc[0]}.
    The data spans {cpu_points} data points.

    ### Key Performance Indicators (KPIs):
    - CPU Usage (Mean/Max): {agg_data['cpu_usage']['mean']:.2f}% / {agg_data['cpu_usage']['max']:.2f}%
    - RAM Usage (Mean/Max): {agg_data['ram_usage']['mean']:.2f}% / {agg_data['ram_usage']['max']:.2f}%

    ### Data Completeness:
    - Disk IO Usage: {diskio_points} non-null entries.
    - Missing CPU entries: {null_counts['cpu_usage']}

    TASK: Generate a concise, actionable business insight (similar to the provided GPT insight_0) based on this summary.
    """
    return pd.Series({'input_prompt': prompt_template.strip(), 'target_insight': target_insight})

# Apply Grouping: one (prompt, insight) record per group_id.
df_processed = (
    df_raw
    .groupby('group_id')
    .apply(group_and_serialize_logs)
    .reset_index()
)
# Groups without a usable insight carry a null target and are discarded;
# the remaining columns get the names used by the rest of the notebook.
df_processed = (
    df_processed
    .dropna(subset=['target_insight'])
    .rename(columns={'input_prompt': 'text', 'target_insight': 'summary'})
)

# Train/Test Split (80/20, fixed random_state)
train_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)
# Separate copy that later cells extend with prediction columns.
test_df_for_eval = test_df.copy()

# Convert to Hugging Face Dataset objects
model_columns = ['text', 'summary']
train_dataset = Dataset.from_pandas(train_df[model_columns])
test_dataset = Dataset.from_pandas(test_df_for_eval[model_columns])
print(train_dataset.column_names)

['text', 'summary', '__index_level_0__']


  df_processed = df_raw.groupby('group_id').apply(group_and_serialize_logs).reset_index()


In [5]:
#BASE MODEL INFERENCE AND BASELINE LOGGING


with mlflow.start_run(run_name="Base_Model_ZeroShot") as run:
    mlflow.log_param("model_name", MODEL_NAME)
    mlflow.log_param("tuning_method", "None (Zero-Shot)")
    mlflow.log_param("max_seq_length", MAX_SEQ_LENGTH)

    # device_map="auto" already places the weights on the available device(s),
    # so the model is not moved again with .to(DEVICE).
    # NOTE(review): the recorded run warns that `torch_dtype` is deprecated in
    # favour of `dtype`; the old keyword is kept here so older transformers
    # releases keep working.
    base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto")

    def generate_insight(prompt_text, model, tokenizer, max_length=150):
        """Generate one insight for one serialized log prompt.

        Args:
            prompt_text: prompt text produced for a device group.
            model: seq2seq model exposing `generate` and `device`.
            tokenizer: tokenizer matching `model`.
            max_length: `max_length` forwarded to `model.generate`.

        Returns:
            The decoded generation with special tokens removed and surrounding
            whitespace stripped.
        """
        input_text = f"Summarize machine usage logs into actionable business insights. INPUT: {prompt_text}"
        inputs = tokenizer(input_text, return_tensors="pt", max_length=MAX_SEQ_LENGTH, truncation=True)
        # Follow the model that was passed in rather than the global DEVICE so
        # the helper also works for models living on a different device.
        inputs = inputs.to(model.device)

        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=max_length, num_beams=4, do_sample=False, early_stopping=True)
        return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # Generate Predictions
    base_model_predictions = [generate_insight(prompt, base_model, tokenizer) for prompt in test_df_for_eval['text'].tolist()]
    test_df_for_eval['base_prediction'] = base_model_predictions
    references = test_df_for_eval['summary'].tolist()

    # Evaluation
    rouge_metric = evaluate.load("rouge")
    bleu_metric = evaluate.load("bleu")
    semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

    def calculate_semantic_match(predictions, references, model):
        """Mean cosine similarity between each prediction and its own reference.

        Args:
            predictions: list of generated texts.
            references: list of reference texts, aligned with `predictions`.
            model: sentence-embedding model exposing `encode`.

        Returns:
            Average of the pairwise (i-th prediction vs. i-th reference)
            cosine similarities as a Python float.
        """
        pred_embeddings = model.encode(predictions, convert_to_tensor=True)
        ref_embeddings = model.encode(references, convert_to_tensor=True)
        # Only matching pairs are needed, so compare row i with row i directly
        # instead of building the full N x N similarity matrix.
        pair_scores = torch.nn.functional.cosine_similarity(pred_embeddings, ref_embeddings, dim=1)
        return pair_scores.mean().item()

    base_rouge = rouge_metric.compute(predictions=base_model_predictions, references=references)
    base_bleu = bleu_metric.compute(predictions=base_model_predictions, references=references, max_order=4)
    base_semantic = calculate_semantic_match(base_model_predictions, references, semantic_model)

    # Log Metrics to MLflow
    mlflow.log_metric("base_bleu_4", round(base_bleu['bleu'], 4))
    mlflow.log_metric("base_rouge_l", round(base_rouge['rougeL'], 4))
    mlflow.log_metric("base_semantic_match", round(base_semantic, 4))
    print("Base Model Metrics Logged to MLflow.")

    # Save the base model results for comparison
    test_df_for_eval[['text', 'summary', 'base_prediction']].to_csv("base_model_predictions.csv", index=False)
    mlflow.log_artifact("base_model_predictions.csv")

config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Base Model Metrics Logged to MLflow.


In [6]:
#LORA FINE-TUNING AND MLFLOW LOGGING


with mlflow.start_run(run_name="LoRA_FineTuned_FlanT5") as run:
    # 4.1 Log Parameters
    mlflow.log_param("model_name", MODEL_NAME)
    mlflow.log_param("tuning_method", "LoRA PEFT")
    mlflow.log_param("lora_r", LORA_R)
    mlflow.log_param("lora_alpha", LORA_ALPHA)
    mlflow.log_param("num_epochs", NUM_TRAIN_EPOCHS)
    mlflow.log_param("learning_rate", LR)

    def _fix_json_serialization(obj):
        """`json.dump` fallback: turn sets and tuples into lists, reject anything else."""
        if isinstance(obj, (set, tuple)):
            return list(obj)
        raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")

    # 4.2 Data Formatting for SFT
    def format_for_sft(example):
        """Merge prompt and target of one example into a single "text" field.

        Returns a dict whose "text" is
        "<instruction> INPUT: <text> TARGET: <summary><eos>".
        """
        prompt = f"Summarize machine usage logs into actionable business insights. INPUT: {example['text']}"
        target = example['summary']
        return {"text": f"{prompt} TARGET: {target}{tokenizer.eos_token}"}

    cols_to_remove = ['summary', '__index_level_0__']
    train_cols = train_dataset.column_names
    test_cols = test_dataset.column_names

    # Filter the list to only include existing columns
    cols_to_remove_train = [col for col in cols_to_remove if col in train_cols]
    cols_to_remove_test = [col for col in cols_to_remove if col in test_cols]

    print(f"Columns to remove from train_dataset: {cols_to_remove_train}")
    print(f"Columns to remove from test_dataset: {cols_to_remove_test}")

    # Use the filtered lists (a hard-coded list fails as soon as one of the
    # columns is absent) and skip datasets that were already formatted, so the
    # cell can be re-run without a KeyError on the missing 'summary' column.
    if 'summary' in train_cols:
        train_dataset = train_dataset.map(format_for_sft, remove_columns=cols_to_remove_train)
    if 'summary' in test_cols:
        test_dataset = test_dataset.map(format_for_sft, remove_columns=cols_to_remove_test)

    # 4.3 Configure and Apply LoRA
    lora_config = LoraConfig(
        r=LORA_R, lora_alpha=LORA_ALPHA, target_modules=["q", "v"],
        lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM",
    )
    # Log the full config as a JSON artifact
    with open("lora_config.json", "w") as f:
        json.dump(lora_config.to_dict(), f, default=_fix_json_serialization)
    mlflow.log_artifact("lora_config.json")

    model_for_tuning = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto")
    model_for_tuning = get_peft_model(model_for_tuning, lora_config)

    # 4.4 Training Arguments and SFTTrainer
    # The weights are loaded in bfloat16, so mixed precision uses bf16 when the
    # GPU supports it instead of fp16. fp16=True also makes TrainingArguments
    # fail on a CPU-only runtime, which DEVICE explicitly allows for.
    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    # Evaluate and checkpoint once per epoch. The recorded run has 172 training
    # examples and an effective batch of 16, i.e. roughly 33 optimizer steps in
    # total, so a 100-step interval never fires and load_best_model_at_end
    # would have no checkpoint to pick from.
    training_args = TrainingArguments(
        output_dir="./lora_results",
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=4,
        learning_rate=LR,
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        bf16=use_bf16,
        fp16=False,
        load_best_model_at_end=True,
        report_to="mlflow",
    )

    # NOTE(review): SFTTrainer receives a single concatenated "text" field
    # (prompt + " TARGET: " + insight). Confirm that this yields the intended
    # encoder-input / decoder-label split for a seq2seq model, because
    # generate_insight() feeds only the prompt at inference time.
    trainer = SFTTrainer(
        model=model_for_tuning,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer
    )

    print("\n--- Starting LoRA Fine-Tuning ---")
    trainer.train()

    # 4.5 Save Adapter and Log Model
    ADAPTER_PATH = "flan_t5_table_insights_adapter"
    trainer.model.save_pretrained(ADAPTER_PATH)
    tokenizer.save_pretrained(ADAPTER_PATH)

    # Log the adapter weights and tokenizer to MLflow
    mlflow.log_artifact(ADAPTER_PATH)
    print("Fine-Tuned Adapter Logged to MLflow Artifacts.")

Columns to remove from train_dataset: ['summary', '__index_level_0__']
Columns to remove from test_dataset: ['summary', '__index_level_0__']


Map:   0%|          | 0/172 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/172 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/172 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/172 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/43 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/43 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/43 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.



--- Starting LoRA Fine-Tuning ---


Step,Training Loss,Validation Loss


Fine-Tuned Adapter Logged to MLflow Artifacts.


In [14]:
# Try to expose port 5000 through the `lt` command-line tool.
# NOTE(review): the recorded output is "lt: command not found", so this cell has
# no effect in this runtime. Presumably `lt` is the localtunnel CLI and would
# have to be installed first — confirm, or delete the cell.
!lt --port 5000 --password mlflow

/bin/bash: line 1: lt: command not found


In [17]:
# Stop any MLflow server left over from an earlier attempt.
!pkill -f 'mlflow'
# Match the tunnel by its invocation. With -f the pattern is searched in the
# whole command line, so a bare 'lt' would also hit every unrelated process
# whose arguments merely contain the letters "lt".
!pkill -f 'lt --port'

In [20]:
import time
import subprocess
from pyngrok import ngrok

MLFLOW_PORT = 5000

# Start the MLflow UI in the background. Its output goes to a log file instead
# of subprocess.PIPE: nothing in this notebook reads the pipes, and an unread
# pipe that fills up blocks the child process. The child keeps its own copy of
# the file descriptor, so closing our handle right away is safe.
with open("mlflow_ui.log", "w") as mlflow_log:
    mlflow_process = subprocess.Popen(
        ['mlflow', 'ui', '--host', '0.0.0.0', '--port', str(MLFLOW_PORT)],
        stdout=mlflow_log,
        stderr=subprocess.STDOUT,
    )

print("Waiting 10 seconds for the MLflow server to fully start...")
time.sleep(10)

if mlflow_process.poll() is not None:
    # The server already exited (e.g. port in use); tunnelling would be pointless.
    print(f"\n❌ ERROR: MLflow UI exited with code {mlflow_process.returncode}. See mlflow_ui.log.")
else:
    try:
        ngrok.kill()  # Ensure no residual tunnels
        # This call requires the authenticated ngrok token
        public_url = ngrok.connect(MLFLOW_PORT).public_url
        print(f"\n✅ MLflow Tracking UI is available at: {public_url}")
    except Exception as e:
        print(f"\n❌ ERROR: Authentication or tunneling failed. Details: {e}")
        # Without a tunnel the server is unreachable, so shut it down again.
        mlflow_process.terminate()

Waiting 10 seconds for the MLflow server to fully start...


ERROR:pyngrok.process.ngrok:t=2025-11-11T07:13:05+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-11-11T07:13:05+0000 lvl=eror msg="session closing" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"
ERROR:pyngrok.process.ngrok:t=2025-11-11T07:13:05+0000 lvl=eror msg="terminating with error" obj=app err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your aut


❌ ERROR: Authentication or tunneling failed. Details: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.


In [19]:
# Install pyngrok into the running kernel's environment (`%pip`, not `!pip`).
# The version is pinned to the release the recorded run installed.
%pip install pyngrok==7.4.1

Collecting pyngrok
  Downloading pyngrok-7.4.1-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.4.1-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.4.1


In [21]:
import threading
import subprocess
import time
from IPython.display import display, HTML

MLFLOW_PORT = 5000

def run_mlflow():
    """Run the MLflow UI in the foreground on MLFLOW_PORT.

    `subprocess.run` blocks until the server exits, so this function is meant
    to be the target of a background thread.

    Raises:
        subprocess.CalledProcessError: if the server exits with a non-zero code.
    """
    print("Starting MLflow server...")
    subprocess.run(
        ['mlflow', 'ui', '--host', '0.0.0.0', '--port', str(MLFLOW_PORT)],
        check=True
    )

# 1. Kill any existing MLflow processes (pkill exits non-zero when nothing
#    matched, which is fine here, hence check=False).
subprocess.run(['pkill', '-f', 'mlflow'], check=False)

# 2. Start the MLflow server in a separate thread
mlflow_thread = threading.Thread(target=run_mlflow, daemon=True)
mlflow_thread.start()

# 3. Wait for the server to initialize
print("Waiting 10 seconds for the MLflow server to initialize...")
time.sleep(10)

# 4. Show an access link, but only claim success if the server thread is still
#    running; a thread that already finished means `mlflow ui` exited.
# NOTE(review): display(HTML(...)) only renders markup, it does not set up any
# port forwarding, and the href targets localhost on the machine running the
# browser. Confirm the port is actually reachable from there.
if mlflow_thread.is_alive():
    access_html = f"""
<p>✅ <strong>MLflow Server is running.</strong> Click the link below to access the UI:</p>
<p>
    <a href="http://localhost:{MLFLOW_PORT}" target="_blank">
        <button style="background-color:#4CAF50;color:white;padding:10px 20px;border:none;border-radius:5px;cursor:pointer;">
            Open MLflow UI (Port {MLFLOW_PORT})
        </button>
    </a>
</p>
<p>
    (If the button does not work, check the "Port 5000" widget usually visible below this output.)
</p>
"""
else:
    access_html = "<p>❌ <strong>MLflow server exited during startup.</strong> See the error printed above.</p>"
display(HTML(access_html))

Starting MLflow server...
Waiting 10 seconds for the MLflow server to initialize...


In [1]:
# Register the ngrok authtoken with the ngrok CLI.
# The token is read from the NGROK_AUTHTOKEN environment variable instead of
# being written into the notebook. The value that used to be hard-coded on this
# line has been shared together with the file and should be revoked.
# `$$` is IPython's escape for a literal `$`, so the shell expands the variable.
# NOTE(review): the recorded output is "ngrok: command not found", i.e. the CLI
# is not on PATH in this runtime.
!ngrok authtoken "$$NGROK_AUTHTOKEN"

/bin/bash: line 1: ngrok: command not found


In [9]:
import os
from getpass import getpass

from pyngrok import conf

# Never keep the authtoken in the notebook: take it from the NGROK_AUTHTOKEN
# environment variable, or ask for it interactively (input is not echoed and is
# not stored in the cell output). The token that used to be hard-coded here has
# been shared together with the file and should be revoked.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
conf.get_default().auth_token = ngrok_auth_token

In [4]:
# Install pyngrok into the running kernel's environment (`%pip`, not `!pip`).
# The version is pinned to the release the recorded run installed.
%pip install pyngrok==7.4.1

Collecting pyngrok
  Downloading pyngrok-7.4.1-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.4.1-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.4.1


In [10]:
import time
import subprocess
from pyngrok import ngrok

MLFLOW_PORT = 5000

# 1. Kill any existing ngrok tunnels and MLflow processes for a clean start
try:
    ngrok.kill()
    print("Killed existing ngrok tunnels.")
except Exception as exc:
    # Best effort only, but report the reason instead of hiding it.
    print(f"Could not stop an existing ngrok process: {exc}")
# pkill exits non-zero when nothing matched, which is fine here.
subprocess.run(['pkill', '-f', 'mlflow'], check=False)
time.sleep(1) # Give the system a moment to clear processes

# 2. Start MLflow UI in the background. Output goes to a log file rather than
#    subprocess.PIPE: nothing reads the pipes here, and an unread pipe that
#    fills up blocks the child. The child keeps its own copy of the descriptor,
#    so closing our handle immediately is safe.
print(f"Starting MLflow UI on port {MLFLOW_PORT}...")
with open("mlflow_ui.log", "w") as mlflow_log:
    mlflow_process = subprocess.Popen(
        ['mlflow', 'ui', '--host', '0.0.0.0', '--port', str(MLFLOW_PORT)],
        stdout=mlflow_log,
        stderr=subprocess.STDOUT,
    )

# 3. Wait for the server to initialize (Crucial Step)
print("Waiting 10 seconds for the MLflow server to fully start...")
time.sleep(10)

if mlflow_process.poll() is not None:
    # The server already exited (e.g. port in use); do not open a tunnel to nothing.
    print(f"\n❌ ERROR: MLflow UI exited with code {mlflow_process.returncode}. See mlflow_ui.log.")
else:
    # 4. Create the public ngrok tunnel using the now-authenticated client
    try:
        # Use the public_url attribute directly after connection
        public_url = ngrok.connect(MLFLOW_PORT).public_url
        print(f"\n✅ MLflow Tracking UI is available at: {public_url}")
        print("Please copy and paste the URL above into your browser.")

    except Exception as e:
        print("\n❌ ERROR: Failed to establish ngrok connection.")
        print(f"Details: {e}")
        # Terminate MLflow server process
        mlflow_process.terminate()

# Note: The server is running in the background. It will stop if you close the notebook.

Killed existing ngrok tunnels.
Starting MLflow UI on port 5000...
Waiting 10 seconds for the MLflow server to fully start...

✅ MLflow Tracking UI is available at: https://malignantly-unprompted-rhonda.ngrok-free.dev
Please copy and paste the URL above into your browser.


In [7]:
# Show the notebook's current working directory.
%pwd

'/content'

In [8]:
# Switch the working directory to the assignment folder.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
%cd '/content/drive/MyDrive/NLP-Gen-AI-classroom/Assignment-4/'

/content/drive/MyDrive/NLP-Gen-AI-classroom/Assignment-4
