In [1]:
!pip install trl

Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Downloading trl-0.16.1-py3-none-any.whl (336 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.4/336.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.16.1


In [2]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [3]:
!pip install peft



In [4]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl (76.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.4


In [5]:
import os
import json
import pandas as pd
import torch
import re
import ast
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from pathlib import Path
from kaggle_secrets import UserSecretsClient

In [6]:
# Get HuggingFace token
def get_hf_token():
    """Return the value of the Kaggle secret named ``HF_TOKEN``."""
    return UserSecretsClient().get_secret("HF_TOKEN")

def safe_eval(s):
    """Parse a string containing a Python literal, returning the input on failure.

    Args:
        s: Any value. Only ``str`` inputs are parsed; everything else is
            returned unchanged.

    Returns:
        The object produced by ``ast.literal_eval(s)`` when ``s`` is a string
        holding a valid literal (list, dict, number, ...); otherwise ``s`` itself.
    """
    if not isinstance(s, str):
        return s
    try:
        return ast.literal_eval(s)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # literal_eval raises more than SyntaxError/ValueError on malformed
        # input (e.g. TypeError for "{[1]: 2}"); all of them mean "not a usable
        # literal", so fall back to the raw string instead of crashing.
        return s

In [7]:
def load_employee_data(file_path="/kaggle/input/distressed-employees/distressed_employees_new.csv"):
    """Read the employee CSV and decode its list-valued columns.

    The 'Problems' and 'Other Problems' columns, when present, are passed
    through ``safe_eval`` so string representations of lists become lists.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The loaded DataFrame, or ``None`` if anything goes wrong (the error is
        printed rather than raised).
    """
    list_columns = ('Problems', 'Other Problems')
    try:
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found at {file_path}")

        frame = pd.read_csv(file_path)

        for column_name in list_columns:
            if column_name not in frame.columns:
                continue
            # safe_eval already returns non-string cells untouched.
            frame[column_name] = frame[column_name].apply(safe_eval)

        return frame
    except Exception as e:
        print(f"Error loading employee data: {e}")
        return None

In [8]:
def load_model_and_tokenizer():
    """Load Mistral-7B-Instruct-v0.2 in 4-bit NF4 form together with its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer pads on the left and
        reuses the EOS token as its padding token.
    """
    print("Loading Mistral model...")
    checkpoint = "mistralai/Mistral-7B-Instruct-v0.2"
    access_token = get_hf_token()

    # 4-bit NF4 weights with double quantization; matmuls run in float16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        quantization_config=quantization,
        device_map="auto",
        trust_remote_code=True,
        token=access_token,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        padding_side="left",
        token=access_token,
    )
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [18]:
def extract_conversations_from_qna(chat_history):
    """Flatten a chat history into plain "Speaker: text" paragraphs.

    Args:
        chat_history: One of
            * a falsy value (``None``, ``""``, ``[]``, ...);
            * a path to an existing file (JSON, or arbitrary text);
            * a JSON string, or free text;
            * a list of entries, each either a string or a dict in one of the
              shapes ``{direction, message}``, ``{role, content}``,
              ``{question, answer}`` or a dict with a ``user`` / ``bot`` /
              ``assistant`` / ``system`` key;
            * a dict wrapping such a list under ``messages`` or ``history``,
              or a flat dict of string values.

    Returns:
        The formatted conversation (blank-line separated, stripped).
        "No chat history available" is returned when the input, or the JSON
        decoded from it, is empty. Text that is not valid JSON is returned
        unchanged.

    Note:
        A string that is neither an existing file nor valid JSON is treated as
        the conversation text itself, so a mistyped path ends up in the output
        verbatim.
    """
    if not chat_history:
        return "No chat history available"

    if isinstance(chat_history, str):
        # isfile (not exists) so a directory path is not handed to open().
        if os.path.isfile(chat_history):
            # JSON text is UTF-8 by specification; do not depend on the locale.
            with open(chat_history, 'r', encoding='utf-8') as f:
                raw_text = f.read()
            try:
                chat_history = json.loads(raw_text)
            except json.JSONDecodeError:
                return raw_text
        else:
            try:
                chat_history = json.loads(chat_history)
            except json.JSONDecodeError:
                return chat_history

        # A payload such as "[]", "{}" or "null" carries no conversation.
        if not chat_history:
            return "No chat history available"

    parts = []
    if isinstance(chat_history, list):
        for entry in chat_history:
            if isinstance(entry, dict):
                if 'direction' in entry and 'message' in entry:
                    # Entries with any other direction value are ignored.
                    if entry['direction'] == 'sent':
                        parts.append(f"User: {entry['message']}\n\n")
                    elif entry['direction'] == 'received':
                        parts.append(f"Bot: {entry['message']}\n\n")
                elif 'role' in entry and 'content' in entry:
                    role = entry['role']
                    # JSON null / numeric roles used to raise AttributeError.
                    speaker = role.capitalize() if isinstance(role, str) else "Unknown"
                    parts.append(f"{speaker}: {entry['content']}\n\n")
                elif 'question' in entry and 'answer' in entry:
                    parts.append(f"User: {entry['question']}\nBot: {entry['answer']}\n\n")
                elif any(key in entry for key in ['user', 'bot', 'assistant', 'system']):
                    for key, value in entry.items():
                        if isinstance(value, str) and value.strip():
                            parts.append(f"{str(key).capitalize()}: {value}\n\n")
            elif isinstance(entry, str):
                parts.append(f"{entry}\n\n")
    elif isinstance(chat_history, dict):
        if 'messages' in chat_history:
            return extract_conversations_from_qna(chat_history['messages'])
        elif 'history' in chat_history:
            return extract_conversations_from_qna(chat_history['history'])
        else:
            for key, value in chat_history.items():
                if isinstance(value, str) and value.strip():
                    parts.append(f"{str(key).capitalize()}: {value}\n\n")
    else:
        parts.append(str(chat_history))

    return "".join(parts).strip()

In [10]:
def _collect_scored_issues(raw_items):
    """Convert ``[issue, score, ...]`` pairs into ``{"issue", "score"}`` dicts, skipping malformed entries."""
    if not isinstance(raw_items, (list, tuple)):
        return []
    return [
        {"issue": item[0], "score": item[1]}
        for item in raw_items
        # ast.literal_eval yields tuples for "(a, b)" literals, so accept both.
        if isinstance(item, (list, tuple)) and len(item) >= 2
    ]


def _json_fallback(value):
    """Make non-JSON-native values serializable: unwrap scalars exposing ``.item()``, else ``str``."""
    item = getattr(value, "item", None)
    if callable(item):
        try:
            return item()
        except (TypeError, ValueError):
            pass
    return str(value)


def format_employee_data(employee_data):
    """Render one employee record as an indented JSON string for the prompt.

    Args:
        employee_data: Mapping-like object supporting ``.get`` (a dict built
            from a DataFrame row in this notebook).

    Returns:
        A JSON string with the keys "Employee ID", "Average Work Hours",
        "Performance Rating", "Reward Factor", "Vibe Factor", "Anomaly Score",
        "Problems" and "Other Problems". Missing scalar fields become ``""``.
        The two problem fields are lists of ``{"issue", "score"}`` objects
        built from entries holding at least two items.
    """
    formatted_data = {
        "Employee ID": employee_data.get("Employee_ID", ""),
        "Average Work Hours": employee_data.get("Average Work Hours", ""),
        "Performance Rating": employee_data.get("Performance Rating", ""),
        "Reward Factor": employee_data.get("Reward Factor", ""),
        "Vibe Factor": employee_data.get("Vibe Factor", ""),
        # "Anamaly_Score" is the key the original code read (presumably the CSV
        # header spelling - verify); the correctly spelled name is a fallback.
        "Anomaly Score": employee_data.get(
            "Anamaly_Score", employee_data.get("Anomaly_Score", "")
        ),
        "Problems": _collect_scored_issues(employee_data.get("Problems", [])),
        "Other Problems": _collect_scored_issues(employee_data.get("Other Problems", [])),
    }

    # default= keeps NumPy scalars and other non-native values from raising.
    return json.dumps(formatted_data, indent=2, default=_json_fallback)

In [21]:
def generate_hr_report(model, tokenizer, employee_id, employee_data, chat_history, max_new_tokens=2048):
    """Generate a markdown HR report for one employee with the Mistral model.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        employee_id: Identifier interpolated into the prompt.
        employee_data: Record passed to ``format_employee_data``.
        chat_history: Conversation passed to ``extract_conversations_from_qna``.
        max_new_tokens: Upper bound on the number of generated tokens. Unlike
            the previous ``max_length`` bound, it does not count the prompt.

    Returns:
        The generated report text (prompt excluded), stripped of surrounding
        whitespace.
    """
    processed_chat = extract_conversations_from_qna(chat_history)
    formatted_data = format_employee_data(employee_data)

    # Mistral instruct format is "[INST] ... [/INST]". The tokenizer adds the
    # BOS token itself, so no literal "<s>" is written here, and the Llama-2
    # "<</SYS>>" tag (which had no opening counterpart) is dropped.
    prompt = f"""[INST]
    You are an AI assistant that analyzes conversations between employees and a chatbot, then creates comprehensive HR reports based on distressed employee data. Your task is to:
    
    1. Analyze conversation tone/content
    2. Identify employee concerns
    3. Extract engagement/satisfaction insights
    4. Assess risks
    5. Recommend actionable steps
    
    CRITICAL INSTRUCTION: NEVER use "N/A" or placeholder text anywhere in your report. If data appears missing, make reasonable inferences based on available information or provide generic but meaningful content.
    
    Create a STRUCTURED report with these REQUIRED sections:
    
    ## 💼 Employee Summary
    • Basic Info:
      - Employee ID: {employee_id}
      - Problem indicators (from Problems and Other Problems columns)
      - Work hours and performance metrics
    • Quantitative Metrics:
      - Anomaly Score interpretation
      - Vibe Factor analysis
      - Reward Factor evaluation
      - Performance Rating context
      - Average Work Hours assessment
    • Qualitative Analysis:
      - Top issues identified in Problems column
      - Secondary issues from Other Problems column
      - Risk patterns and correlations
    
    ## 🔍 Key Insights
    • Technical Observations:
      - Analyze highest-scoring problem factors 
      - Identify patterns across problem indicators
    • Quantitative Analysis:
      - Compare metrics to normal ranges
      - Analyze correlation between Problems and numeric indicators
    
    ## 🚨 Risk Assessment
    • Concerns:
      - Low-level concerns (scores 0.1-0.3)
      - Medium-level concerns (scores 0.3-0.6)
      - High-level concerns (scores >0.6)
    • Anomalies:
      - Interpret Anomaly Score relative to company average
      - Analyze unusual patterns in Problems and Other Problems
    • Indicators:
      - Connect conversation content with identified problem areas
      - Highlight behavioral indicators from chat history
    
    ## 📈 Recommended Actions
    • Critical Steps:
      - 3-5 highest priority actions targeting top problem areas
      - Specific responsibility assignments (HR/manager/employee)
      - Clear timelines for implementation
    • Additional Considerations:
      - Follow-up procedures based on Anomaly Score
      - Support resources for identified Problems
      - Preventive measures for potential issues
    
    FORMATTING RULES:
    ✅ Use EXACT section headers with emojis as shown above
    ✅ Bold subsection headers using ** **
    ✅ Each bullet point must contain 15+ meaningful words
    ✅ Use • for main points and + for subpoints
    ✅ Include ALL data points from employee information
    
    CRITICAL REQUIREMENTS:
    ✅ NO PLACEHOLDERS - never use "N/A," "TBD," or similar text
    ✅ Balanced section lengths
    ✅ Complete action items addressing ALL risks
    ✅ Specific timelines and responsibility assignments
    
    If you're running out of space, be more concise in earlier sections but NEVER use placeholder text or leave sections incomplete.
    
    Employee: {employee_id}
    Data: {formatted_data}
    Conversation: {processed_chat}
    
    Generate a COMPLETE HR report with NO placeholder text like "N/A". Every section must contain meaningful content based on the distressed employees dataset structure and chat history. [/INST]"""

    # Follow the model's own placement (device_map="auto") instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    output = model.generate(
        **inputs,
        # Bounds only the continuation, so a long chat history cannot consume
        # the whole budget the way a total max_length could.
        max_new_tokens=max_new_tokens,
        # temperature / top_p only take effect when sampling is enabled.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.15,
        # Explicit pad id avoids the "Setting pad_token_id to eos_token_id" fallback.
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; slice off the prompt tokens
    # rather than searching the decoded text for "[/INST]", which the chat
    # history itself could contain.
    generated_tokens = output[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

In [23]:
# "#", "##", ... followed by the heading text.
_HEADING_RE = re.compile(r'^(#{1,6})\s*(.*)$')
# "-", "•", "+" (space optional), "* " or "12." / "12)" followed by whitespace.
_BULLET_RE = re.compile(r'^(?:[-•+]\s*|\*\s+|\d+[.)]\s+)')
# "**Label:** rest" or "**Label**: rest".
_BOLD_LABEL_RE = re.compile(r'^\*\*(.+?)\*\*\s*:?\s*(.*)$')

# Subsection titles requested by the prompt that differ from the JSON key.
_SUBSECTION_ALIASES = {"qualitative analysis": "Qualitative Metrics"}


def _match_section(title, sections):
    """Return the section key whose name contains / is contained in ``title``, or None."""
    name = re.sub(r'[^\w\s]', '', title).strip().lower()
    if not name:
        # An empty title would be "contained" in every key; treat as no match.
        return None
    for key in sections:
        lowered = key.lower()
        if name in lowered or lowered in name:
            return key
    return None


def _match_subsection(title, candidates, lenient):
    """Return the subsection key named by ``title`` (exact, alias, or substring if ``lenient``), or None."""
    name = title.replace('*', '').strip().rstrip(':').strip().lower()
    if not name:
        return None
    alias = _SUBSECTION_ALIASES.get(name)
    if alias in candidates:
        return alias
    for key in candidates:
        lowered = key.lower()
        if name == lowered or (lenient and (name in lowered or lowered in name)):
            return key
    return None


def _split_label(text):
    """Split ``text`` into ``(label, remainder, is_bold)`` around a leading bold or colon-terminated label."""
    bold = _BOLD_LABEL_RE.match(text)
    if bold:
        return bold.group(1), bold.group(2).strip(), True
    head, _, tail = text.partition(':')
    return head, tail.strip(), False


def convert_markdown_to_structured_json(markdown_text):
    """Parse the markdown HR report into a nested ``{section: {subsection: [bullets]}}`` dict.

    Recognised structure:
        * ``#`` / ``##`` headings select a section (emoji and punctuation are
          ignored; substring matches are accepted). An unrecognised heading
          closes the current section so its bullets are not misfiled.
        * Subsections are selected by ``###`` headings, or by a bullet or
          line whose label is exactly a subsection name, e.g.
          ``• **Basic Info:**``, ``- Concerns:`` or ``**Indicators**: text``.
          The prompt's "Qualitative Analysis" maps to "Qualitative Metrics".
        * Bullets may start with ``-``, ``•``, ``+``, ``* `` or any ``N.`` /
          ``N)`` number, and are stored without the marker.

    Bullets that appear before any subsection is selected are ignored, as
    are bold labels that carry no text and do not name a subsection.

    Args:
        markdown_text: Report text; ``None`` is treated as empty.

    Returns:
        Dict with the four section keys "Employee Summary", "Key Insights",
        "Risk Assessment" and "Recommended Actions"; subsections that
        received no bullets are removed.
    """
    report = {
        "Employee Summary": {
            "Basic Info": [],
            "Quantitative Metrics": [],
            "Qualitative Metrics": []
        },
        "Key Insights": {
            "Technical Observations": [],
            "Quantitative Analysis": []
        },
        "Risk Assessment": {
            "Concerns": [],
            "Anomalies": [],
            "Indicators": []
        },
        "Recommended Actions": {
            "Critical Steps": [],
            "Additional Considerations": []
        }
    }

    current_section = None
    current_subsection = None

    for raw_line in (markdown_text or "").split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        heading = _HEADING_RE.match(line)
        if heading:
            level, title = len(heading.group(1)), heading.group(2)
            if level <= 2:
                current_section = _match_section(title, report)
                current_subsection = None
            elif current_section:
                matched = _match_subsection(title, report[current_section], lenient=True)
                if matched:
                    current_subsection = matched
            continue

        if not current_section:
            continue

        marker = _BULLET_RE.match(line)
        text = line[marker.end():].strip() if marker else line
        # Skips empty bullets, "**" leftovers and horizontal rules like "---".
        if not any(ch.isalnum() for ch in text):
            continue

        label, remainder, is_bold = _split_label(text)
        matched = _match_subsection(label, report[current_section], lenient=False)
        if matched:
            # Subsection header written as a bullet or bold line; keep any
            # text that follows the label on the same line.
            current_subsection = matched
            if remainder:
                report[current_section][matched].append(remainder)
            continue

        # Plain prose lines are ignored; only bullets contribute content.
        if marker is None or current_subsection is None:
            continue
        # A bold label with nothing after it is an unrecognised sub-header.
        if is_bold and not remainder:
            continue

        report[current_section][current_subsection].append(text)

    # Drop subsections that ended up empty; section keys are always kept.
    for subsections in report.values():
        for name in [n for n, items in subsections.items() if not items]:
            del subsections[name]

    return report

In [13]:
def save_report_to_json(report_data, employee_id, output_dir="./reports"):
    """Write ``report_data`` to a timestamped JSON file and return its path.

    The file is named ``hr_report_<employee_id>_<YYYYMMDD_HHMMSS>.json`` and is
    placed in ``output_dir``, which is created (with parents) when missing.

    Args:
        report_data: JSON-serializable object to store.
        employee_id: Identifier embedded in the file name.
        output_dir: Destination directory.

    Returns:
        The path of the written file as a string.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    stamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
    filename = target_dir.joinpath(f"hr_report_{employee_id}_{stamp}.json")

    with filename.open('w') as handle:
        json.dump(report_data, handle, indent=2)

    print(f"Report saved to {filename}")
    return str(filename)

In [14]:
def generate_hr_summary(employee_id, chat_history=None, output_dir="./reports"):
    """Run the whole pipeline for one employee: load data, generate, parse, save.

    Args:
        employee_id: Value looked up in the dataset's 'Employee_ID' column.
        chat_history: Optional conversation forwarded to ``generate_hr_report``.
        output_dir: Directory the JSON report is written to.

    Returns:
        ``{"status": "success", "employee_id": ..., "report_file": ...}`` on
        success, or ``{"error": ...}`` when the dataset cannot be loaded or the
        employee is not present in it.
    """
    print(f"Generating HR summary for employee {employee_id}")

    dataset = load_employee_data("/kaggle/input/distressed-employees/distressed_employees_new.csv")
    if dataset is None:
        return {"error": "Failed to load employee data"}

    matches = dataset[dataset['Employee_ID'] == employee_id]
    if len(matches) == 0:
        return {"error": f"Employee ID {employee_id} not found in the dataset"}

    # Only the first matching row is used.
    record = matches.iloc[0].to_dict()

    # The model is loaded on every call.
    model, tokenizer = load_model_and_tokenizer()
    markdown_report = generate_hr_report(model, tokenizer, employee_id, record, chat_history)
    structured_report = convert_markdown_to_structured_json(markdown_report)

    final_report = {
        "employee_id": employee_id,
        "report_date": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
        "report": structured_report,
        "raw_markdown": markdown_report
    }
    saved_path = save_report_to_json(final_report, employee_id, output_dir)

    return {
        "status": "success",
        "employee_id": employee_id,
        "report_file": saved_path
    }

In [24]:
# Main execution function with command line arguments
def main():
    """Command-line entry point: parse arguments, run the pipeline, print the result as JSON."""
    import argparse

    parser = argparse.ArgumentParser(description='Generate HR summary reports')
    argument_specs = (
        ('--employee_id', {'type': str, 'required': True, 'help': 'Employee ID'}),
        ('--chat_file', {'type': str, 'help': 'Path to chat history JSON file'}),
        ('--output_dir', {'type': str, 'default': './reports', 'help': 'Output directory for reports'}),
    )
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    outcome = generate_hr_summary(args.employee_id, args.chat_file, args.output_dir)
    print(json.dumps(outcome, indent=2))

# Interactive entry point used in the notebook in place of main(): the inputs
# are read with input() instead of being parsed from the command line.
if __name__ == "__main__":
    # Strip stray whitespace so " EMP0040 " still matches the dataset value.
    employee_id = input("Enter Employee ID (e.g., EMP0418): ").strip()

    # Blank (or whitespace-only) input means "no chat history".
    chat_file = input("Enter chat file path (optional, press Enter to skip): ").strip() or None

    # Use default output directory or customize
    output_dir = "./reports"

    # Generate report
    result = generate_hr_summary(employee_id, chat_file, output_dir)
    print(json.dumps(result, indent=2))

Enter Employee ID (e.g., EMP0418):  EMP0040
Enter chat file path (optional, press Enter to skip):  /kaggle/input/chat-history/chat_history.json


Generating HR summary for employee EMP0040
Loading Mistral model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Report saved to reports/hr_report_EMP0040_20250407_140420.json
{
  "status": "success",
  "employee_id": "EMP0040",
  "report_file": "reports/hr_report_EMP0040_20250407_140420.json"
}


In [26]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from kaggle_secrets import UserSecretsClient

def get_hf_token():
    """Return the value of the Kaggle secret named ``HF_TOKEN``.

    Note: this repeats the definition from an earlier cell and rebinds the
    same name when the cell runs.
    """
    return UserSecretsClient().get_secret("HF_TOKEN")

def download_and_save_model(model_name="mistralai/Mistral-7B-Instruct-v0.2", output_dir="saved_model", use_token=True):
    """Fetch a model and tokenizer from Hugging Face and store both on disk.

    The model is loaded with 4-bit NF4 quantization and written to
    ``<output_dir>/model``; the tokenizer is written to
    ``<output_dir>/tokenizer``.

    Args:
        model_name: Hugging Face model identifier.
        output_dir: Directory that receives the two sub-directories.
        use_token: When true, authenticate with the token from ``get_hf_token``.

    Returns:
        ``(model, tokenizer, output_dir)``.
    """
    print(f"Downloading model {model_name}...")

    os.makedirs(output_dir, exist_ok=True)

    access_token = None
    if use_token:
        access_token = get_hf_token()

    # 4-bit NF4 weights with double quantization; matmuls run in float16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    print("Loading Mistral model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization,
        device_map="auto",
        trust_remote_code=True,
        token=access_token,
    )

    model_path = os.path.join(output_dir, "model")
    os.makedirs(model_path, exist_ok=True)
    model.save_pretrained(model_path)
    print(f"Model saved to {model_path}")

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        padding_side="left",
        token=access_token,
    )
    # The EOS token doubles as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    tokenizer_path = os.path.join(output_dir, "tokenizer")
    os.makedirs(tokenizer_path, exist_ok=True)
    tokenizer.save_pretrained(tokenizer_path)
    print(f"Tokenizer saved to {tokenizer_path}")

    print(f"Model and tokenizer saved to {output_dir}")
    return model, tokenizer, output_dir

def load_saved_model(model_dir="saved_model"):
    """Load the model and tokenizer stored under ``model_dir``, downloading them if absent.

    Args:
        model_dir: Directory expected to contain ``model`` and ``tokenizer``
            sub-directories.

    Returns:
        ``(model, tokenizer)``. When either sub-directory is missing, the pair
        comes from ``download_and_save_model(output_dir=model_dir)``.
    """
    model_path = os.path.join(model_dir, "model")
    tokenizer_path = os.path.join(model_dir, "tokenizer")

    both_present = os.path.exists(model_path) and os.path.exists(tokenizer_path)
    if not both_present:
        print(f"Model or tokenizer not found in {model_dir}. Downloading...")
        model, tokenizer, _ = download_and_save_model(output_dir=model_dir)
        return model, tokenizer

    print(f"Loading model from {model_path}...")

    # Same 4-bit NF4 / float16 settings used when the model was saved.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=quantization,
        device_map="auto",
        trust_remote_code=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_path,
        trust_remote_code=True,
        padding_side="left",
    )
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# Usage examples:
# Option 1: Download, save and keep the model and tokenizer objects
# model, tokenizer, saved_dir = download_and_save_model()

# Option 2: Only save to disk, discarding the returned objects
# (same call as option 1; the model is still loaded into memory while saving)
# _, _, saved_dir = download_and_save_model()

# Option 3: Load a previously saved model (downloads first if it is missing)
# model, tokenizer = load_saved_model("saved_model")

# Runs option 1 with the defaults: writes to ./saved_model and binds the
# notebook-level names model, tokenizer and saved_dir.
model, tokenizer, saved_dir = download_and_save_model()

Downloading model mistralai/Mistral-7B-Instruct-v0.2...
Loading Mistral model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model saved to saved_model/model
Tokenizer saved to saved_model/tokenizer
Model and tokenizer saved to saved_model


In [27]:
import shutil

# Pack the saved model directory into <output_zip>.zip in the working directory;
# the archive path returned by make_archive is the cell's displayed value.
output_zip = 'modelsumz'
shutil.make_archive(
    base_name=output_zip,
    format='zip',
    root_dir='/kaggle/working/saved_model',
)

'/kaggle/working/modelsumz.zip'

In [28]:
from IPython.display import FileLink
# Render a link to the archive created by the previous cell.
FileLink('/kaggle/working/modelsumz.zip')