In [None]:
!pip install sentence_transformers transformers accelerate bitsandbytes datasets openpyxl pandas textstat

In [None]:
import json
import os
from typing import List, Dict

import pandas as pd
from pydantic import BaseModel, Field, validator
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Initialize the LLaMA model and tokenizer.
model_name = "meta-llama/Llama-2-7b-chat-hf"

# SECURITY: the access token must never be hard-coded in source. It is read
# from the HF_TOKEN environment variable instead. Any token that was ever
# committed to this file should be considered exposed and revoked.
# NOTE(review): when HF_TOKEN is unset this passes None; presumably the
# library then falls back to a cached `huggingface-cli login` -- confirm.
access_token = os.environ.get("HF_TOKEN")

# Load the causal LM in 4-bit with automatic device placement, plus its fast
# tokenizer, and wrap both in a text-generation pipeline used by generate_text.
# NOTE(review): recent transformers releases prefer `token=` over
# `use_auth_token=`; kept as-is to match the installed version -- confirm.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True, use_auth_token=access_token)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, use_auth_token=access_token)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Define Pydantic models for validation
class ExcerptCode(BaseModel):
    """A transcript excerpt paired with the short code assigned to it."""

    excerpt: str = Field(description="Relevant excerpt from the transcript")
    code: str = Field(description="Code generated that best represents the excerpts identified")

    @validator("code")
    def code_is_not_long(cls, value):
        """Reject codes whose whitespace-separated word count is not 2 to 5."""
        word_count = len(value.split())
        if not 2 <= word_count <= 5:
            raise ValueError(
                "Each code must be between two to five words long. "
                f"The code '{value}' is {word_count} words long."
            )
        return value

class ExcerptCodes(BaseModel):
    """Container model holding a list of ExcerptCode entries."""

    excerpt_code: List[ExcerptCode] = Field(
        description="List of excerpt and code generated",
    )

class SubTheme(BaseModel):
    """A single sub-theme, stored under the `sub_theme` field."""

    sub_theme: str = Field(
        description="Sub-theme identified under a main theme",
    )

class Theme(BaseModel):
    """A main theme together with its sub-themes and the codes filed under it."""

    theme: str = Field(
        description="Main theme that was identified",
    )
    sub_themes: List[SubTheme] = Field(
        description="List of sub-themes under the main theme",
    )
    codes: List[str] = Field(
        description="List of codes categorized by theme identified",
    )

class Themes(BaseModel):
    """Top-level model: the list of themes, capped at six entries."""

    themes: List[Theme] = Field(description="List of themes identified")

    @validator("themes")
    def themes_is_not_long(cls, value):
        """Return the list unchanged unless it holds more than six themes."""
        theme_count = len(value)
        if theme_count <= 6:
            return value
        raise ValueError(f"The number of themes must not exceed 6. {theme_count} were generated.")

def _sub_theme_name(entry):
    """Return the sub-theme text from a string, a dict, or an attribute-style object."""
    if isinstance(entry, str):
        return entry
    if isinstance(entry, dict):
        return entry["sub_theme"]
    return entry.sub_theme


def _clean_themes_data(themes):
    """Normalize a parsed themes payload in place and return it.

    Args:
        themes: Mapping with a "themes" key holding a list of theme dicts.
            Each theme dict must have "theme" and "codes" and may have
            "sub_themes". Sub-theme entries may be plain strings,
            ``{"sub_theme": ...}`` dicts (the shape JSON decoding yields), or
            objects exposing a ``sub_theme`` attribute.

    Returns:
        The same ``themes`` mapping, with ``themes["themes"]`` replaced by a
        list of ``{"theme", "sub_themes", "codes"}`` dicts in which
        underscores in codes are replaced by spaces, duplicate codes are
        dropped, and sub-themes are flattened to plain strings.

    Raises:
        KeyError: If a required key is missing.
    """
    cleaned_themes = []
    for theme_data in themes["themes"]:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is the same on every run (a set would not guarantee that).
        cleaned_codes = list(
            dict.fromkeys(code.replace("_", " ") for code in theme_data["codes"])
        )
        # A missing or null "sub_themes" is treated as "no sub-themes".
        raw_sub_themes = theme_data.get("sub_themes") or []
        cleaned_themes.append({
            "theme": theme_data["theme"],
            "sub_themes": [_sub_theme_name(entry) for entry in raw_sub_themes],
            "codes": cleaned_codes,
        })
    themes["themes"] = cleaned_themes
    return themes

def _clean_codes_data(codes):
    """Turn a JSON object string of excerpt -> code into a list of records.

    Each key/value pair becomes ``{"excerpt": key, "code": value}``, with
    underscores in the code replaced by spaces.
    """
    parsed = json.loads(codes)
    records = []
    for excerpt, code in parsed.items():
        records.append({"excerpt": excerpt, "code": code.replace("_", " ")})
    return records

def parse_codes(_codes, _themes):
    """Build a one-row-per-theme table from JSON-encoded codes and themes.

    Args:
        _codes: JSON string. Either an object mapping code -> excerpt, or a
            list of ``{"excerpt": ..., "code": ...}`` records (the shape
            ``process_transcript`` returns).
        _themes: JSON string of the form ``{"themes": [...]}``. Each theme
            has "theme" and "codes" and may have "sub_themes", whose entries
            are strings or ``{"sub_theme": ...}`` objects.

    Returns:
        pandas.DataFrame with columns "Theme", "Sub-Themes", "Codes" and
        "Excerpts from transcript". Multi-valued cells are joined with ", ".
        Duplicates are dropped and first-seen order is kept, so the output
        is identical on every run.

    Raises:
        json.JSONDecodeError: If either argument is not valid JSON.
        KeyError: If a required key is missing.
    """
    codes_data = json.loads(_codes)
    themes_data = json.loads(_themes)

    # Index excerpts by code, whichever of the two accepted shapes was given.
    excerpts_by_code = {}
    if isinstance(codes_data, dict):
        for code, excerpt in codes_data.items():
            excerpts_by_code.setdefault(code, []).append(excerpt)
    else:
        for record in codes_data:
            excerpts_by_code.setdefault(record["code"], []).append(record["excerpt"])

    table = {"Theme": [], "Sub-Themes": [], "Codes": [], "Excerpts from transcript": []}
    for theme_data in themes_data["themes"]:
        sub_theme_names = [
            entry["sub_theme"] if isinstance(entry, dict) else entry
            for entry in theme_data.get("sub_themes") or []
        ]
        # dict.fromkeys removes duplicates but, unlike set(), keeps order.
        theme_codes = list(dict.fromkeys(theme_data["codes"]))
        theme_excerpts = dict.fromkeys(
            excerpt
            for code in theme_codes
            for excerpt in excerpts_by_code.get(code, [])
        )

        table["Theme"].append(theme_data["theme"])
        table["Sub-Themes"].append(", ".join(sub_theme_names))
        table["Codes"].append(", ".join(code.replace("_", " ") for code in theme_codes))
        table["Excerpts from transcript"].append(", ".join(theme_excerpts))

    return pd.DataFrame(table)

def generate_text(prompt: str, max_new_tokens: int = 100, temperature: float = 0.7) -> str:
    """Generate a completion for ``prompt`` with the module-level pipeline.

    Args:
        prompt: Text sent to the text-generation pipeline.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature forwarded to the pipeline.

    Returns:
        The generated continuation only, with surrounding whitespace removed.
    """
    result = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        num_return_sequences=1,
        # By default the pipeline echoes the prompt in front of the new text.
        # Callers here want (and later JSON-parse) only the completion.
        return_full_text=False,
    )
    return result[0]['generated_text'].strip()

def process_transcript(file_path: str):
    """Summarize a transcript file and extract codes and themes from it.

    Reads the transcript, then asks the model for a summary, for codes, and
    for themes derived from those codes. The codes and themes replies are
    parsed as JSON and normalized with ``_clean_codes_data`` and
    ``_clean_themes_data``.

    Args:
        file_path: Path to the plain-text transcript.

    Returns:
        A ``(summary, codes, themes)`` tuple of strings. On success ``codes``
        and ``themes`` are JSON strings. If the replies cannot be parsed,
        they are the placeholder strings "Error in codes" / "Error in themes"
        (invalid JSON) or "Error processing codes" / "Error processing themes"
        (any other failure), and the problem is printed.
    """
    # Read the transcript from the .txt file. The encoding is explicit so the
    # result does not depend on the platform's default locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        transcript = file.read()

    # Generate summary (generate_text already strips the result)
    summary_prompt = f"Summarize the following interview transcript:\n{transcript}\nSummary:"
    summary = generate_text(summary_prompt, max_new_tokens=200, temperature=0.5)

    # Generate codes. The reply is parsed with json.loads below, so the prompt
    # must ask for JSON in the excerpt -> code shape _clean_codes_data reads.
    codes_prompt = f"""
    Review the following interview transcript and identify key phrases or codes that represent significant parts of the conversation. Each code should be a short phrase or term.
    Respond with a single JSON object only, mapping each relevant excerpt from the transcript to its code, for example {{"excerpt text": "short code"}}.

    Transcript:
    {transcript}

    Codes:"""
    codes = generate_text(codes_prompt, max_new_tokens=300, temperature=0.7)

    # Generate themes and sub-themes, again as JSON, in the
    # {"themes": [...]} shape that _clean_themes_data reads.
    themes_prompt = f"""
    Based on the following codes, identify broad themes that categorize the key points from the interview transcript. For each theme, identify relevant sub-themes.
    Respond with a single JSON object only, in the form {{"themes": [{{"theme": "...", "sub_themes": ["..."], "codes": ["..."]}}]}}.

    Codes:
    {codes}

    Themes and Sub-Themes:"""
    themes = generate_text(themes_prompt, max_new_tokens=300, temperature=0.7)

    # Debugging: Print outputs
    print("Generated Codes:\n", codes)
    print("Generated Themes:\n", themes)

    # Convert the themes and codes into the required JSON format. This is
    # best-effort: the summary is still returned when parsing fails.
    try:
        cleaned_codes = _clean_codes_data(codes)
        cleaned_themes = _clean_themes_data(json.loads(themes))
        return summary, json.dumps(cleaned_codes), json.dumps(cleaned_themes)
    except json.JSONDecodeError as e:
        print("Error decoding JSON:", e)
        print("Codes data:", codes)
        print("Themes data:", themes)
        return summary, "Error in codes", "Error in themes"
    except Exception as e:
        print("Unexpected error:", e)
        return summary, "Error processing codes", "Error processing themes"

def save_to_excel(summary: str, codes: str, themes: str, output_file: str):
    """Write the summary, codes and themes as a single row of an Excel file.

    The sheet has the columns "Summary", "Codes" and "Themes" and no index
    column.
    """
    single_row = {"Summary": [summary], "Codes": [codes], "Themes": [themes]}
    pd.DataFrame(single_row).to_excel(output_file, index=False)

# Example usage: analyse one transcript and export the result.
input_file = "/content/TA_DATA.txt"  # Replace with your input file path
output_file = "analysis_output.xlsx"  # Output Excel file path

# Run the analysis, then write the three results to the Excel file.
summary, codes, themes = process_transcript(input_file)
save_to_excel(
    summary=summary,
    codes=codes,
    themes=themes,
    output_file=output_file,
)

print("Analysis saved to", output_file)
