<a href="https://colab.research.google.com/github/TS-Group5/Capstone_Project/blob/main/resume_summery_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install PyPDF2 python-docx torch transformers tqdm language-tool-python

In [None]:
!huggingface-cli login

In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    PegasusTokenizer,
    PegasusForConditionalGeneration,
    T5Tokenizer,
    T5ForConditionalGeneration,
    GPT2Tokenizer,
    GPT2LMHeadModel,
)
from tqdm import tqdm
import PyPDF2
from docx import Document
import pathlib
import language_tool_python

# Constants
# Device the model is moved to and that training batches are sent to; CPU when CUDA is unavailable.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Token limit applied to "prompt + resume text" when generating a summary in main().
MAX_LEN_INPUT = 512
MAX_NEW_TOKENS = 200  # Increased token generation
# Beam width passed to model.generate() by summarize_text_with_prompt().
NUM_BEAMS = 4
# Directory that fine_tune_model_with_dataset() creates and writes checkpoints into.
MODEL_SAVE_DIR = "./fine_tuned_resume_models/"

In [None]:
def load_resume_dataset(filepath):
    """
    Read the resume CSV and derive one training target per row.

    The CSV must provide a ``Category`` and a ``Resume`` column. Missing cells
    are treated as empty strings, resume text is whitespace-normalised with
    ``preprocess_resume_text``, and each target is the fixed sentence
    "Professional summary for a <category> role".

    Args:
        filepath (str): Path to the resume dataset CSV

    Returns:
        tuple: ``(resume_texts, summaries, categories)`` as three parallel
        lists. Any failure (unreadable file, missing column, ...) is printed
        and reported as three empty lists instead of being raised.
    """
    expected_columns = ['Category', 'Resume']

    try:
        frame = pd.read_csv(filepath)

        # Refuse to continue when either mandatory column is absent.
        if any(name not in frame.columns for name in expected_columns):
            raise ValueError(f"CSV must contain columns: {expected_columns}")

        cleaned_resumes = list(map(preprocess_resume_text, frame['Resume'].fillna('')))
        category_labels = frame['Category'].fillna('').tolist()

        # One templated target sentence per row, keyed on the row's category.
        target_summaries = []
        for label in category_labels:
            target_summaries.append(f"Professional summary for a {label} role")

        return cleaned_resumes, target_summaries, category_labels

    except Exception as e:
        print(f"Error loading dataset: {e}")
        return [], [], []

In [None]:
def preprocess_resume_text(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    The value is converted with ``str()`` first, so non-string inputs are
    accepted; leading and trailing whitespace is removed. No other characters
    are altered.
    """
    words = str(text).split()
    # split() already discards newlines, so joining on a space is the whole job.
    return ' '.join(words)

In [None]:
def split_dataset(texts, summaries, categories, train_ratio=0.8):
    """
    Shuffle the three parallel lists together and cut them into train/validation parts.

    The shuffle is reproducible: the global torch RNG is re-seeded with 42 on
    every call (note that this also affects any later torch randomness).

    Args:
        texts (list): List of resume texts
        summaries (list): List of corresponding summaries
        categories (list): List of resume categories
        train_ratio (float): Ratio of training data (default: 0.8)

    Returns:
        tuple: ``((train_texts, train_summaries, train_categories),
        (val_texts, val_summaries, val_categories))``
    """
    sample_count = len(texts)
    cutoff = int(sample_count * train_ratio)

    # Fixed seed so the same rows land in the same split on every run.
    torch.manual_seed(42)
    order = torch.randperm(sample_count).tolist()

    def _gather(positions):
        # Pull the same rows out of all three lists so they stay aligned.
        return (
            [texts[pos] for pos in positions],
            [summaries[pos] for pos in positions],
            [categories[pos] for pos in positions],
        )

    return _gather(order[:cutoff]), _gather(order[cutoff:])

In [None]:
def choose_model():
    """Show the model menu, read the user's pick, and return what is needed to load it.

    Returns:
        tuple: ``(model_name, tokenizer_class, model_class)``. Any input other
        than "1", "3" or "4" (including an empty answer) selects BART.
    """
    menu_lines = (
        "Choose a summarization model:",
        "1. Pegasus (google/pegasus-xsum)",
        "2. BART (facebook/bart-large-cnn)",
        "3. T5 (t5-small)",
        "4. GPT-2 (gpt2)",
    )
    for line in menu_lines:
        print(line)
    choice = input("Enter the number of your choice (default: 2): ").strip()

    # NOTE(review): option 4 is a decoder-only model; confirm it is compatible
    # with the seq2seq-style dataset/training used elsewhere in this notebook.
    catalog = {
        "1": ("google/pegasus-xsum", PegasusTokenizer, PegasusForConditionalGeneration),
        "3": ("t5-small", T5Tokenizer, T5ForConditionalGeneration),
        "4": ("gpt2", GPT2Tokenizer, GPT2LMHeadModel),
    }
    bart_default = ("facebook/bart-large-cnn", BartTokenizer, BartForConditionalGeneration)

    return catalog.get(choice, bart_default)


In [None]:
# Generate Summary with Fine-tuned Model
def summarize_text_with_prompt(model, tokenizer, text, prompt, max_len_input=512, max_new_tokens=150, device="cuda", num_beams=None):
    """Generate a summary of ``text`` with ``model``, steering it with ``prompt``.

    The model input is ``prompt``, a blank line, then ``text``; it is truncated
    to ``max_len_input`` tokens before beam-search generation.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``encode()``, ``decode()``,
            ``pad_token_id`` and ``eos_token_id``.
        text (str): Resume text to summarise.
        prompt (str): Instruction placed in front of the text.
        max_len_input (int): Maximum number of input tokens.
        max_new_tokens (int): Maximum number of generated tokens.
        device (str): Device the input tensors are moved to.
        num_beams (int | None): Beam width; ``None`` uses the notebook-wide
            ``NUM_BEAMS`` constant.

    Returns:
        str: The decoded output with special tokens removed.
    """
    if num_beams is None:
        num_beams = NUM_BEAMS

    model.eval()

    # Prepare input
    full_input = f"{prompt}\n\n{text}"
    input_ids = tokenizer.encode(full_input, max_length=max_len_input, truncation=True, return_tensors="pt").to(device)

    # A single unpadded sequence: every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    # Compare against None explicitly: a pad id of 0 is valid (e.g. T5) and
    # must not be mistaken for "no pad token".
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate summary
    summary_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        early_stopping=True,
        pad_token_id=pad_token_id,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Optional grammar correction
def correct_grammar(summary):
    """Post-process the generated summary for grammatical correctness.

    Grammar correction is best-effort: if ``language_tool_python`` is missing
    or fails for any reason, the problem is printed and the text is returned
    unchanged.

    Args:
        summary (str): Text to correct.

    Returns:
        str: The corrected text, or ``summary`` itself when it is empty or
        correction is not possible.
    """
    # Nothing to correct; skip starting a LanguageTool instance.
    if not summary or not summary.strip():
        return summary

    tool = None
    try:
        import language_tool_python
        tool = language_tool_python.LanguageTool("en-US")
        matches = tool.check(summary)
        return language_tool_python.utils.correct(summary, matches)
    except Exception as e:
        print(f"Grammar correction failed: {e}")
        return summary
    finally:
        # Each call creates its own LanguageTool instance; release it so
        # repeated calls do not accumulate open tools.
        if tool is not None:
            try:
                tool.close()
            except Exception as close_error:
                print(f"Could not shut down LanguageTool cleanly: {close_error}")

In [None]:
class ResumeDataset(Dataset):
    """Dataset for fine-tuning summarization models.

    Each item pairs a tokenized resume text with its tokenized target summary.
    Both are padded/truncated to fixed lengths so that the default DataLoader
    collation can stack them.
    """
    def __init__(self, texts, summaries, tokenizer, max_len_input, max_len_output):
        """
        Args:
            texts (list): Source texts.
            summaries (list): Target summaries, parallel to ``texts``.
            tokenizer: Callable tokenizer returning ``input_ids`` and
                ``attention_mask`` tensors.
            max_len_input (int): Fixed token length of the encoded text.
            max_len_output (int): Fixed token length of the encoded summary.
        """
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_len_input = max_len_input
        self.max_len_output = max_len_output

    def __len__(self):
        """Number of (text, summary) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for pair ``idx``.

        Padding positions in ``labels`` are set to -100, the index the
        Hugging Face seq2seq losses ignore, so the model is not trained to
        predict padding.
        """
        input_encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len_input,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        target_encoding = self.tokenizer(
            self.summaries[idx],
            max_length=self.max_len_output,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Mask by attention mask rather than by pad id, so a tokenizer whose
        # pad token equals another real token does not lose that token.
        labels = target_encoding["input_ids"].masked_fill(
            target_encoding["attention_mask"] == 0, -100
        )

        # squeeze(0) removes only the batch axis added by return_tensors="pt";
        # a bare squeeze() would also collapse a length-1 sequence axis.
        return {
            "input_ids": input_encoding["input_ids"].squeeze(0),
            "attention_mask": input_encoding["attention_mask"].squeeze(0),
            "labels": labels.squeeze(0),
        }

In [None]:
def create_dynamic_prompts(categories):
    """
    Build the four prompt variants offered for every distinct resume category.

    Args:
        categories (list): List of resume categories (duplicates are collapsed)

    Returns:
        dict: ``{category: {"comprehensive": ..., "experience_focused": ...,
        "skills_and_impact": ..., "strategic_profile": ...}}`` where each value
        is a multi-line instruction mentioning the category
    """
    def _prompt_set(category):
        # The continuation lines of each literal are part of the prompt text,
        # including their leading spaces.
        return {
            "comprehensive": f"""Craft a comprehensive professional summary for a {category} professional that:
            - Highlights the most significant career achievements
            - Details technical expertise and core competencies
            - Emphasizes quantifiable impact and key performance metrics
            - Showcases unique value proposition in the {category} domain
            - Provides a strategic overview of professional growth and potential""",

            "experience_focused": f"""Generate an experience-driven summary for a {category} professional that:
            - Chronologically highlights career progression
            - Describes pivotal roles and transformative projects
            - Quantifies professional accomplishments with specific metrics
            - Illustrates technical skills and industry expertise
            - Demonstrates leadership and innovation in the {category} field""",

            "skills_and_impact": f"""Create a skills-centric summary for a {category} professional that:
            - Lists advanced technical and soft skills
            - Connects skills to tangible business outcomes
            - Highlights certifications and specialized training
            - Demonstrates versatility and adaptability
            - Shows potential for driving organizational success in {category} roles""",

            "strategic_profile": f"""Develop a strategic professional profile for a {category} expert that:
            - Provides a holistic view of professional capabilities
            - Connects past experiences to future potential
            - Highlights innovative approaches and problem-solving skills
            - Demonstrates industry thought leadership
            - Articulates unique professional narrative in the {category} space"""
        }

    return {category: _prompt_set(category) for category in set(categories)}

In [None]:
def fine_tune_model_with_dataset(
    model,
    tokenizer,
    train_texts,
    train_summaries,
    val_texts=None,
    val_summaries=None,
    max_len_input=512,
    max_len_output=150,
    epochs=3,
    batch_size=4,
    model_name="resume_summary_model",
    stop_loss_threshold=0.5
):
    """
    Fine-tune ``model`` on (text, summary) pairs with checkpointing and optional validation.

    After every epoch the average training loss is printed. When validation
    data is supplied, the average validation loss is printed as well and the
    weights with the lowest validation loss so far are written to
    ``<MODEL_SAVE_DIR>/<model_name>_best.pt``. The weights at the end of
    training are always written to ``<MODEL_SAVE_DIR>/<model_name>_final.pt``.

    Args:
        model: Model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with ``.loss``.
        tokenizer: Tokenizer handed to ``ResumeDataset``.
        train_texts (list): Training source texts (must not be empty).
        train_summaries (list): Training targets, parallel to ``train_texts``.
        val_texts (list | None): Optional validation source texts.
        val_summaries (list | None): Optional validation targets.
        max_len_input (int): Token length of encoded source texts.
        max_len_output (int): Token length of encoded targets.
        epochs (int): Maximum number of passes over the training data.
        batch_size (int): Batch size for both loaders.
        model_name (str): File-name stem for the saved checkpoints.
        stop_loss_threshold (float | None): Training stops after the first
            epoch whose monitored loss (validation loss when validation data
            is given, otherwise training loss) is at or below this value.
            ``None`` disables early stopping.

    Returns:
        The same ``model`` object, fine-tuned in place.

    Raises:
        ValueError: If the training set is empty, or texts and summaries of a
            split differ in length.
    """
    if len(train_texts) == 0:
        raise ValueError("train_texts must contain at least one example.")
    if len(train_texts) != len(train_summaries):
        raise ValueError(
            "train_texts and train_summaries must have the same length "
            f"(got {len(train_texts)} and {len(train_summaries)})."
        )

    # Ensure model save directory exists
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    # Send batches wherever the model's weights live, so the two cannot disagree.
    device = next(model.parameters()).device

    # Create datasets
    train_dataset = ResumeDataset(train_texts, train_summaries, tokenizer, max_len_input, max_len_output)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Optional validation dataset
    val_loader = None
    if val_texts and val_summaries:
        if len(val_texts) != len(val_summaries):
            raise ValueError(
                "val_texts and val_summaries must have the same length "
                f"(got {len(val_texts)} and {len(val_summaries)})."
            )
        val_dataset = ResumeDataset(val_texts, val_summaries, tokenizer, max_len_input, max_len_output)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Training setup
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        # Training loop
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()

            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device),
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        monitored_loss = avg_train_loss

        # Validation if loader exists
        if val_loader:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for batch in val_loader:
                    outputs = model(
                        input_ids=batch["input_ids"].to(device),
                        attention_mask=batch["attention_mask"].to(device),
                        labels=batch["labels"].to(device),
                    )
                    val_loss += outputs.loss.item()

            val_loss /= len(val_loader)
            monitored_loss = val_loss
            print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, "
                  f"Validation Loss = {val_loss:.4f}")

            # Save the best model based on validation loss
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                model_save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_best.pt")
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': best_val_loss
                }, model_save_path)
                print(f"Saved best model to {model_save_path}")
        else:
            print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}")

        # Honour the stop threshold instead of silently ignoring the argument.
        if stop_loss_threshold is not None and monitored_loss <= stop_loss_threshold:
            print(f"Loss {monitored_loss:.4f} reached the stop threshold "
                  f"{stop_loss_threshold}; stopping after epoch {epoch+1}.")
            break

    # Save final model
    final_model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_final.pt")
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, final_model_path)
    print(f"Final model saved to {final_model_path}")

    print("Fine-tuning complete!")
    return model

In [None]:
def _extract_resume_text(resume_path):
    """Return the text of a PDF, Word or plain-text resume, or None if it cannot be read."""
    file_extension = os.path.splitext(resume_path)[1].lower()

    try:
        if file_extension == '.pdf':
            with open(resume_path, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # extract_text() may yield nothing for a page; treat that as
                # an empty page instead of failing the whole document.
                page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
            return "\n".join(page_texts).strip()
        elif file_extension in ['.docx', '.doc']:
            # NOTE(review): legacy .doc files are attempted with the same
            # reader as .docx; confirm python-docx can actually open them.
            doc = Document(resume_path)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
        else:
            with open(resume_path, 'r', encoding='utf-8') as file:
                return file.read().strip()
    except Exception as e:
        print(f"Error reading file: {e}")
        return None


def _ask_for_option(option_count):
    """Prompt until the user enters a whole number in 1..option_count; return it zero-based."""
    while True:
        raw_choice = input("Enter the number of your choice: ").strip()
        try:
            picked = int(raw_choice)
        except ValueError:
            picked = 0
        if 1 <= picked <= option_count:
            return picked - 1
        print(f"Please enter a number between 1 and {option_count}.")


def main():
    """Interactive end-to-end run of the resume summarizer.

    Steps: load the dataset CSV, split it, let the user pick a model,
    fine-tune it, read one resume file, let the user pick a prompt style,
    generate and grammar-correct a summary, save it next to the resume, and
    optionally fine-tune again with the new (resume, summary) pair added.
    All paths and choices are read with ``input()``.
    """
    # Kaggle Dataset Path
    kaggle_dataset_path = input("Enter path to resume dataset CSV: ").strip()

    # Load Dataset
    resume_texts, summaries, categories = load_resume_dataset(kaggle_dataset_path)

    if not resume_texts:
        print("No valid resume data found.")
        return

    # Create dynamic prompts based on categories
    dynamic_prompts = create_dynamic_prompts(categories)

    # Split Dataset (the per-split category lists are not needed below)
    (train_texts, train_summaries, _), (val_texts, val_summaries, _) = split_dataset(
        resume_texts, summaries, categories
    )

    # Choose Model
    model_name, tokenizer_class, model_class = choose_model()
    if model_name == 't5-small':
        tokenizer = tokenizer_class.from_pretrained(model_name, legacy=False)
    else:
        tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name).to(DEVICE)

    # Hub names contain "/", which must not end up in checkpoint file names.
    checkpoint_name = model_name.replace('/', '_')

    # Fine-tune Model
    fine_tuned_model = fine_tune_model_with_dataset(
        model,
        tokenizer,
        train_texts,
        train_summaries,
        val_texts,
        val_summaries,
        model_name=checkpoint_name,
        stop_loss_threshold=0.5
    )

    # Resume Path for Summary Generation
    resume_path = input("Enter the path to the resume file to generate summary: ").strip()
    resume_category = input("Enter the category for this resume (e.g., Java Developer, Data Scientist): ").strip()

    resume_text = _extract_resume_text(resume_path)

    if not resume_text or not resume_text.strip():
        print("Failed to extract text or the file is empty.")
        return

    # Prompt selection
    print("\nSelect summary format:")
    category_prompts = dynamic_prompts.get(resume_category, dynamic_prompts.get('default', {}))

    # Categories that were not in the dataset get generic one-line prompts.
    if not category_prompts:
        category_prompts = {
            "comprehensive": f"Generate a comprehensive summary for a {resume_category} professional.",
            "experience_focused": f"Create an experience-driven summary for a {resume_category} role.",
            "skills_and_impact": f"Develop a skills-centric profile for a {resume_category} professional.",
            "strategic_profile": f"Craft a strategic overview for a {resume_category} expert."
        }

    for i, (key, desc) in enumerate(category_prompts.items(), 1):
        print(f"{i}. {desc}")

    # Validated selection: rejects non-numeric input and out-of-range numbers
    # (a raw "0" would otherwise wrap around to the last entry).
    choice = _ask_for_option(len(category_prompts))
    prompt_key = list(category_prompts.keys())[choice]
    prompt = category_prompts[prompt_key]

    # Generate summary
    summary = summarize_text_with_prompt(
        fine_tuned_model,
        tokenizer,
        resume_text,
        prompt,
        MAX_LEN_INPUT,
        MAX_NEW_TOKENS,
        DEVICE
    )

    # Grammar correction
    summary = correct_grammar(summary)
    print(f"\nGenerated Summary ({prompt_key.capitalize()}):\n", summary)

    # Save Summary next to the resume. splitext() only strips a real file
    # extension, and path separators in the category are neutralised so the
    # category cannot redirect the output into another directory.
    output_stem = os.path.splitext(resume_path)[0]
    safe_category = resume_category.replace('/', '_').replace('\\', '_')
    output_path = f"{output_stem}_{safe_category}_{prompt_key}_summary.txt"
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(summary)
    print(f"\nSummary saved to: {output_path}")

    # Retrain with new resume data
    retrain_choice = input("Do you want to retrain the model with this new resume? (yes/no): ").strip().lower()
    if retrain_choice == 'yes':
        # Prepare new training data
        new_train_texts = train_texts + [resume_text]
        new_train_summaries = train_summaries + [summary]

        # Fine-tune model again with new data
        fine_tuned_model = fine_tune_model_with_dataset(
            fine_tuned_model,
            tokenizer,
            new_train_texts,
            new_train_summaries,
            val_texts,
            val_summaries,
            model_name=f"{checkpoint_name}_updated"
        )

if __name__ == "__main__":
    main()

## **Text-to-Audio** - Working



In [None]:
!pip -q install torch torchaudio transformers TTS soundfile speechbrain

In [None]:
#!pip install git+https://github.com/speechbrain/speechbrain
!pip -q install speechbrain

In [None]:
# Text-to-speech demo: text -> mel-spectrogram (Tacotron2) -> waveform (HiFi-GAN) -> WAV file.
# NOTE(review): newer SpeechBrain releases are understood to expose these classes under
# `speechbrain.inference`; confirm this import path against the installed version.
from speechbrain.pretrained import Tacotron2, HIFIGAN
import torchaudio

# Load pre-trained TTS model
# `source` names the pretrained model to load; `savedir` is the local folder given to the loader
# (presumably where the downloaded files are kept — TODO confirm).
tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts")
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")

# Input text
text = "Hello, this is a text-to-audio conversion example using machine learning."

# Generate mel-spectrogram from text
# encode_text returns three values; only the first (the mel output) is used here.
mel_output, _, _ = tacotron2.encode_text(text)

# Generate audio waveform from mel-spectrogram
waveforms = hifi_gan.decode_batch(mel_output)

# Save to a file
# squeeze(1) drops dimension 1 of the decoded batch (presumably a singleton channel axis — TODO confirm);
# 22050 is passed as the sample rate of the written file.
torchaudio.save("output_audio.wav", waveforms.squeeze(1), 22050)

print("Audio saved to output_audio.wav")


# **Using Mozilla TTS** - In progress

In [None]:
!pip -q install TTS

In [None]:
# Interactive text-to-speech demo using the `TTS` package: pick a speaker (if the model has
# several), type some text, and write the synthesized speech to output_audio.wav.
# The second import rebinds the name `TTS` from the package to the `TTS.api.TTS` class.
import TTS
from TTS.api import TTS

# Initialize the TTS model
try:
    #tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=True, gpu=False)
    # NOTE(review): this model name does not follow the type/language/dataset/model layout of the
    # commented-out name above — confirm it exists in the installed TTS model list.
    tts = TTS(model_name="tts_models/multispeaker/en/vctk", progress_bar=True, gpu=False)
    # Check if speakers are available
    speakers = tts.speakers if hasattr(tts, 'speakers') else []

    if speakers:
        print("Available speakers:", speakers)

        # Choose a speaker
        # An empty answer falls back to the first listed speaker; the typed name is not validated here.
        speaker = input("Enter the speaker name from the above list (or press Enter for default): ") or speakers[0]
    else:
        print("No multiple speakers available for this model.")
        speaker = None

    # Input text
    text = input("Enter the text to synthesize: ")

    # Generate speech
    # Same file name as the SpeechBrain cell writes, so running both overwrites the earlier output.
    output_path = "output_audio.wav"

    # Handle speaker parameter differently based on availability
    if speaker:
        tts.tts_to_file(text=text, speaker=speaker, file_path=output_path)
    else:
        tts.tts_to_file(text=text, file_path=output_path)

    print(f"Audio saved to {output_path}")

# Any failure above (model loading, bad speaker name, synthesis) ends up here and is only printed.
except Exception as e:
    print(f"An error occurred: {e}")
    print("Please check:")
    print("1. Is the TTS library installed correctly?")
    print("2. Are you using the latest version of the TTS library?")
    print("3. Is the model name correct?")

# **Using pyttsx3** - In progress

In [None]:
!pip -q install pyttsx3

In [None]:
import pyttsx3
import os

def save_audio(text, voice_gender='male', filename='output.mp3'):
    """Synthesize ``text`` with pyttsx3 and write the audio to ``filename``.

    Arguments are validated before the speech engine is started. Any file
    already present at ``filename`` is replaced.

    Args:
        text (str): Text to speak; empty or whitespace-only text is rejected.
        voice_gender (str): ``'male'`` or ``'female'``; selects a voice by its
            position in the engine's voice list.
        filename (str): Path of the audio file to write.

    Returns:
        None. Problems (invalid gender, empty text, voice index not available
        on this machine) are printed and the function returns without writing.
    """
    # NOTE(review): these positions in the engine's voice list are
    # machine-specific — confirm them against the voices installed locally.
    voice_index_by_gender = {'male': 17, 'female': 12}

    voice_index = voice_index_by_gender.get(voice_gender) if isinstance(voice_gender, str) else None
    if voice_index is None:
        print("Invalid voice gender specified. Use 'male' or 'female'.")
        return

    if not text or not text.strip():
        print("No text provided; nothing to synthesize.")
        return

    engine = pyttsx3.init()

    # Get available voices
    voices = engine.getProperty('voices')

    # Validate voice index
    if voice_index < 0 or voice_index >= len(voices):
        print(f"Voice index {voice_index} is out of range. Check available voices.")
        return

    selected_voice = voices[voice_index]
    engine.setProperty('voice', selected_voice.id)
    print(f"Selected voice: {selected_voice.name} (ID: {selected_voice.id})")

    # Adjust speech properties
    engine.setProperty('rate', 200)  # Speaking rate
    engine.setProperty('volume', 1.0)  # Volume (0.0 to 1.0)

    # Start from a clean slate so stale audio is never mistaken for new output.
    if os.path.exists(filename):
        os.remove(filename)

    # Write the whole text to the requested file. Writing fixed-size character
    # chunks to separate temp files left `filename` missing and cut words in half.
    engine.save_to_file(text, filename)
    engine.runAndWait()

    print(f"Final audio saved as {filename}")

# Example usage
# Note: this rebinds the notebook-level name `text` that earlier cells also assign.
text = "Hello! This is a text-to-speech conversion example to verify the correct voice and text handling."

# Save audio with male voice
save_audio(text, 'male', 'male_voice.mp3')

# Save audio with female voice
save_audio(text, 'female', 'female_voice.mp3')


# **Play the Generated Audio** - Not Required

In [None]:
!apt-get install libportaudio2 # install the portaudio dependency
!pip -q install sounddevice # install the sounddevice library

In [None]:
import librosa
import soundfile as sf

# Load the generated audio
# sr=None keeps the sample rate stored in the file instead of resampling on load.
y, sr = librosa.load("output_audio.wav", sr=None)

# Lower the pitch (-4 semitones for male-like voice)
# sr and n_steps are passed by keyword: recent librosa releases reject them
# as positional arguments, and the keyword form is accepted by older ones too.
y_lowered = librosa.effects.pitch_shift(y, sr=sr, n_steps=-4)

# Save the modified audio
sf.write("male_voice.wav", y_lowered, sr)

print("Modified audio saved as 'male_voice.wav'")