In [4]:
## Importing Necessary Libraries

import pandas as pd
import yt_dlp
import os
from pydub import AudioSegment
from moviepy.editor import VideoFileClip
import speech_recognition as sr
import time
import os
import speech_recognition as sr
from pydub import AudioSegment
from tqdm import tqdm

## Load CSV data Extract their audio (.wav)

In [2]:
# Load the CSV file (assuming it's in the same directory as the script)
file_path = 'D:\excel.csv'  # Ensure this matches your file name
print("Loading CSV file...")
excel_data = pd.read_csv(file_path)
urls = excel_data['UPSC Interview Video Link']

# Print the URLs to verify they are loaded correctly
print("URLs loaded from file:")
print(urls)

# Directory to save audio files
output_dir = r'output_audio_files'
os.makedirs(output_dir, exist_ok=True)
print(f"Audio directory created at: {output_dir}")

# Function to download audio in the best available format and convert to WAV
def download_and_convert_to_wav(url, output_name):
    try:
        print(f"Attempting to download audio from: {url}")
        ydl_opts = {
            'format': 'bestaudio/best',     # Download the best audio quality available
            'outtmpl': output_name + ".%(ext)s",  # Save with original extension
            'noplaylist': True              # Download only the single video
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=True)
            downloaded_file = ydl.prepare_filename(info_dict)  # Get the downloaded file name

        # Convert to WAV using FFmpeg via Pydub
        wav_path = f"{output_name}.wav"
        audio = AudioSegment.from_file(downloaded_file)  # Read the downloaded file
        audio.export(wav_path, format="wav")             # Convert to WAV
        print(f"Converted and saved as WAV: {wav_path}")

        # Clean up original downloaded file
        os.remove(downloaded_file)

    except Exception as e:
        print(f"Failed to download or convert {url}: {e}")

# Loop through each URL and download audio as WAV
for i, url in enumerate(urls):
    output_name = os.path.join(output_dir, f"audio_{i+1}")
    download_and_convert_to_wav(url, output_name)

print("Script completed.")

Loading CSV file...
URLs loaded from file:
0     https://www.youtube.com/watch?v=vQ_2x-lXaOY&li...
1     https://www.youtube.com/watch?v=ER13v7aSzN0&li...
2     https://www.youtube.com/watch?v=Z-j-mXXLIRk&li...
3     https://www.youtube.com/watch?v=FI8MKQ2RMiU&li...
4     https://www.youtube.com/watch?v=HdcfyH93TQE&li...
5     https://www.youtube.com/watch?v=T_RJUJXOdp8&li...
6     https://www.youtube.com/watch?v=cwIbUnSs4gc&li...
7     https://www.youtube.com/watch?v=QisNg2U_0Mw&li...
8     https://www.youtube.com/watch?v=3SO0A-W_SqM&li...
9     https://www.youtube.com/watch?v=8IDiAEIBqTE&li...
10    https://www.youtube.com/watch?v=NOOrUcAb-Fo&li...
11    https://www.youtube.com/watch?v=0Z-s8qo6hXg&li...
12    https://www.youtube.com/watch?v=UzXlVHtfXBQ&li...
13    https://www.youtube.com/watch?v=Drnob9EN0jg&li...
14    https://www.youtube.com/watch?v=d-WQraw9e94&li...
15    https://www.youtube.com/watch?v=-jbNtPc7NOw&li...
16    https://www.youtube.com/watch?v=LTYWLwmAfdk&li...
17   

## Convert Audio to Text (1 to 14)

In [9]:
import speech_recognition as sr
import logging
from pydub import AudioSegment
import os

# Set up logging
logging.basicConfig(level=logging.INFO)

def convert_to_pcm_mono(wav_file):
    """Convert audio file to mono PCM WAV format."""
    try:
        audio = AudioSegment.from_file(wav_file)
        audio = audio.set_channels(1)  # Convert to mono
        audio = audio.set_frame_rate(16000)  # Set sample rate to 16 kHz
        converted_file = "converted_audio_mono.wav"
        audio.export(converted_file, format="wav", codec="pcm_s16le")  # Ensure PCM format
        logging.info(f"Converted audio saved as: {converted_file}")
        return converted_file
    except Exception as e:
        logging.error(f"Error converting audio file: {e}")
        return None

def split_audio(wav_file, segment_length=60):
    """Split audio into segments of a specified length (in seconds)."""
    audio = AudioSegment.from_file(wav_file)
    segments = []
    for i in range(0, len(audio), segment_length * 1000):  # Convert to milliseconds
        segment = audio[i:i + segment_length * 1000]
        segment_file = f"segment_{i // 1000}.wav"
        segment.export(segment_file, format="wav", codec="pcm_s16le")
        segments.append(segment_file)
        logging.info(f"Exported segment: {segment_file} (Duration: {len(segment) / 1000:.2f} seconds)")
    return segments

def wav_to_text(wav_file):
    """Convert a WAV file to text using Google Speech Recognition."""
    r = sr.Recognizer()
    
    # Convert audio file to mono PCM format
    converted_file = convert_to_pcm_mono(wav_file)
    if converted_file is None:
        return None

    # Split audio into segments if longer than 60 seconds
    segments = split_audio(converted_file, segment_length=60)
    full_transcription = ""

    for segment in segments:
        with sr.AudioFile(segment) as source:
            audio_data = r.record(source)  # Read the entire audio file
            try:
                text = r.recognize_google(audio_data)
                full_transcription += text + " "
                logging.info(f"Transcribed segment: {segment}")
            except sr.UnknownValueError:
                logging.error("Could not understand audio in segment")
            except sr.RequestError as e:
                logging.error(f"Could not request results for {segment}; {e}")
                if "Bad Request" in str(e):
                    logging.error("Bad request. Check audio file and format.")
    
    return full_transcription.strip()

def process_audio_files(audio_files, output_file):
    """Process a list of audio files and write transcription to an output text file."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for wav_file in audio_files:
            if not os.path.isfile(wav_file):
                logging.error(f"File not found: {wav_file}")
                f.write(f"File not found: {wav_file}\n")
            else:
                transcription = wav_to_text(wav_file)
                if transcription:
                    logging.info(f"Transcription for {wav_file} completed.")
                    f.write(f"Transcription for {wav_file}:\n{transcription}\n\n")
                else:
                    logging.error(f"Transcription failed for {wav_file}.")
                    f.write(f"Transcription failed for {wav_file}.\n\n")

if __name__ == "__main__":
    # Example usage
    audio_files = [f'C:\\Users\\HP\\Python\\Assignment & Project\\output_audio_files\\audio_{i}.wav' for i in range(1, 15)]
    output_file = r'C:\Users\HP\Python\Assignment & Project\transcriptions.txt'
    
    process_audio_files(audio_files, output_file)

INFO:root:Converted audio saved as: converted_audio_mono.wav
INFO:root:Exported segment: segment_0.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_60.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_120.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_180.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_240.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_300.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_360.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_420.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_480.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_540.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_600.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_660.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_720.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segm

## Convert Audio to Text (15 to 30)

In [2]:
import speech_recognition as sr
import logging
from pydub import AudioSegment
import os

# Set up logging
logging.basicConfig(level=logging.INFO)

def convert_to_pcm_mono(wav_file):
    """Convert audio file to mono PCM WAV format."""
    try:
        audio = AudioSegment.from_file(wav_file)
        audio = audio.set_channels(1)  # Convert to mono
        audio = audio.set_frame_rate(16000)  # Set sample rate to 16 kHz
        converted_file = "converted_audio_mono.wav"
        audio.export(converted_file, format="wav", codec="pcm_s16le")  # Ensure PCM format
        logging.info(f"Converted audio saved as: {converted_file}")
        return converted_file
    except Exception as e:
        logging.error(f"Error converting audio file: {e}")
        return None

def split_audio(wav_file, segment_length=60):
    """Split audio into segments of a specified length (in seconds)."""
    audio = AudioSegment.from_file(wav_file)
    segments = []
    for i in range(0, len(audio), segment_length * 1000):  # Convert to milliseconds
        segment = audio[i:i + segment_length * 1000]
        segment_file = f"segment_{i // 1000}.wav"
        segment.export(segment_file, format="wav", codec="pcm_s16le")
        segments.append(segment_file)
        logging.info(f"Exported segment: {segment_file} (Duration: {len(segment) / 1000:.2f} seconds)")
    return segments

def wav_to_text(wav_file):
    """Convert a WAV file to text using Google Speech Recognition."""
    r = sr.Recognizer()
    
    # Convert audio file to mono PCM format
    converted_file = convert_to_pcm_mono(wav_file)
    if converted_file is None:
        return None

    # Split audio into segments if longer than 60 seconds
    segments = split_audio(converted_file, segment_length=60)
    full_transcription = ""

    for segment in segments:
        with sr.AudioFile(segment) as source:
            audio_data = r.record(source)  # Read the entire audio file
            try:
                text = r.recognize_google(audio_data)
                full_transcription += text + " "
                logging.info(f"Transcribed segment: {segment}")
            except sr.UnknownValueError:
                logging.error("Could not understand audio in segment")
            except sr.RequestError as e:
                logging.error(f"Could not request results for {segment}; {e}")
                if "Bad Request" in str(e):
                    logging.error("Bad request. Check audio file and format.")
    
    return full_transcription.strip()

def process_audio_files(audio_files, output_file):
    """Process a list of audio files and write transcription to an output text file."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for wav_file in audio_files:
            if not os.path.isfile(wav_file):
                logging.error(f"File not found: {wav_file}")
                f.write(f"File not found: {wav_file}\n")
            else:
                transcription = wav_to_text(wav_file)
                if transcription:
                    logging.info(f"Transcription for {wav_file} completed.")
                    f.write(f"Transcription for {wav_file}:\n{transcription}\n\n")
                else:
                    logging.error(f"Transcription failed for {wav_file}.")
                    f.write(f"Transcription failed for {wav_file}.\n\n")

if __name__ == "__main__":
    # Example usage
    audio_files = [f'C:\\Users\\HP\\Python\\Assignment & Project\\output_audio_files\\audio_{i}.wav' for i in range(15, 31)]
    output_file = r'C:\Users\HP\Python\Assignment & Project\transcriptions1.txt'
    
    process_audio_files(audio_files, output_file)

INFO:root:Converted audio saved as: converted_audio_mono.wav
INFO:root:Exported segment: segment_0.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_60.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_120.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_180.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_240.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_300.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_360.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_420.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_480.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_540.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_600.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_660.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_720.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segm

## Final Transcriptions File 

In [10]:
# Specify the input file names with raw strings
file1 = r'C:\Users\HP\Python\Assignment & Project\transcriptions.txt'
file2 = r'C:\Users\HP\Python\Assignment & Project\transcriptions1.txt'

# Specify the output file name
output_file = r'C:\Users\HP\Python\Assignment & Project\Interview_Transcription.txt'

# Open the output file in write mode
with open(output_file, 'w') as outfile:
    # Read and write the contents of the first file
    with open(file1, 'r') as f1:
        outfile.write(f1.read())
        outfile.write("\n")  # Add a newline to separate content from both files

    # Read and write the contents of the second file
    with open(file2, 'r') as f2:
        outfile.write(f2.read())

print(f"Files {file1} and {file2} have been merged into {output_file}")

Files C:\Users\HP\Python\Assignment & Project\transcriptions.txt and C:\Users\HP\Python\Assignment & Project\transcriptions1.txt have been merged into C:\Users\HP\Python\Assignment & Project\Interview_Transcription.txt


## Split Transcriptions Files

In [11]:
import re
import os

# Input and output file paths
input_file = r'C:\Users\HP\Python\Assignment & Project\Interview_Transcription.txt'
output_folder = r'C:\Users\HP\Python\Assignment & Project\Interview_Split_Transcriptions_Files'

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Open the main transcription file
with open(input_file, 'r', encoding='utf-8') as file:
    content = file.read()

# Split the content based on each audio file's transcription using a regex pattern
# Assuming each transcription starts with "Transcription for {filepath}"
transcriptions = re.split(r'(Transcription for .+?\.wav:)', content)

# Loop through the split transcriptions and save each as a separate file
for i in range(1, len(transcriptions), 2):
    # File name and content
    file_name = transcriptions[i].strip().replace('Transcription for ', '').replace(':', '').replace('\\', '_').replace('/', '_')
    transcription_content = transcriptions[i] + transcriptions[i + 1]

    # Output path
    output_path = os.path.join(output_folder, f"{file_name}.txt")

    # Write each transcription to a new file
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(transcription_content)

print(f"Transcriptions have been saved to the '{output_folder}' folder.")

Transcriptions have been saved to the 'C:\Users\HP\Python\Assignment & Project\Interview_Split_Transcriptions_Files' folder.


In [17]:
import nltk
nltk.download('brown')
nltk.download('punkt')


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [28]:
import re

def read_interview_file(file_path):
    """Read the interview text file and return its content."""
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return content

def parse_questions_answers(content):
    """Parse the content to segregate questions and answers."""
    # Split the content based on the question and answer format
    qa_pairs = re.findall(r'(Q:.*?)(?=Q:|$)', content, re.DOTALL)
    parsed_qa = []

    for qa in qa_pairs:
        question, answer = qa.split('A:', 1) if 'A:' in qa else (qa, "")
        parsed_qa.append({
            'question': question.strip(),
            'answer': answer.strip(),
            'tag': '',  # Placeholder for tagging
            'category': ''  # Placeholder for categorization
        })

    return parsed_qa

def categorize_answer(answer):
    """Categorize the answer as 'good' or 'bad' based on predefined criteria."""
    if len(answer) < 50:
        return 'bad'  # Example criterion: too short
    elif any(keyword in answer.lower() for keyword in ['excellent', 'great', 'good']):
        return 'good'
    else:
        return 'bad'

def tag_and_categorize_qa(parsed_qa):
    """Tag and categorize each question-answer pair."""
    for qa in parsed_qa:
        # Example tagging logic (you can expand this)
        qa['tag'] = "UPSC_Interview"

        # Categorize the answer
        qa['category'] = categorize_answer(qa['answer'])

def write_output_file(output_path, parsed_qa):
    """Write the tagged and categorized QA pairs to an output file."""
    with open(output_path, 'w', encoding='utf-8') as file:
        for qa in parsed_qa:
            file.write(f"Question: {qa['question']}\n")
            file.write(f"Answer: {qa['answer']}\n")
            file.write(f"Tag: {qa['tag']}\n")
            file.write(f"Category: {qa['category']}\n")
            file.write("\n---\n\n")

def main():
    input_file_path = 'C:\\Users\\HP\\Python\\split_transcriptions\\C_Users_HP_Python_Assignment & Project_output_audio_files_audio_1.wav.txt'  # Path to the input file
    output_file_path = 'tagged_upsc_interview.txt'  # Path to the output file

    # Step 1: Read the interview file
    content = read_interview_file(input_file_path)

    # Step 2: Parse questions and answers
    parsed_qa = parse_questions_answers(content)

    # Step 3: Tag and categorize
    tag_and_categorize_qa(parsed_qa)

    # Step 4: Write output to a new file
    write_output_file(output_file_path, parsed_qa)

if __name__ == "__main__":
    main()

In [29]:
pip install SpeechRecognition pydub textblob


Note: you may need to restart the kernel to use updated packages.


In [32]:
import speech_recognition as sr
import logging
from pydub import AudioSegment
import os
from textblob import TextBlob

# Set up logging
logging.basicConfig(level=logging.INFO)

def convert_to_pcm_mono(wav_file):
    """Convert audio file to mono PCM WAV format."""
    try:
        audio = AudioSegment.from_file(wav_file)
        audio = audio.set_channels(1)  # Convert to mono
        audio = audio.set_frame_rate(16000)  # Set sample rate to 16 kHz
        converted_file = "converted_audio_mono.wav"
        audio.export(converted_file, format="wav", codec="pcm_s16le")  # Ensure PCM format
        logging.info(f"Converted audio saved as: {converted_file}")
        return converted_file
    except Exception as e:
        logging.error(f"Error converting audio file: {e}")
        return None

def split_audio(wav_file, segment_length=60):
    """Split audio into segments of a specified length (in seconds)."""
    audio = AudioSegment.from_file(wav_file)
    segments = []
    for i in range(0, len(audio), segment_length * 1000):  # Convert to milliseconds
        segment = audio[i:i + segment_length * 1000]
        segment_file = f"segment_{i // 1000}.wav"
        segment.export(segment_file, format="wav", codec="pcm_s16le")
        segments.append(segment_file)
        logging.info(f"Exported segment: {segment_file} (Duration: {len(segment) / 1000:.2f} seconds)")
    return segments

def wav_to_text(wav_file):
    """Convert a WAV file to text using Google Speech Recognition."""
    r = sr.Recognizer()
    
    # Convert audio file to mono PCM format
    converted_file = convert_to_pcm_mono(wav_file)
    if converted_file is None:
        return None

    # Split audio into segments if longer than 60 seconds
    segments = split_audio(converted_file, segment_length=60)
    full_transcription = ""

    for segment in segments:
        with sr.AudioFile(segment) as source:
            audio_data = r.record(source)  # Read the entire audio file
            try:
                text = r.recognize_google(audio_data)
                full_transcription += text + " "
                logging.info(f"Transcribed segment: {segment}")
            except sr.UnknownValueError:
                logging.error("Could not understand audio in segment")
            except sr.RequestError as e:
                logging.error(f"Could not request results for {segment}; {e}")
    
    return full_transcription.strip()

def tag_answer_quality(answer):
    """Tag answers based on advanced criteria for UPSC interview context."""
    tags = []

    # Content Relevance and Depth
    if len(answer.split()) > 50:
        tags.append("Content: Excellent")
    elif len(answer.split()) > 20:
        tags.append("Content: Good")
    else:
        tags.append("Content: Weak")

    # Confidence Level
    if any(phrase in answer.lower() for phrase in ["absolutely", "i firmly believe", "i am certain"]):
        tags.append("Confidence: High")
    elif any(phrase in answer.lower() for phrase in ["i think", "maybe", "perhaps", "i am not sure"]):
        tags.append("Confidence: Low")
    else:
        tags.append("Confidence: Moderate")

    # Structured Answer
    if any(phrase in answer.lower() for phrase in ["first", "second", "finally", "in conclusion", "to summarize"]):
        tags.append("Structure: Well-Structured")
    else:
        tags.append("Structure: Unstructured")

    # Emotional Intelligence
    if any(phrase in answer.lower() for phrase in ["i understand", "i empathize", "it is important to consider"]):
        tags.append("Emotional Intelligence: High")
    else:
        tags.append("Emotional Intelligence: Neutral/Low")

    # Tone Analysis using TextBlob
    analysis = TextBlob(answer)
    polarity = analysis.sentiment.polarity
    if polarity > 0.2:
        tags.append("Tone: Positive")
    elif polarity < -0.2:
        tags.append("Tone: Negative")
    else:
        tags.append("Tone: Neutral")

    # Conciseness
    if len(answer.split()) < 30:
        tags.append("Conciseness: Concise")
    else:
        tags.append("Conciseness: Verbose")

    return ", ".join(tags)

def process_audio_files(audio_files, output_file):
    """Process a list of audio files, transcribe them, and categorize the transcriptions."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for wav_file in audio_files:
            if not os.path.isfile(wav_file):
                logging.error(f"File not found: {wav_file}")
                f.write(f"File not found: {wav_file}\n")
            else:
                transcription = wav_to_text(wav_file)
                if transcription:
                    logging.info(f"Transcription for {wav_file} completed.")
                    tags = tag_answer_quality(transcription)
                    f.write(f"Transcription for {wav_file}:\n{transcription}\n")
                    f.write(f"Tags: {tags}\n")
                    f.write("-" * 50 + "\n\n")
                else:
                    logging.error(f"Transcription failed for {wav_file}.")
                    f.write(f"Transcription failed for {wav_file}.\n\n")

if __name__ == "__main__":
    # Example usage
    # List of audio files - Make sure to provide them as a list, even if it's just one file
    audio_files = [
        r'C:\Users\HP\Python\Assignment & Project\output_audio_files\audio_1.wav'
    ]
    output_file = r'C:\Users\HP\Python\Assignment & Project\transcriptions1_tagged.txt'
    
    process_audio_files(audio_files, output_file)


INFO:root:Converted audio saved as: converted_audio_mono.wav
INFO:root:Exported segment: segment_0.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_60.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_120.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_180.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_240.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_300.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_360.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_420.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_480.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_540.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_600.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_660.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segment_720.wav (Duration: 60.00 seconds)
INFO:root:Exported segment: segm

In [33]:
pip install textblob


Note: you may need to restart the kernel to use updated packages.


In [37]:
import re
from textblob import TextBlob

# File paths
input_file = r'C:\Users\HP\Python\Assignment & Project\transcriptions1_tagged.txt'
output_file = r'C:\Users\HP\Python\Assignment & Project\categorized_interview_transcriptions.txt'

def tag_answer_quality(answer):
    """Tag answers based on advanced criteria for UPSC interview context."""
    tags = []

    # Content Relevance and Depth
    if len(answer.split()) > 50:
        tags.append("Content: Excellent")
    elif len(answer.split()) > 20:
        tags.append("Content: Good")
    else:
        tags.append("Content: Weak")

    # Confidence Level
    if any(phrase in answer.lower() for phrase in ["absolutely", "i firmly believe", "i am certain"]):
        tags.append("Confidence: High")
    elif any(phrase in answer.lower() for phrase in ["i think", "maybe", "perhaps", "i am not sure"]):
        tags.append("Confidence: Low")
    else:
        tags.append("Confidence: Moderate")

    # Structured Answer
    if any(phrase in answer.lower() for phrase in ["first", "second", "finally", "in conclusion", "to summarize"]):
        tags.append("Structure: Well-Structured")
    else:
        tags.append("Structure: Unstructured")

    # Emotional Intelligence
    if any(phrase in answer.lower() for phrase in ["i understand", "i empathize", "it is important to consider"]):
        tags.append("Emotional Intelligence: High")
    else:
        tags.append("Emotional Intelligence: Neutral/Low")

    # Tone Analysis using TextBlob
    analysis = TextBlob(answer)
    polarity = analysis.sentiment.polarity
    if polarity > 0.2:
        tags.append("Tone: Positive")
    elif polarity < -0.2:
        tags.append("Tone: Negative")
    else:
        tags.append("Tone: Neutral")

    # Conciseness
    if len(answer.split()) < 30:
        tags.append("Conciseness: Concise")
    else:
        tags.append("Conciseness: Verbose")

    return tags

def categorize_answer(tags):
    """Categorize the answer as Good or Bad based on tags."""
    if "Content: Weak" in tags or "Confidence: Low" in tags or "Tone: Negative" in tags:
        return "Bad"
    else:
        return "Good"

def process_transcription_file(input_file, output_file):
    """Process the transcription file to segregate, tag, and categorize answers."""
    try:
        # Read content from the input file
        with open(input_file, 'r', encoding='utf-8') as infile:
            content = infile.read()

        # Debug: Check if the content was read correctly
        if not content:
            print("The input file is empty or could not be read correctly.")
            return
        else:
            print("Content of input file successfully read.")
            print("------ Content Preview ------")
            print(content[:500])  # Print first 500 characters for inspection
            print("----------------------------")

        # Regular expressions to match questions and answers
        qa_pairs = re.split(r'\n-{20,}\n', content)  # Split by the separator lines with 20 or more dashes

        # Debug: Check the number of QA pairs found
        print(f"Number of QA pairs found: {len(qa_pairs)}")

        if len(qa_pairs) == 0:
            print("No question-answer pairs found. Please check the input format.")
            return

        with open(output_file, 'w', encoding='utf-8') as outfile:
            outfile.write("--- Categorized Interview Transcriptions ---\n\n")

            for idx, qa in enumerate(qa_pairs):
                qa = qa.strip()
                if not qa:
                    continue

                # Debug: Print the current question-answer pair being processed
                print(f"Processing QA pair {idx + 1}:")
                print(qa)
                print("-" * 50)

                # Extract question and answer using regex
                question_match = re.search(r'Question:\s*(.+)', qa, re.IGNORECASE)
                answer_match = re.search(r'Answer:\s*(.+)', qa, re.IGNORECASE)

                if question_match and answer_match:
                    question = question_match.group(1).strip()
                    answer = answer_match.group(1).strip()

                    # Debug: Ensure the question and answer are properly extracted
                    print(f"Extracted Question: {question}")
                    print(f"Extracted Answer: {answer}")

                    # Get advanced tags for the answer
                    tags = tag_answer_quality(answer)

                    # Categorize the answer
                    category = categorize_answer(tags)

                    # Write to output file
                    outfile.write(f"Question: {question}\n")
                    outfile.write(f"Answer: {answer}\n")
                    outfile.write(f"Tags: {', '.join(tags)}\n")
                    outfile.write(f"Category: {category}\n")
                    outfile.write("-" * 50 + "\n\n")

                    # Debugging output
                    print(f"Tags: {tags}")
                    print(f"Category: {category}")
                    print("-" * 50)
                else:
                    # Debug: Print if no valid Q&A found
                    print("No valid question and/or answer found in this segment. Skipping...")
                    print("-" * 50)

        print(f"Categorized transcription saved to '{output_file}'.")

    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    process_transcription_file(input_file, output_file)


Content of input file successfully read.
------ Content Preview ------
Transcription for C:\Users\HP\Python\Assignment & Project\output_audio_files\audio_1.wav:
tell me why do you want to join the civil services for working the Qualcomm getting a very handsome salary why do you want to compromise on less salary in the Civil Services why do we have largest countries so tell me something about the city and also its social fabric can you tell us whether some relationships and adverse weather phenomena when was this Island territory needed to Sri Lanka yes coming good 
----------------------------
Number of QA pairs found: 2
Processing QA pair 1:
Transcription for C:\Users\HP\Python\Assignment & Project\output_audio_files\audio_1.wav:
tell me why do you want to join the civil services for working the Qualcomm getting a very handsome salary why do you want to compromise on less salary in the Civil Services why do we have largest countries so tell me something about the city and also its soc