# In [None]:
import os
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from PyPDF2 import PdfReader
import pdfplumber


class PitchAnalysisModel:
    """Score the standard sections of a pitch deck with a sequence classifier.

    The text of a pitch is pulled out of a PDF, the lines that mention each
    known section heading are collected, and every non-empty section is run
    through a Hugging Face sequence-classification model. The probability of
    class index 1 (scaled to 0-100) is reported as the section score.

    NOTE(review): the default 'bert-base-uncased' checkpoint is presumably a
    base model without a fine-tuned classification head, in which case the
    scores are not meaningful until a trained checkpoint is supplied --
    confirm before relying on the numbers.
    """

    # Section name -> regex fragment used to find the lines of that section.
    _SECTION_PATTERNS = {
        "Problem": "Problem",
        "Solution": "Solution",
        "Market": "Market",
        "Business Model": "Business Model",
        "Financials": "Financials",
        "Team": "Team",
    }

    def __init__(self, model_name: str = 'bert-base-uncased') -> None:
        """Load the tokenizer and classification model named *model_name*."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

    def extract_text_from_pdf(self, pdf_path) -> str:
        """Return the text of every page of *pdf_path*, pages separated by spaces.

        pdfplumber is tried first. If it raises, or yields only whitespace,
        PyPDF2 is used as a fallback.

        Raises:
            ValueError: if the PyPDF2 fallback also fails.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # extract_text() may return None for a page without text;
                # treat that as an empty page instead of crashing.
                text = ''.join(f"{page.extract_text() or ''} " for page in pdf.pages)
            if text.strip():
                return text
        except Exception as e:
            # Deliberate best-effort: any pdfplumber failure triggers the fallback.
            print(f"pdfplumber failed to extract text: {e}. Falling back to PyPDF2.")

        # The fallback starts from scratch so that partial pdfplumber output
        # is never mixed into (and duplicated in) the PyPDF2 result.
        try:
            reader = PdfReader(pdf_path)
            return ''.join(f"{page.extract_text() or ''} " for page in reader.pages)
        except Exception as e:
            raise ValueError(f"Both pdfplumber and PyPDF2 failed to extract text: {e}") from e

    def preprocess_text(self, text: str) -> str:
        """Collapse every run of whitespace (including newlines) to one space and trim."""
        # r'\s+' already covers '\r' and '\n', so a single substitution suffices.
        return re.sub(r'\s+', ' ', text).strip()

    def predict(self, text: str) -> "torch.Tensor":
        """Return the softmax class probabilities the model assigns to *text*.

        The input is truncated to at most 512 tokens. The softmax is taken
        over dimension 1 of the model's logits; no gradients are tracked.
        """
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return torch.nn.functional.softmax(outputs.logits, dim=1)

    def analyze_pitch(self, text: str) -> dict:
        """Score each known pitch section found in *text*.

        For every section, all lines from the (case-insensitive, whole-word)
        section keyword to the end of that line are gathered and classified.

        Returns:
            A dict mapping each section name to
            ``{"score": float, "feedback": str}``. Sections with no matching
            text get a score of 0.0 and a "missing or insufficient" message.
        """
        feedback = {}
        for section_name, pattern in self._SECTION_PATTERNS.items():
            # The pattern must be a *raw* string: in a normal string literal
            # '\b' is a backspace character, not a regex word boundary, and
            # the search would never match anything.
            matches = re.findall(rf'\b{pattern}\b.*', text, flags=re.IGNORECASE)
            section_text = self.preprocess_text(" ".join(matches))

            if section_text:
                scores = self.predict(section_text)
                # Probability of class index 1 for the single input, as a percentage.
                score = scores[0][1].item() * 100
                feedback[section_name] = {
                    "score": score,
                    "feedback": f"The {section_name} section is evaluated with a score of {score:.2f}."
                }
            else:
                feedback[section_name] = {
                    "score": 0.0,
                    "feedback": f"The {section_name} section is missing or insufficient."
                }
        return feedback


if __name__ == "__main__":
    # Demo run: extract the sample deck, score every section, and report
    # both the per-section feedback and the overall average.
    pdf_path = "Sampledata.pdf"
    model = PitchAnalysisModel()

    extracted_text = model.extract_text_from_pdf(pdf_path)
    print(f"\nExtracted Text from PDF:\n {extracted_text}\n")

    analysis_result = model.analyze_pitch(extracted_text)

    # The overall score is the unweighted mean of the per-section scores.
    section_scores = [section['score'] for section in analysis_result.values()]
    total_score = sum(section_scores) / len(section_scores)

    print("\nPitch Analysis Results:")
    for section, result in analysis_result.items():
        print(f"{section}: {result['feedback']}")

    print(f"\nOverall Pitch Score: {total_score:.2f}")
