<a href="https://colab.research.google.com/github/Ragavi203/AI-Powered-PDF-Document-Summarizer-and-Analyzer/blob/main/AI_Powered_PDF_Document_Summarizer_and_Analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install PyPDF2 transformers nltk spacy

import json
import textwrap
from collections import Counter
from itertools import islice

import nltk
import PyPDF2
import spacy
from google.colab import files
from nltk.tokenize import sent_tokenize
from transformers import pipeline

# Download required NLTK data.
# `sent_tokenize` loads its English sentence model from 'punkt' on older NLTK
# releases and from 'punkt_tab' on newer ones; fetch both so the tokenizer
# works regardless of the NLTK version pip installed.
nltk.download('punkt')
nltk.download('punkt_tab')

class EnhancedPDFAnalyzer:
    """Summarize PDF text, list its leading sentences and vote on its sentiment.

    Attributes:
        bart_summarizer: Hugging Face summarization pipeline built on
            ``facebook/bart-large-cnn``.
        nlp: spaCy ``en_core_web_sm`` pipeline, used here for sentence
            segmentation.
        sentiment_analyzer: Hugging Face ``sentiment-analysis`` pipeline using
            whichever model the library selects by default.
    """

    # Label returned by `analyze_sentiment` when there is no text to classify.
    NO_SENTIMENT = "UNKNOWN"

    def __init__(self):
        """Load the three models once so every analysis call reuses them."""
        self.bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        self.nlp = spacy.load("en_core_web_sm")
        self.sentiment_analyzer = pipeline("sentiment-analysis")

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract the text of every page of a PDF.

        Args:
            pdf_file: Whatever ``PyPDF2.PdfReader`` accepts (the caller in this
                file passes a file name).

        Returns:
            The text of all pages in order, each page followed by a newline.
        """
        reader = PyPDF2.PdfReader(pdf_file)
        print(f"Processing {len(reader.pages)} pages...")
        # A page without a text layer may yield nothing; treat that as "" so
        # one image-only page cannot abort the whole extraction. Joining once
        # also avoids repeated string concatenation on large documents.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    def generate_summary(self, text: str, chunk_size: int = 1024,
                         max_length: int = 130, min_length: int = 30) -> str:
        """Summarize ``text`` chunk by chunk with the BART pipeline.

        Args:
            text: Document text to summarize.
            chunk_size: Number of characters per chunk handed to the model.
            max_length: ``max_length`` forwarded to the summarizer.
            min_length: ``min_length`` forwarded to the summarizer.

        Returns:
            The chunk summaries joined with single spaces. Empty when ``text``
            contains no non-whitespace characters or every chunk failed.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        pieces = (text[start:start + chunk_size]
                  for start in range(0, len(text), chunk_size))
        # Whitespace-only chunks (e.g. from pages with no extractable text)
        # carry nothing to summarize, so they are never sent to the model.
        chunks = [piece for piece in pieces if piece.strip()]

        summaries = []
        for number, chunk in enumerate(chunks, start=1):
            print(f"Processing chunk {number}/{len(chunks)}...")
            try:
                # truncation=True guards against chunks whose token count
                # exceeds the model limit even though the character count
                # is bounded by chunk_size.
                result = self.bart_summarizer(
                    chunk,
                    max_length=max_length,
                    min_length=min_length,
                    do_sample=False,
                    truncation=True,
                )
                summaries.append(result[0]['summary_text'])
            except Exception as e:
                # Deliberately best-effort: one failing chunk must not lose
                # the summaries of the others.
                print(f"Warning: Summarization failed for chunk {number}: {e}")

        return " ".join(summaries)

    def extract_key_insights(self, text: str, max_insights: int = 5,
                             min_chars: int = 20) -> list:
        """Return the first few sufficiently long sentences of ``text``.

        Args:
            text: Document text to segment into sentences.
            max_insights: Maximum number of sentences to return.
            min_chars: Sentences must be longer than this many characters.

        Returns:
            Up to ``max_insights`` sentence strings, in document order.
        """
        # spaCy rejects texts longer than `nlp.max_length`; only the leading
        # sentences are needed, so clip instead of failing on big documents.
        limit = getattr(self.nlp, "max_length", None)
        if isinstance(limit, int) and len(text) > limit:
            text = text[:limit]

        doc = self.nlp(text)
        long_enough = (sent.text for sent in doc.sents if len(sent.text) > min_chars)
        # Stop iterating as soon as enough sentences have been collected.
        return list(islice(long_enough, max_insights))

    def analyze_sentiment(self, text: str, max_chars: int = 10000) -> str:
        """Return the most frequent per-sentence sentiment label.

        Only the first ``max_chars`` characters of ``text`` are analyzed.

        Args:
            text: Document text to classify.
            max_chars: Size of the leading slice of ``text`` that is analyzed.

        Returns:
            The label assigned to the most sentences; on a tie, the label that
            occurs first in the text. ``NO_SENTIMENT`` ("UNKNOWN") when there
            is nothing to classify.
        """
        sample = text[:max_chars]
        if not sample.strip():
            return self.NO_SENTIMENT

        sentences = sent_tokenize(sample)
        if not sentences:
            return self.NO_SENTIMENT

        # truncation=True keeps a single over-long "sentence" (common in PDF
        # text without punctuation) from exceeding the classifier's input limit.
        predictions = self.sentiment_analyzer(sentences, truncation=True)
        votes = Counter(prediction['label'] for prediction in predictions)
        if not votes:
            return self.NO_SENTIMENT
        # most_common keeps first-seen order among equal counts, which makes
        # tie-breaking deterministic (iterating a set of labels is not).
        return votes.most_common(1)[0][0]

    def analyze_document(self, text: str) -> dict:
        """Run summary, key-insight and sentiment analysis on ``text``.

        Returns:
            A dict with the keys ``'summary'`` (str), ``'key_insights'``
            (list of str) and ``'sentiment'`` (str).
        """
        print("Generating summary...")
        summary = self.generate_summary(text)

        print("Extracting key insights...")
        key_insights = self.extract_key_insights(text)

        print("Analyzing sentiment...")
        sentiment = self.analyze_sentiment(text)

        return {
            'summary': summary,
            'key_insights': key_insights,
            'sentiment': sentiment
        }

def main():
    """Interactive Colab workflow: upload PDFs, analyze each and save the results.

    For every uploaded file the extracted word count, summary, key insights
    and overall sentiment are printed, and the results are written as JSON.
    With a single upload the output file is ``analysis_results.json``; with
    several uploads each file gets its own ``analysis_results_<name>.json`` so
    later files do not overwrite earlier results.

    A failure while processing one file is reported and does not stop the
    remaining files from being processed.
    """
    print("Welcome to Enhanced PDF Analyzer!")

    analyzer = EnhancedPDFAnalyzer()

    print("\nPlease upload your PDF file...")
    uploaded = files.upload()
    several_files = len(uploaded) > 1

    for filename in uploaded:
        try:
            print(f"\nAnalyzing {filename}...")

            # Extract text. The upload's key is used as a path here, which
            # assumes Colab saved the file under that name in the working
            # directory -- NOTE(review): confirm if run outside Colab.
            text = analyzer.extract_text_from_pdf(filename)
            print(f"Extracted {len(text.split())} words from PDF")

            # Analyze document
            results = analyzer.analyze_document(text)

            # Print results
            print("\n" + "="*80)
            print("DOCUMENT ANALYSIS RESULTS")
            print("="*80)

            print("\nSUMMARY:")
            print(textwrap.fill(results['summary'], width=80))

            print("\nKEY INSIGHTS:")
            for insight in results['key_insights']:
                print(f"- {insight}")

            print("\nOVERALL SENTIMENT:")
            print(results['sentiment'])

            # Save results to file. Use a per-document name when several
            # PDFs were uploaded; otherwise keep the original fixed name.
            if several_files:
                stem = filename.rsplit('.', 1)[0] or filename
                output_path = f"analysis_results_{stem}.json"
            else:
                output_path = 'analysis_results.json'

            with open(output_path, 'w', encoding='utf-8') as f:
                # The file is UTF-8, so keep non-ASCII text readable rather
                # than \u-escaped.
                json.dump(results, f, indent=2, ensure_ascii=False)
            print(f"\nFull analysis has been saved to '{output_path}'")

        except Exception as e:
            # Top-level boundary: report the problem and move on to the next file.
            print(f"Error processing {filename}: {e}")

# Start the interactive workflow only when this code is the entry point
# (module name "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()
