# In [3]:  (Jupyter notebook cell marker; the code below is the cell's content)
import PyPDF2
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Download ALL required NLTK resources.
# Each resource is fetched independently so one failure (for example, no
# network access) does not prevent the remaining downloads from being tried.
# nltk.download() reports most failures by returning False rather than
# raising, so both the return value and exceptions are checked.
for _nltk_resource in (
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'punkt_tab',  # needed by newer NLTK releases for word_tokenize
):
    try:
        if not nltk.download(_nltk_resource, quiet=True):
            print(f"Warning: could not download NLTK resource '{_nltk_resource}'")
    except Exception as e:
        print(f"Warning: NLTK download of '{_nltk_resource}' failed: {e}")

class ResearchPaperThemeExtractor:
    """Extract recurring themes from a folder of research-paper PDFs.

    Typical flow: ``load_papers`` -> ``analyze_themes`` (which preprocesses
    the text, ranks keywords with TF-IDF, runs LDA and NMF topic modeling and
    shows two plots).

    Instance state:
        papers_text: raw extracted text, one entry per loaded PDF.
        paper_titles: PDF file names, parallel to ``papers_text``.
        cleaned_texts: preprocessed text of the papers that survived cleaning
            (can be shorter than ``papers_text``).
        stop_words: NLTK English stop words plus academic boilerplate terms.
        lemmatizer: WordNet lemmatizer applied to every kept token.
    """

    def __init__(self):
        """Create an empty extractor and build the stop-word set."""
        self.papers_text = []
        self.paper_titles = []
        self.cleaned_texts = []
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

        # Add academic-specific stop words
        academic_stopwords = {
            'paper', 'study', 'research', 'method', 'result', 'conclusion',
            'introduction', 'abstract', 'section', 'figure', 'table',
            'reference', 'citation', 'author', 'journal', 'proceedings',
            'however', 'therefore', 'moreover', 'furthermore', 'namely',
            'et', 'al', 'etc', 'ie', 'eg', 'cf', 'vol', 'pp', 'no'
        }
        self.stop_words.update(academic_stopwords)

    def extract_text_from_pdf(self, pdf_path):
        """Return the text of all pages of a PDF, or None on failure.

        Args:
            pdf_path: path of the PDF file to read.

        Returns:
            The page texts joined by single spaces and stripped, or ``None``
            when the file cannot be read or yields fewer than 50 characters
            (e.g. a scanned, image-only PDF).
        """
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Pages must be read while the file is still open.
                page_texts = [page.extract_text() for page in pdf_reader.pages]
        except Exception as e:
            print(f"Error reading {pdf_path}: {e}")
            return None

        # Skip pages that produced no text (None or empty string).
        text = " ".join(chunk for chunk in page_texts if chunk).strip()

        # Check if we got meaningful text
        if len(text) < 50:
            print(f"Warning: Very little text extracted from {os.path.basename(pdf_path)}")
            return None

        return text

    def load_papers(self, folder_path):
        """Load every ``.pdf`` file in ``folder_path`` into the extractor.

        Papers are appended to ``papers_text`` / ``paper_titles``; files that
        yield 100 characters or fewer are reported and skipped. Errors are
        printed rather than raised.

        Args:
            folder_path: directory to scan (non-recursive).
        """
        try:
            # Sort so the paper order (and therefore "Paper N" messages and
            # model inputs) is reproducible; os.listdir order is arbitrary.
            pdf_files = sorted(
                name for name in os.listdir(folder_path)
                if name.lower().endswith('.pdf')
            )

            print(f"Found {len(pdf_files)} PDF files in folder")

            for pdf_file in pdf_files:
                pdf_path = os.path.join(folder_path, pdf_file)
                print(f"Processing: {pdf_file}...", end=" ")

                text = self.extract_text_from_pdf(pdf_path)
                if text and len(text) > 100:  # Only add if substantial text extracted
                    self.papers_text.append(text)
                    self.paper_titles.append(pdf_file)
                    print("✓ Loaded successfully")
                else:
                    print("✗ Failed to load (empty or corrupted)")

            print(f"\nSuccessfully loaded: {len(self.papers_text)} out of {len(pdf_files)} papers")

        except Exception as e:
            print(f"Error loading papers: {e}")

    def simple_tokenize(self, text):
        """Tokenize without NLTK: lowercase ASCII words of 3+ letters.

        Used as the fallback when ``word_tokenize`` is unavailable.
        """
        return re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

    def preprocess_text(self, text):
        """Clean one document and return it as a space-joined token string.

        Steps: strip bracketed numeric citations and "(Name et al., 2020)"
        style citations, drop every non-letter character, tokenize (NLTK
        first, regex fallback), lemmatize, and remove stop words and tokens
        of two characters or fewer.

        Args:
            text: raw document text.

        Returns:
            The cleaned text, or ``""`` if preprocessing fails.
        """
        try:
            # Remove citations and references
            text = re.sub(r'\[\d+\]', '', text)
            text = re.sub(r'\(\w+ et al\.?,?\s*\d{4}\)', '', text)

            # Remove special characters and digits, keep only letters
            text = re.sub(r'[^a-zA-Z\s]', ' ', text)
            text = re.sub(r'\s+', ' ', text).strip()

            # Try NLTK tokenization first, fallback to simple tokenization
            try:
                tokens = word_tokenize(text.lower())
            except Exception:
                tokens = self.simple_tokenize(text)

            processed_tokens = []
            for token in tokens:
                if len(token) <= 2 or token in self.stop_words:
                    continue
                try:
                    lemma = self.lemmatizer.lemmatize(token)
                except Exception:
                    lemma = token  # Fallback to original token
                # Re-check after lemmatizing: the academic stop words are
                # singular, so e.g. "results" only matches once it has been
                # reduced to "result".
                if len(lemma) > 2 and lemma not in self.stop_words:
                    processed_tokens.append(lemma)

            return ' '.join(processed_tokens)

        except Exception as e:
            print(f"Error in preprocessing: {e}")
            return ""

    def preprocess_all_papers(self):
        """Rebuild ``cleaned_texts`` from ``papers_text``.

        Papers whose cleaned text is 10 characters or shorter are reported
        and left out, so ``cleaned_texts`` may be shorter than
        ``papers_text``.
        """
        print("Preprocessing papers...")
        self.cleaned_texts = []
        for i, text in enumerate(self.papers_text):
            cleaned = self.preprocess_text(text)
            if cleaned and len(cleaned) > 10:  # Only keep if we have meaningful content
                self.cleaned_texts.append(cleaned)
            else:
                print(f"Paper {i+1} resulted in empty text after preprocessing")

        print(f"Preprocessing completed! {len(self.cleaned_texts)} papers ready for analysis.")

    def extract_keywords_tfidf(self, top_n=20):
        """Rank unigrams/bigrams by their mean TF-IDF weight over all papers.

        Args:
            top_n: number of keywords to return; values <= 0 yield no
                keywords.

        Returns:
            ``(top_keywords, tfidf_matrix, vectorizer)`` where
            ``top_keywords`` is a list of ``(term, score)`` in descending
            score order. Returns ``([], None, None)`` when there is no
            cleaned text or the analysis fails.
        """
        if not self.cleaned_texts:
            print("No cleaned texts available for analysis")
            return [], None, None

        try:
            vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
            tfidf_matrix = vectorizer.fit_transform(self.cleaned_texts)

            feature_names = vectorizer.get_feature_names_out()

            # Get overall important terms
            overall_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()
            # Reverse first, then slice: unlike [-top_n:], this returns an
            # empty selection (not everything) when top_n is 0.
            top_indices = overall_tfidf.argsort()[::-1][:max(top_n, 0)]
            top_keywords = [(feature_names[i], overall_tfidf[i]) for i in top_indices]

            return top_keywords, tfidf_matrix, vectorizer

        except Exception as e:
            print(f"Error in TF-IDF analysis: {e}")
            return [], None, None

    @staticmethod
    def _top_terms_per_topic(components, feature_names, top_words=10):
        """Label each topic row with its ``top_words`` highest-weighted terms.

        Args:
            components: 2-D array-like, one row of term weights per topic.
            feature_names: term for each column of ``components``.
            top_words: how many terms to keep per topic.

        Returns:
            List of ``("Topic N", [term, ...])`` tuples, N starting at 1.
        """
        topics = []
        for topic_idx, weights in enumerate(components):
            ranked = np.asarray(weights).argsort()[::-1][:top_words]
            topics.append(
                (f"Topic {topic_idx + 1}", [feature_names[i] for i in ranked])
            )
        return topics

    def topic_modeling_lda(self, num_topics=5):
        """Fit LDA on term counts and describe each topic by its top 10 terms.

        Args:
            num_topics: number of LDA components.

        Returns:
            ``(topics, lda_model, doc_term_matrix)``; ``([], None, None)``
            when there is no cleaned text or fitting fails.
        """
        if not self.cleaned_texts:
            return [], None, None

        try:
            # Create document-term matrix
            vectorizer = CountVectorizer(max_features=1000, ngram_range=(1, 2))
            doc_term_matrix = vectorizer.fit_transform(self.cleaned_texts)

            # Apply LDA
            lda = LatentDirichletAllocation(
                n_components=num_topics,
                random_state=42,
                max_iter=10
            )
            lda.fit(doc_term_matrix)

            topics = self._top_terms_per_topic(
                lda.components_, vectorizer.get_feature_names_out()
            )
            return topics, lda, doc_term_matrix

        except Exception as e:
            print(f"Error in LDA analysis: {e}")
            return [], None, None

    def topic_modeling_nmf(self, num_topics=5):
        """Fit NMF on TF-IDF weights and describe each topic by its top 10 terms.

        Args:
            num_topics: number of NMF components.

        Returns:
            List of ``("Topic N", [term, ...])`` tuples; ``[]`` when there is
            no cleaned text or fitting fails.
        """
        if not self.cleaned_texts:
            return []

        try:
            tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
            tfidf_matrix = tfidf_vectorizer.fit_transform(self.cleaned_texts)

            nmf = NMF(n_components=num_topics, random_state=42)
            nmf.fit(tfidf_matrix)

            return self._top_terms_per_topic(
                nmf.components_, tfidf_vectorizer.get_feature_names_out()
            )

        except Exception as e:
            print(f"Error in NMF analysis: {e}")
            return []

    def generate_word_cloud(self):
        """Show a word cloud built from all cleaned papers.

        Prints a message and returns without plotting when there is no (or
        almost no) cleaned text; plotting errors are printed, not raised.
        """
        if not self.cleaned_texts:
            print("No data available for word cloud")
            return

        all_text = ' '.join(self.cleaned_texts)

        if len(all_text.strip()) < 10:
            print("Not enough text for word cloud")
            return

        try:
            wordcloud = WordCloud(
                width=800,
                height=400,
                background_color='white',
                max_words=100
            ).generate(all_text)

            plt.figure(figsize=(12, 6))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Word Cloud of Research Papers Themes')
            plt.tight_layout()
            plt.show()

        except Exception as e:
            print(f"Error generating word cloud: {e}")

    def plot_top_keywords(self, top_keywords):
        """Show a horizontal bar chart of the first 15 keywords.

        Args:
            top_keywords: list of ``(term, score)`` pairs, best first.
        """
        if not top_keywords:
            print("No keywords to plot")
            return

        try:
            keywords, scores = zip(*top_keywords[:15])

            plt.figure(figsize=(12, 6))
            plt.barh(keywords, scores)
            plt.xlabel('TF-IDF Score')
            plt.title('Top 15 Keywords Across All Papers')
            # barh draws the first item at the bottom; flip so the best
            # keyword is on top.
            plt.gca().invert_yaxis()
            plt.tight_layout()
            plt.show()

        except Exception as e:
            print(f"Error plotting keywords: {e}")

    def analyze_themes(self):
        """Run the full analysis on the loaded papers and print a report.

        Returns:
            A dict with keys ``'keywords'``, ``'lda_topics'`` and
            ``'nmf_topics'`` (each a possibly empty list), or ``None`` when
            no papers are loaded or none survive preprocessing.
        """
        print("=" * 60)
        print("RESEARCH PAPER THEME ANALYSIS")
        print("=" * 60)

        if not self.papers_text:
            print("No papers to analyze!")
            return None

        # Preprocess papers
        self.preprocess_all_papers()

        if not self.cleaned_texts:
            print("No papers could be processed for analysis!")
            return None

        results = {}

        # Extract keywords
        print("\n1. TOP KEYWORDS ANALYSIS")
        print("-" * 30)
        top_keywords, tfidf_matrix, vectorizer = self.extract_keywords_tfidf()
        if top_keywords:
            for keyword, score in top_keywords[:15]:
                print(f"{keyword}: {score:.4f}")
            results['keywords'] = top_keywords
        else:
            print("No keywords extracted")
            results['keywords'] = []

        # Topic modeling with LDA
        print("\n2. LDA TOPIC MODELING")
        print("-" * 30)
        lda_topics, lda_model, doc_term_matrix = self.topic_modeling_lda()
        if lda_topics:
            for topic_name, keywords in lda_topics:
                print(f"{topic_name}: {', '.join(keywords[:8])}")
            results['lda_topics'] = lda_topics
        else:
            print("No LDA topics extracted")
            results['lda_topics'] = []

        # Topic modeling with NMF
        print("\n3. NMF TOPIC MODELING")
        print("-" * 30)
        nmf_topics = self.topic_modeling_nmf()
        if nmf_topics:
            for topic_name, keywords in nmf_topics:
                print(f"{topic_name}: {', '.join(keywords[:8])}")
            results['nmf_topics'] = nmf_topics
        else:
            print("No NMF topics extracted")
            results['nmf_topics'] = []

        # Generate visualizations
        print("\n4. GENERATING VISUALIZATIONS...")
        if top_keywords:
            self.plot_top_keywords(top_keywords)
        self.generate_word_cloud()

        return results

# MAIN EXECUTION
def main(papers_folder=None):
    """Load the PDFs in a folder, run the theme analysis and print a summary.

    Args:
        papers_folder: directory containing the PDF papers. When omitted,
            the original hard-coded desktop folder is used.

    Returns:
        None. All output is printed (and plotted by the extractor).
    """
    if papers_folder is None:
        # Default location, kept as a raw string so backslashes stay literal.
        papers_folder = r"C:\Users\mahac\Desktop\pdf_files"

    print(f"Looking for papers in: {papers_folder}")

    # Validate the folder before building the extractor, so a bad path is
    # reported without first loading NLTK data. isdir also rejects a path
    # that exists but is a regular file.
    if not os.path.isdir(papers_folder):
        print(f"Error: Folder '{papers_folder}' does not exist!")
        print("Please check the path and try again.")
        return

    # Initialize the theme extractor and load papers
    extractor = ResearchPaperThemeExtractor()
    extractor.load_papers(papers_folder)

    if not extractor.papers_text:
        print("No papers were loaded. Please check:")
        print("1. The folder contains PDF files")
        print("2. PDF files are not password protected")
        print("3. PDF files contain extractable text (not scanned images)")
        return

    # Perform analysis; analyze_themes returns None when nothing usable
    # remains after preprocessing.
    results = extractor.analyze_themes()
    if results:
        # Print summary
        print("\n" + "=" * 60)
        print("ANALYSIS SUMMARY")
        print("=" * 60)
        print(f"Total papers analyzed: {len(extractor.papers_text)}")
        print(f"Papers successfully processed: {len(extractor.cleaned_texts)}")
        print(f"Main themes identified: {len(results.get('lda_topics', []))}")
        print(f"Top keywords found: {len(results.get('keywords', []))}")

# Run the analysis only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

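# Notebook output when this cell was run:
#   ModuleNotFoundError: No module named 'wordcloud'
# The third-party 'wordcloud' package was not installed; install it
# (e.g. `pip install wordcloud`) before running this script.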