In [12]:
pip install pypdf2

Note: you may need to restart the kernel to use updated packages.


In [13]:
import re
import nltk
import PyPDF2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources that preprocess_text relies on: the Punkt tokenizer
# data used by word_tokenize and the stop-word lists.
# Newer NLTK releases look up 'punkt_tab' instead of the pickled 'punkt'
# package, so both are requested to keep the notebook runnable on old and new
# NLTK versions alike.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

def preprocess_text(text):
    """Normalize raw text into a list of lowercase, stop-word-free tokens.

    The input is lowercased, every character that is neither a word
    character nor whitespace is deleted, the result is tokenized with
    ``word_tokenize`` and tokens present in NLTK's English stop-word list
    are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        list: The remaining tokens, in their original order.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Split the text into words
    tokens = word_tokenize(text)
    # Load the stop-word list once per call and keep it as a set; evaluating
    # stopwords.words('english') inside the filter would re-fetch the whole
    # list for every single token.
    stop_words = set(stopwords.words('english'))
    # Remove stop words
    return [word for word in tokens if word not in stop_words]

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        str: The per-page texts separated by a single space, with leading
        and trailing whitespace stripped.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Pull the text out while the file is still open.
        page_texts = [page.extract_text() for page in pdf_reader.pages]
    return " ".join(page_texts).strip()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
import math
from collections import Counter

def compute_tf(document_tokens):
    """Compute the term frequency of each distinct token in one document.

    Args:
        document_tokens: Sequence of tokens belonging to a single document.

    Returns:
        dict: Maps each distinct token to its number of occurrences divided
        by the total number of tokens. An empty sequence yields an empty
        dict.
    """
    n_tokens = len(document_tokens)
    frequencies = {}
    for token, occurrences in Counter(document_tokens).items():
        frequencies[token] = occurrences / n_tokens
    return frequencies

def compute_idf(documents):
    """Compute the inverse document frequency of every word in a corpus.

    For each word, the score is ``math.log(N / df)`` where ``N`` is the
    number of documents and ``df`` is the number of documents containing the
    word at least once. A word present in every document therefore scores
    0.0.

    Args:
        documents: Sequence of documents, each an iterable of tokens.

    Returns:
        dict: Maps each word of the corpus to its IDF score. An empty corpus
        yields an empty dict.
    """
    n_documents = len(documents)

    # Count each word at most once per document. Collapsing a document to a
    # set first means every token is visited a single time, instead of
    # scanning each document's token list once per vocabulary word.
    document_frequency = Counter()
    for doc in documents:
        document_frequency.update(set(doc))

    return {
        word: math.log(n_documents / df)
        for word, df in document_frequency.items()
    }


In [15]:
def compute_tfidf(documents):
    """Compute TF-IDF scores for every document of a corpus.

    The IDF table is built once from the whole corpus via ``compute_idf``;
    each document's term frequencies come from ``compute_tf``.

    Args:
        documents: Sequence of tokenized documents.

    Returns:
        list: One dict per document, in input order, mapping each word of
        that document to its term frequency multiplied by the word's IDF.
    """
    idf_by_word = compute_idf(documents)

    def _score_document(doc):
        # Weight the document's term frequencies by the corpus-wide IDF.
        term_freq = compute_tf(doc)
        return {word: freq * idf_by_word[word] for word, freq in term_freq.items()}

    return [_score_document(doc) for doc in documents]

def get_top_words(tfidf_scores, top_n=5):
    """Pick the highest-scoring words of each document.

    Args:
        tfidf_scores: Iterable of dicts mapping word -> score, one per
            document.
        top_n: Length of the slice kept from each ranked list (default 5).

    Returns:
        list: For each document, a list of ``(word, score)`` tuples sorted
        by descending score and cut to the first ``top_n`` entries. Words
        with equal scores keep the order they have in the input dict.
    """
    def _score_of(item):
        return item[1]

    return [
        sorted(scores.items(), key=_score_of, reverse=True)[:top_n]
        for scores in tfidf_scores
    ]


In [23]:
pip install reportlab

Note: you may need to restart the kernel to use updated packages.


In [28]:
from reportlab.pdfgen import canvas

def create_sample_pdf(file_name, text):
    """Create a PDF file containing a single drawn string.

    Args:
        file_name: Output path handed to ``canvas.Canvas``.
        text: The string drawn on the canvas at position x=100, y=750.
    """
    pdf_canvas = canvas.Canvas(file_name)
    pdf_canvas.drawString(100, 750, text)
    # Nothing is written to disk until the canvas is saved.
    pdf_canvas.save()

# Create the sample PDF files, one per text below
texts = [
    "This is a sample document about artificial intelligence and deep learning.",
    "Natural language processing is a key field in AI research and development.",
    "Machine learning algorithms are improving every year with new techniques.",
    "Supervised and unsupervised learning are two main types of machine learning."
]

# Files are numbered from 1 (doc1.pdf, doc2.pdf, ...). The count comes from
# `texts` itself, so adding or removing a sample needs no change to the loop.
for doc_number, text in enumerate(texts, start=1):
    create_sample_pdf(f"doc{doc_number}.pdf", text)

print("Sample PDFs created successfully!")


Sample PDFs created successfully!


In [30]:
# PDFs to analyse, in the order they are reported below.
pdf_files = ["doc1.pdf", "doc2.pdf", "doc3.pdf", "doc4.pdf"]

# Extract and tokenize each PDF in turn.
documents = []
for pdf_path in pdf_files:
    documents.append(preprocess_text(extract_text_from_pdf(pdf_path)))

# Score every document against the whole corpus, then keep its best 5 words.
tfidf_scores = compute_tfidf(documents)
top_words = get_top_words(tfidf_scores, top_n=5)

# Report with 1-based document numbers.
for i, words in enumerate(top_words):
    print(f"Top words in document {i+1}: {words}")


Top words in document 1: [('sample', 0.23104906018664842), ('document', 0.23104906018664842), ('artificial', 0.23104906018664842), ('intelligence', 0.23104906018664842), ('deep', 0.23104906018664842)]
Top words in document 2: [('natural', 0.17328679513998632), ('language', 0.17328679513998632), ('processing', 0.17328679513998632), ('key', 0.17328679513998632), ('field', 0.17328679513998632)]
Top words in document 3: [('algorithms', 0.17328679513998632), ('improving', 0.17328679513998632), ('every', 0.17328679513998632), ('year', 0.17328679513998632), ('new', 0.17328679513998632)]
Top words in document 4: [('supervised', 0.17328679513998632), ('unsupervised', 0.17328679513998632), ('two', 0.17328679513998632), ('main', 0.17328679513998632), ('types', 0.17328679513998632)]
