# TF-IDF

In [8]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample slide content
slides = [
    "Introduction to Machine Learning",
    "Supervised Learning",
    "Unsupervised Learning",
    "Applications of Machine Learning",
    "Conclusion"
]

# Preprocessing
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
processed_slides = []
for slide in slides:
    words = word_tokenize(slide.lower())
    filtered_words = [stemmer.stem(word) for word in words if word not in stop_words and word.isalnum()]
    processed_slides.append(' '.join(filtered_words))

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_slides)

# Calculate importance scores for each slide
importance_scores = tfidf_matrix.sum(axis=1).flatten()

# Convert importance_scores to NumPy array
importance_scores = np.array(importance_scores)

# Assign slide numbers based on importance
sorted_indices = importance_scores.argsort()[::-1].tolist()[0]  # Extract inner list
slide_numbers = {slide: i+1 for i, slide in enumerate(sorted_indices)}

# Print slide numbers
for slide, number in slide_numbers.items():
    print(f"Slide '{slides[slide]}' has importance rank: {number}")

    

Slide 'Conclusion' has importance rank: 1
Slide 'Supervised Learning' has importance rank: 2
Slide 'Unsupervised Learning' has importance rank: 3
Slide 'Introduction to Machine Learning' has importance rank: 4
Slide 'Applications of Machine Learning' has importance rank: 5


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vaishnavikamisetti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vaishnavikamisetti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# NER


In [10]:
import spacy

# Load the smaller English NER model
nlp_sm = spacy.load("en_core_web_sm")

# Load the larger English NER model
nlp_lg = spacy.load("en_core_web_lg")

# Sample slides
slides = [
    "Introduction to Machine Learning",
    "Supervised Learning",
    "Unsupervised Learning",
    "Applications of Machine Learning",
    "Conclusion"
]

# Function to extract entities using NER model
def extract_entities(nlp_model, text):
    doc = nlp_model(text)
    entities = [ent.text for ent in doc.ents]
    return entities

# Extract entities using both NER models
all_entities = []
for slide in slides:
    entities_sm = extract_entities(nlp_sm, slide)
    entities_lg = extract_entities(nlp_lg, slide)
    all_entities.extend(entities_sm)
    all_entities.extend(entities_lg)

# Remove duplicates
unique_entities = list(set(all_entities))

# Print entities for each slide
for slide in slides:
    entities_sm = extract_entities(nlp_sm, slide)
    entities_lg = extract_entities(nlp_lg, slide)
    combined_entities = list(set(entities_sm + entities_lg))
    print(f"Slide: {slide}")
    print(f"Entities: {combined_entities}\n")


Slide: Introduction to Machine Learning
Entities: []

Slide: Supervised Learning
Entities: []

Slide: Unsupervised Learning
Entities: ['Unsupervised Learning']

Slide: Applications of Machine Learning
Entities: ['Applications of Machine Learning']

Slide: Conclusion
Entities: []



# TextRank

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Sample slide content
slides = [
    "Introduction to Machine Learning",
    "Supervised Learning",
    "Unsupervised Learning",
    "Applications of Machine Learning",
    "Conclusion"
]

# Tokenize and preprocess slide content
stop_words = set(stopwords.words('english'))
word_tokens = [word.lower() for slide in slides for word in word_tokenize(slide) if word.isalnum()]
filtered_words = [word for word in word_tokens if word not in stop_words]

# Calculate word frequency
word_freq = Counter(filtered_words)

# Calculate TextRank scores for each word
num_words = len(filtered_words)
word_scores = {word: word_freq[word] / num_words for word in word_freq}

# Extract keywords based on TextRank scores
num_keywords = 5  # Number of keywords to extract
keywords = [word for word, score in sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:num_keywords]]

print("Keywords:", keywords)


Keywords: ['learning', 'machine', 'introduction', 'supervised', 'unsupervised']


# Rake

In [4]:
pip install rake-nltk


Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6
Note: you may need to restart the kernel to use updated packages.


In [1]:
from rake_nltk import Rake

# Sample slide content
slides = [
    "Introduction to Machine Learning",
    "Supervised Learning",
    "Unsupervised Learning",
    "Applications of Machine Learning",
    "Conclusion"
]

# Initialize RAKE
r = Rake()

# Extract keywords from each slide
for slide in slides:
    r.extract_keywords_from_text(slide)
    keywords_with_scores = r.get_ranked_phrases_with_scores()
    print(f"Keywords for slide '{slide}': {keywords_with_scores}")


Keywords for slide 'Introduction to Machine Learning': [(4.0, 'machine learning'), (1.0, 'introduction')]
Keywords for slide 'Supervised Learning': [(4.0, 'supervised learning')]
Keywords for slide 'Unsupervised Learning': [(4.0, 'unsupervised learning')]
Keywords for slide 'Applications of Machine Learning': [(4.0, 'machine learning'), (1.0, 'applications')]
Keywords for slide 'Conclusion': [(1.0, 'conclusion')]


# Topic Modelling - LDA

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Sample slide content
slides = [
    "Introduction to Machine Learning",
    "Supervised Learning",
    "Unsupervised Learning",
    "Applications of Machine Learning",
    "Conclusion"
]

# Convert slides to a document-term matrix
vectorizer = CountVectorizer(stop_words='english')
doc_term_matrix = vectorizer.fit_transform(slides)

# Define the number of topics
num_topics = 2

# Apply LDA
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(doc_term_matrix)

# Get topic-term matrix
topic_term_matrix = lda.components_

# Assign importance scores based on topics
topic_importance = np.sum(topic_term_matrix, axis=1)

# Assign slide numbers based on importance
sorted_indices = topic_importance.argsort()[::-1]
slide_numbers = {slide: i+1 for i, slide in enumerate(sorted_indices)}

# Print slide numbers
for slide, number in slide_numbers.items():
    print(f"Slide '{slides[slide]}' has importance rank: {number}")


Slide 'Introduction to Machine Learning' has importance rank: 1
Slide 'Supervised Learning' has importance rank: 2


# Topic Modelling - NMF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Sample slide content
slides = [
    "Introduction to Machine Learning",
    "Supervised Learning",
    "Unsupervised Learning",
    "Applications of Machine Learning",
    "Conclusion"
]

# Convert slides to TF-IDF matrix
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(slides)

# Define the number of topics
num_topics = 2

# Apply NMF
nmf = NMF(n_components=num_topics, random_state=42)
nmf.fit(tfidf_matrix)

# Get topic-term matrix
topic_term_matrix = nmf.components_

# Assign importance scores based on topics
topic_importance = np.sum(topic_term_matrix, axis=1)

# Assign slide numbers based on importance
sorted_indices = topic_importance.argsort()[::-1]
slide_numbers = {slide: i+1 for i, slide in enumerate(sorted_indices)}

# Print slide numbers
for slide, number in slide_numbers.items():
    print(f"Slide '{slides[slide]}' has importance rank: {number}")


Slide 'Introduction to Machine Learning' has importance rank: 1
Slide 'Supervised Learning' has importance rank: 2


# KeyWords

In [7]:
# Sample slide content
slides = [
    "Introduction to Machine Learning: This slide provides an overview of machine learning concepts.",
    "Supervised Learning: This slide discusses the principles of supervised learning algorithms.",
    "Algorithm: Here, we delve into the details of the K-nearest neighbors algorithm.",
    "Applications: Various applications of machine learning in real-world scenarios are covered in this slide.",
    "Conclusion: A summary of key points and findings is presented in this final slide."
]

# List of keywords
keywords = [
    "Equation", "Formula", "Calculation", "Derivation", "Theorem", "Law", "Principle", "Axiom", "Postulate",
    "Application", "Example", "Illustration", "Use case", "Method", "Procedure", "Algorithm", "Definition",
    "Explanation", "Overview", "Introduction", "Analysis", "Comparison", "Limitation", "Challenge",
    "Result", "Conclusion", "Observation", "Summary"
]

# Function to calculate importance of each slide
def calculate_importance(slide_content):
    importance_score = 0
    for keyword in keywords:
        if keyword.lower() in slide_content.lower():
            importance_score += 1
    return importance_score

# Calculate importance scores for each slide
slide_importance = [calculate_importance(slide) for slide in slides]

# Print importance scores for each slide
for i, importance in enumerate(slide_importance, 1):
    print(f"Importance of Slide {i}: {importance}")


Importance of Slide 1: 2
Importance of Slide 2: 2
Importance of Slide 3: 1
Importance of Slide 4: 1
Importance of Slide 5: 2


In [11]:
pip install python-pptx


Collecting python-pptx
  Downloading python_pptx-0.6.23-py3-none-any.whl (471 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: python-pptx
Successfully installed python-pptx-0.6.23
Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
import pptx
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Function to extract text from PowerPoint slides
def extract_text_from_pptx(pptx_file):
    text = []
    prs = pptx.Presentation(pptx_file)
    for slide in prs.slides:
        slide_text = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_text.append(shape.text)
        text.append("\n".join(slide_text))
    return text

# Function to calculate importance of each slide using LDA
def calculate_importance_lda(slides):
    vectorizer = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(slides)
    lda_model = LatentDirichletAllocation(n_components=2, random_state=42)
    lda_output = lda_model.fit_transform(X)
    importance_scores = np.sum(lda_output, axis=1)
    return importance_scores

# Function to calculate importance of each slide using manual keywords
def calculate_importance_keywords(slides, keywords):
    importance_scores = []
    for slide in slides:
        score = sum(1 for keyword in keywords if keyword.lower() in slide.lower())
        importance_scores.append(score)
    return importance_scores

# Path to the PowerPoint file
pptx_path = "/Users/vaishnavikamisetti/Desktop/NLP.pptx"

# List of keywords
keywords = [
    "Equation", "Formula", "Calculation", "Derivation", "Theorem",
    "Law","Principle", "Axiom", "Postulate",
    "Application", "Example", "Illustration", "Use case", "Method",
    "Procedure", "Algorithm", "Definition",
    "Explanation", "Overview", "Introduction", "Analysis", 
    "Comparison", "Limitation", "Challenge",
    "Result", "Conclusion", "Observation", "Summary"
]

# Extract text from PowerPoint slides
slides_text = extract_text_from_pptx(pptx_path)

# Calculate importance scores for each slide using LDA
importance_scores_lda = calculate_importance_lda(slides_text)

# Calculate importance scores for each slide using manual keywords
importance_scores_keywords = calculate_importance_keywords(slides_text, keywords)

# Combine importance scores from both approaches
hybrid_importance_scores = np.add(importance_scores_lda, importance_scores_keywords)

# Print hybrid importance scores for each slide
for i, importance in enumerate(hybrid_importance_scores, 1):
    print(f"Hybrid Importance of Slide {i}: {importance}")


Hybrid Importance of Slide 1: 1.0
Hybrid Importance of Slide 2: 2.0
Hybrid Importance of Slide 3: 2.0
Hybrid Importance of Slide 4: 1.0000000000000002
Hybrid Importance of Slide 5: 1.0
Hybrid Importance of Slide 6: 1.0
Hybrid Importance of Slide 7: 1.0
Hybrid Importance of Slide 8: 1.0
Hybrid Importance of Slide 9: 1.0
Hybrid Importance of Slide 10: 1.0
Hybrid Importance of Slide 11: 1.0
Hybrid Importance of Slide 12: 1.0
Hybrid Importance of Slide 13: 2.0
Hybrid Importance of Slide 14: 0.9999999999999999
Hybrid Importance of Slide 15: 1.0
Hybrid Importance of Slide 16: 1.0
Hybrid Importance of Slide 17: 1.0
Hybrid Importance of Slide 18: 0.9999999999999999
Hybrid Importance of Slide 19: 1.0
Hybrid Importance of Slide 20: 1.0
Hybrid Importance of Slide 21: 2.0
Hybrid Importance of Slide 22: 1.0
Hybrid Importance of Slide 23: 1.0
Hybrid Importance of Slide 24: 0.9999999999999999
Hybrid Importance of Slide 25: 1.0
Hybrid Importance of Slide 26: 1.0
Hybrid Importance of Slide 27: 1.0
Hybr

In [1]:
import os
import pptx
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Function to extract text from PowerPoint slides
def extract_text_from_pptx(pptx_file):
    text = []
    prs = pptx.Presentation(pptx_file)
    for slide in prs.slides:
        slide_text = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_text.append(shape.text)
        text.append("\n".join(slide_text))
    return text

# Function to calculate importance of each slide using LDA
def calculate_importance_lda(slides):
    vectorizer = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(slides)
    lda_model = LatentDirichletAllocation(n_components=2, random_state=42)
    lda_output = lda_model.fit_transform(X)
    importance_scores = np.sum(lda_output, axis=1)
    return importance_scores

# Function to analyze the structural elements and calculate importance scores
def calculate_importance_structure(slides):
    importance_scores = []
    for slide in slides:
        # Analyze structural elements related to the topic
        num_headings = slide.count("\n\n") if "topic" in slide.lower() else 0  # Assuming two consecutive newlines indicate a heading
        num_bullet_points = slide.count("\n•") if "topic" in slide.lower() else 0  # Counting bullet points
        
        # Calculate importance score based on structural elements
        score = num_headings + num_bullet_points
        importance_scores.append(score)
    return importance_scores

# Manual defined keywords
keywords = [
    "Equation", "Formula", "Calculation", "Derivation", "Theorem", "Law", "Principle", "Axiom", "Postulate",
    "Application", "Example", "Illustration", "Use case", "Method", "Procedure", "Algorithm", "Definition",
    "Explanation", "Overview", "Introduction", "Analysis", "Comparison", "Limitation", "Challenge",
    "Result", "Conclusion", "Observation", "Summary"
]

# Path to the PowerPoint file
pptx_path = "/Users/vaishnavikamisetti/Desktop/Big_data.pptx"

# Extract text from PowerPoint slides
slides_text = extract_text_from_pptx(pptx_path)

# Calculate importance scores for each slide using LDA
importance_scores_lda = calculate_importance_lda(slides_text)

# Calculate importance scores for each slide based on structural elements
importance_scores_structure = calculate_importance_structure(slides_text)

# Calculate importance scores based on manual defined keywords
importance_scores_keywords = []
for slide in slides_text:
    score = sum(1 for keyword in keywords if keyword.lower() in slide.lower())
    importance_scores_keywords.append(score)

# Combine importance scores from LDA, structural analysis, and keywords
hybrid_importance_scores = np.add(np.add(importance_scores_lda, importance_scores_structure), importance_scores_keywords)

# Print hybrid importance scores for each slide
for i, importance in enumerate(hybrid_importance_scores, 1):
    print(f"Hybrid Importance of Slide {i}: {importance}")


Hybrid Importance of Slide 1: 1.0
Hybrid Importance of Slide 2: 1.0
Hybrid Importance of Slide 3: 1.0
Hybrid Importance of Slide 4: 0.9999999999999999
Hybrid Importance of Slide 5: 2.0
Hybrid Importance of Slide 6: 1.0
Hybrid Importance of Slide 7: 2.0
Hybrid Importance of Slide 8: 1.0
Hybrid Importance of Slide 9: 2.0
Hybrid Importance of Slide 10: 1.0
Hybrid Importance of Slide 11: 1.0
Hybrid Importance of Slide 12: 1.0
Hybrid Importance of Slide 13: 2.0
Hybrid Importance of Slide 14: 3.0
Hybrid Importance of Slide 15: 2.0
Hybrid Importance of Slide 16: 1.0
Hybrid Importance of Slide 17: 1.0
Hybrid Importance of Slide 18: 1.0
Hybrid Importance of Slide 19: 2.0
Hybrid Importance of Slide 20: 3.0
