In [3]:
import os
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

# Function to read text from files in a folder
def read_files_from_folder(folder_path, encoding='utf-8'):
    texts = []
    for filename in os.listdir(folder_path):
        filepath = os.path.join(folder_path, filename)
        with open(filepath, 'r', encoding=encoding, errors='ignore') as file:
            text = file.read()
            texts.append(text)
    return texts

# Path to the folder containing lecture slides
slides_folder_path = "/Users/vaishnavikamisetti/Desktop/NLPP"

# Function to train Word2Vec model
def train_word2vec_model(texts):
    # Tokenize the text
    tokenized_corpus = [word_tokenize(text.lower()) for text in texts]

    # Train Word2Vec model
    model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, sg=1)

    return model

# Read lecture slides from the folder
slides_texts = read_files_from_folder(slides_folder_path)

# Train Word2Vec model
word2vec_model = train_word2vec_model(slides_texts)

# Save Word2Vec model
word2vec_model.save("word2vec_model.bin")


In [4]:
import os
import pptx
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Function to extract text from PowerPoint slides
def extract_text_from_pptx(pptx_file):
    text = []
    prs = pptx.Presentation(pptx_file)
    for slide in prs.slides:
        slide_text = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_text.append(shape.text)
        text.append("\n".join(slide_text))
    return text

# Function to calculate importance of each slide using LDA
def calculate_importance_lda(slides):
    vectorizer = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(slides)
    lda_model = LatentDirichletAllocation(n_components=2, random_state=42)
    lda_output = lda_model.fit_transform(X)
    importance_scores = np.sum(lda_output, axis=1)
    return importance_scores

# Function to analyze the structural elements and calculate importance scores
def calculate_importance_structure(slides):
    importance_scores = []
    for slide in slides:
        # Analyze structural elements related to the topic
        num_headings = slide.count("\n\n") if "topic" in slide.lower() else 0  # Assuming two consecutive newlines indicate a heading
        num_bullet_points = slide.count("\n•") if "topic" in slide.lower() else 0  # Counting bullet points
        
        # Calculate importance score based on structural elements
        score = num_headings + num_bullet_points
        importance_scores.append(score)
    return importance_scores

# Function to preprocess text
def preprocess_text(text):
    tokens = word_tokenize(text)
    processed_tokens = [token.lower() for token in tokens if token.isalnum()]
    return processed_tokens

# Function to calculate semantic similarity between a slide and important concepts
def calculate_semantic_similarity(slide, word2vec_model):
    slide_tokens = preprocess_text(slide)
    concept_tokens = list(word2vec_model.wv.key_to_index.keys())
    
    # Check if either slide_tokens or concept_tokens is empty
    if not slide_tokens or not concept_tokens:
        return 0.0  # Return a default value when one of the lists is empty
    
    # Calculate semantic similarity between slide and all concepts
    similarity_scores = [word2vec_model.wv.n_similarity(slide_tokens, [concept]) for concept in concept_tokens]
    avg_similarity_score = np.mean(similarity_scores)
    return avg_similarity_score




# Load Word2Vec model
word2vec_model_path = "word2vec_model.bin"
word2vec_model = Word2Vec.load(word2vec_model_path)

# Manual defined keywords
keywords = [
    "Equation", "Formula", "Calculation", "Derivation", "Theorem", "Law", "Principle", "Axiom", "Postulate",
    "Application", "Example", "Illustration", "Use case", "Method", "Procedure", "Algorithm", "Definition",
    "Explanation", "Overview", "Introduction", "Analysis", "Comparison", "Limitation", "Challenge",
    "Result", "Conclusion", "Observation", "Summary"
]

# Function to calculate importance scores based on semantic analysis
def calculate_importance_semantic(slides, word2vec_model):
    importance_scores_semantic = []
    for slide in slides:
        similarity_score = calculate_semantic_similarity(slide, word2vec_model)
        importance_scores_semantic.append(similarity_score)
    return importance_scores_semantic

# Path to the PowerPoint file
pptx_path = "/Users/vaishnavikamisetti/Desktop/NLP.pptx"

# Extract text from PowerPoint slides
slides_text = extract_text_from_pptx(pptx_path)

# Calculate importance scores for each slide using LDA
importance_scores_lda = calculate_importance_lda(slides_text)

# Calculate importance scores for each slide based on structural elements
importance_scores_structure = calculate_importance_structure(slides_text)

# Calculate importance scores based on manual defined keywords
importance_scores_keywords = []
for slide in slides_text:
    score = sum(1 for keyword in keywords if keyword.lower() in slide.lower())
    importance_scores_keywords.append(score)

# Calculate importance scores based on semantic analysis
importance_scores_semantic = calculate_importance_semantic(slides_text, word2vec_model)

# Combine importance scores from all methods
hybrid_importance_scores = np.add(np.add(np.add(importance_scores_lda, importance_scores_structure), importance_scores_keywords), importance_scores_semantic)

# Print hybrid importance scores for each slide
for i, importance in enumerate(hybrid_importance_scores, 1):
    print(f"Hybrid Importance of Slide {i}: {importance}")

Hybrid Importance of Slide 1: 1.0140997376292944
Hybrid Importance of Slide 2: 2.0181620717048645
Hybrid Importance of Slide 3: 2.018252218142152
Hybrid Importance of Slide 4: 1.01821999065578
Hybrid Importance of Slide 5: 1.0184086859226227
Hybrid Importance of Slide 6: 1.01366254594177
Hybrid Importance of Slide 7: 1.0185625571757555
Hybrid Importance of Slide 8: 1.0185863003134727
Hybrid Importance of Slide 9: 1.0177158005535603
Hybrid Importance of Slide 10: 1.0122740222141147
Hybrid Importance of Slide 11: 1.0180562734603882
Hybrid Importance of Slide 12: 1.0184220410883427
Hybrid Importance of Slide 13: 2.017707847058773
Hybrid Importance of Slide 14: 1.0186246000230312
Hybrid Importance of Slide 15: 1.0186100732535124
Hybrid Importance of Slide 16: 1.0173608493059874
Hybrid Importance of Slide 17: 1.0173679776489735
Hybrid Importance of Slide 18: 1.0175767801702023
Hybrid Importance of Slide 19: 1.0176824629306793
Hybrid Importance of Slide 20: 1.0173386223614216
Hybrid Importan