# Install Dependencies

In [2]:
!pip install beautifulsoup4



In [3]:
!pip install chardet



In [4]:
!pip install nltk



# Import Libraries

In [5]:
from bs4 import BeautifulSoup

In [6]:
import chardet

In [7]:
import re

In [8]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saicharangankidi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
import os

In [10]:
import sys

# Extract Text from HTML Document

In [11]:
def extract_text_from_html(file_path, encoding='ISO-8859-1'):
    """Return the text content of an HTML file as a single string.

    Args:
        file_path: Path of the HTML document to read.
        encoding: Text encoding used to decode the file. Defaults to
            'ISO-8859-1' (the value that used to be hard-coded), so existing
            callers are unaffected; pass e.g. 'utf-8' for files stored in a
            different encoding.

    Returns:
        The document text with markup removed. Each text fragment is
        stripped and the fragments are joined with a single space.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        soup = BeautifulSoup(file, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
    return text

# Clean Text

In [15]:
def clean_text(text):
    """Strip URLs and punctuation from `text` and normalize its whitespace.

    Args:
        text: The string to clean.

    Returns:
        The text without `http...` URLs and without any character that is
        neither a word character nor whitespace, with every whitespace run
        reduced to one space and no leading/trailing whitespace.
    """
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace last: removing a URL or a free-standing punctuation
    # mark (e.g. the dash in "a - b") leaves two adjacent spaces behind, which
    # an earlier collapse would miss.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Normalization

In [37]:
def normalize_text(text):
    """Return a lower-cased copy of `text`."""
    return text.lower()

# Segmentation

In [38]:
def segment_text(text):
    """Split `text` into sentences using NLTK's `sent_tokenize`."""
    return sent_tokenize(text)

# Save Sentences

In [39]:
def save_sentences(sentences, output_path):
    """Write `sentences` to a UTF-8 text file, one sentence per line.

    Args:
        sentences: Iterable of strings; each is written followed by a newline.
        output_path: Destination file. Missing parent directories are
            created. An existing file is overwritten.
    """
    parent_dir = os.path.dirname(output_path)
    # dirname() is '' for a bare filename and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when the path has one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(sentence + '\n' for sentence in sentences)

# Complete Processing

In [40]:
def process_transcripts(root_folder, output_root):
    """Turn every `.html` file below `root_folder` into a sentence file.

    Each HTML file is reduced to text, split into sentences, and every
    sentence is cleaned and then lower-cased. The result is saved as a
    `.txt` file with the same base name, placed under `output_root` at the
    same relative location the source file has under `root_folder`.

    Args:
        root_folder: Directory tree that is searched for `.html` files.
        output_root: Directory tree that receives the `.txt` files.
    """
    for current_dir, _subdirs, filenames in os.walk(root_folder):
        # Same position below output_root as current_dir has below root_folder.
        mirrored_dir = os.path.join(output_root, os.path.relpath(current_dir, root_folder))
        for filename in filenames:
            if not filename.endswith('.html'):
                continue
            raw_text = extract_text_from_html(os.path.join(current_dir, filename))
            prepared = [
                normalize_text(clean_text(sentence))
                for sentence in segment_text(raw_text)
            ]
            base_name = os.path.splitext(filename)[0]
            save_sentences(prepared, os.path.join(mirrored_dir, base_name + '.txt'))
 

# Define Paths

In [41]:
# Folder searched for HTML transcripts, and the folder that receives the
# sentence-per-line text files.
root_folder = 'Data'
output_root = 'Segmented_Sentences'
# exist_ok=True creates the folder only when it is missing, without a separate
# exists() check that could go stale before makedirs() runs.
os.makedirs(output_root, exist_ok=True)

In [17]:
# Run the HTML -> sentence-file conversion over everything under root_folder.
process_transcripts(root_folder, output_root)

In [18]:
pip install spacy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [19]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [20]:
import spacy

# spaCy English pipeline, loaded once and reused by remove_stop_words()
nlp = spacy.load("en_core_web_sm")

def remove_stop_words(text):
    """Return `text` with the tokens spaCy flags as stop words removed.

    The text is run through the module-level `nlp` pipeline; the texts of
    the remaining tokens are joined with single spaces.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)


In [21]:
import os

def process_texts_remove_stopwords(source_dir, target_dir):
    """Copy every `.txt` file below `source_dir` to `target_dir` without stop words.

    The directory layout below `source_dir` is reproduced below
    `target_dir`, and file names are kept unchanged.

    Args:
        source_dir: Directory tree containing the input `.txt` files.
        target_dir: Directory tree that receives the filtered files.
    """
    for current_dir, _subdirs, filenames in os.walk(source_dir):
        # Same position below target_dir as current_dir has below source_dir.
        mirrored_dir = os.path.join(target_dir, os.path.relpath(current_dir, source_dir))
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue

            with open(os.path.join(current_dir, filename), 'r', encoding='utf-8') as src:
                original_text = src.read()
            filtered_text = remove_stop_words(original_text)

            # Create the destination folder lazily, only once a file needs it.
            os.makedirs(mirrored_dir, exist_ok=True)
            with open(os.path.join(mirrored_dir, filename), 'w', encoding='utf-8') as dst:
                dst.write(filtered_text)


In [22]:
# Stop-word removal stage: read the sentence files written to
# 'Segmented_Sentences' and write the filtered copies to 'cleaned_lectures'.
source_directory = 'Segmented_Sentences'
target_directory = 'cleaned_lectures'
process_texts_remove_stopwords(source_directory, target_directory)

In [53]:
!pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
def extract_keywords_tfidf(documents, top_n=50):
    """Return the highest-scoring TF-IDF terms of each document.

    A single `TfidfVectorizer` (English stop words removed) is fitted on all
    of `documents`, so the scores depend on the whole collection passed in.

    Args:
        documents: Iterable of document strings.
        top_n: Maximum number of terms returned per document.

    Returns:
        A list with one entry per document, each a list of up to `top_n`
        terms ordered from highest to lowest score. Returns an empty list
        when `documents` is empty.

    Raises:
        ValueError: Propagated from scikit-learn when the vectorizer cannot
            build a vocabulary from the documents.
    """
    documents = list(documents)
    if not documents:
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf = vectorizer.fit_transform(documents).tocsr()
    feature_names = vectorizer.get_feature_names_out()

    keywords_list = []
    for doc_id in range(tfidf.shape[0]):
        # Read the row's stored entries straight from the CSR arrays instead
        # of indexing the sparse matrix once per term, which is far slower.
        start, end = tfidf.indptr[doc_id], tfidf.indptr[doc_id + 1]
        scored_terms = [
            (feature_names[col], score)
            for col, score in zip(tfidf.indices[start:end], tfidf.data[start:end])
            if score != 0
        ]
        # list.sort is stable, so equally scored terms keep their column order.
        scored_terms.sort(key=lambda item: item[1], reverse=True)
        keywords_list.append([term for term, _score in scored_terms[:top_n]])
    return keywords_list


In [42]:
!pip install rake-nltk

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl.metadata (6.4 kB)
Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [55]:
import os

def process_and_extract_keywords(source_dir, target_dir, top_n_keywords=50):
    """Write the top TF-IDF keywords of every `.txt` file below `source_dir`.

    All files are collected first and scored together in one call to
    `extract_keywords_tfidf`. Scoring each file on its own, as was done
    before, makes the inverse-document-frequency term identical for every
    word, so the ranking reduces to plain term frequency.

    For each `<name>.txt` a `<name>_keywords.txt` file (one keyword per
    line) is written to the matching location below `target_dir`. An output
    directory is created for every directory visited, including ones that
    contain no `.txt` file.

    Args:
        source_dir: Directory tree containing the input `.txt` files.
        target_dir: Directory tree that receives the keyword files.
        top_n_keywords: Maximum number of keywords written per file.
    """
    documents = []
    output_paths = []

    for root, _dirs, files in os.walk(source_dir):
        # Same position below target_dir as root has below source_dir.
        rel_path = os.path.relpath(root, source_dir)
        output_dir = os.path.join(target_dir, rel_path)
        os.makedirs(output_dir, exist_ok=True)

        for file in files:
            if not file.endswith('.txt'):
                continue
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                documents.append(f.read())
            output_paths.append(
                os.path.join(output_dir, os.path.splitext(file)[0] + '_keywords.txt')
            )

    # Nothing to score; also avoids fitting the vectorizer on an empty corpus.
    if not documents:
        return

    keywords_per_document = extract_keywords_tfidf(documents, top_n=top_n_keywords)
    for output_file_path, keywords in zip(output_paths, keywords_per_document):
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(keywords))



In [56]:
# TF-IDF keyword stage: read the stop-word-filtered files from
# 'cleaned_lectures' and write one *_keywords.txt per file under 'keywords'.
# Note: source_directory / target_directory are rebound here; other cells
# assign the same two names to different folders.
source_directory = 'cleaned_lectures'
target_directory = 'keywords'

process_and_extract_keywords(source_directory, target_directory)


In [48]:

import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saicharangankidi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [64]:
from rake_nltk import Rake
import os
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# NLTK's English stop words plus a few extra filler words; this set is the
# stop-word list handed to RAKE.
extended_stopwords = set(stopwords.words('english')) | {
    'got', 'like', 'just', 'know', 'think', 'see', 'really', 'said',
}

# Preprocess the document to remove numbers
def preprocess_document(document):
    """Return `document` with every standalone run of digits deleted.

    Only digit runs delimited by word boundaries are removed, so digits
    embedded in a word (e.g. "b3") are kept. The whitespace around a
    removed number is left as it was.
    """
    standalone_number = re.compile(r'\b\d+\b')
    return standalone_number.sub('', document)

def extract_keywords_rake(document, top_n=50, min_length=4, max_length=20, min_words=2, max_words=4):
    """Return the top RAKE key phrases of `document`.

    Standalone numbers are removed from the document first. RAKE is
    configured with `extended_stopwords`, and its ranked phrases are then
    filtered by word count.

    Args:
        document: Text to extract key phrases from.
        top_n: Maximum number of phrases returned.
        min_length: Passed to `Rake` unchanged.
        max_length: Passed to `Rake` unchanged.
            NOTE(review): rake-nltk appears to interpret min_length /
            max_length as a number of words per phrase, which would overlap
            with min_words / max_words below - confirm against its docs.
        min_words: Smallest number of whitespace-separated words a phrase
            may have to be kept.
        max_words: Largest number of whitespace-separated words a phrase
            may have to be kept.

    Returns:
        Up to `top_n` phrases, in the order RAKE ranked them.
    """
    rake = Rake(stopwords=extended_stopwords, min_length=min_length, max_length=max_length)
    rake.extract_keywords_from_text(preprocess_document(document))

    selected_phrases = []
    for _score, phrase in rake.get_ranked_phrases_with_scores():
        word_count = len(phrase.split())
        if min_words <= word_count <= max_words:
            selected_phrases.append(phrase)

    return selected_phrases[:top_n]


def process_and_extract_keywords_rake(source_dir, target_dir, top_n_keywords=50, min_length=3, max_length=20, min_words=1, max_words=3):
    """Write RAKE key phrases for every `.txt` file below `source_dir`.

    For each `<name>.txt` a `<name>_keywords.txt` file (one phrase per line)
    is written to the matching location below `target_dir`. An output
    directory is created for every directory visited, including ones that
    contain no `.txt` file.

    Args:
        source_dir: Directory tree containing the input `.txt` files.
        target_dir: Directory tree that receives the keyword files.
        top_n_keywords: Forwarded to `extract_keywords_rake` as `top_n`.
        min_length, max_length, min_words, max_words: Forwarded unchanged
            to `extract_keywords_rake`.
    """
    for current_dir, _subdirs, filenames in os.walk(source_dir):
        mirrored_dir = os.path.join(target_dir, os.path.relpath(current_dir, source_dir))
        os.makedirs(mirrored_dir, exist_ok=True)

        for filename in filenames:
            if not filename.endswith('.txt'):
                continue

            with open(os.path.join(current_dir, filename), 'r', encoding='utf-8') as src:
                content = src.read()

            phrases = extract_keywords_rake(
                content,
                top_n=top_n_keywords,
                min_length=min_length,
                max_length=max_length,
                min_words=min_words,
                max_words=max_words,
            )

            keyword_file = os.path.join(mirrored_dir, os.path.splitext(filename)[0] + '_keywords.txt')
            with open(keyword_file, 'w', encoding='utf-8') as dst:
                dst.write('\n'.join(phrases))

# RAKE keyword stage: read the stop-word-filtered files and write one
# *_keywords.txt per file. The call below relies on the function's default
# top_n_keywords / length / word-count settings.
source_directory = 'cleaned_lectures'  # Folder with the input .txt files; update as necessary
target_directory = 'keywords_rake'  # Folder that receives the keyword files; update as necessary

process_and_extract_keywords_rake(source_directory, target_directory)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saicharangankidi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
import spacy

# spaCy English pipeline used for entity recognition
nlp = spacy.load("en_core_web_sm")

def perform_ner(keywords):
    """Run spaCy named-entity recognition over a list of keywords.

    The keywords are joined with single spaces into one string before it
    is passed to the `nlp` pipeline.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A list of `(entity_text, entity_label)` tuples, one per entity
        found in the joined text.
    """
    joined_keywords = " ".join(keywords)
    found = []
    for ent in nlp(joined_keywords).ents:
        found.append((ent.text, ent.label_))
    return found


In [24]:
import os

def process_keyword_files_for_ner(source_dir, target_dir):
    """Run NER on every `*_keywords.txt` file below `source_dir`.

    Each keyword file is read as one keyword per line. The entities found
    are written to `<original base name>_entities.txt` at the matching
    location below `target_dir`, one `text (LABEL)` entry per line.

    Args:
        source_dir: Directory tree containing the keyword files.
        target_dir: Directory tree that receives the entity files.
    """
    for current_dir, _subdirs, filenames in os.walk(source_dir):
        mirrored_dir = os.path.join(target_dir, os.path.relpath(current_dir, source_dir))
        for filename in filenames:
            # Only files produced by the keyword stages carry this suffix.
            if not filename.endswith('_keywords.txt'):
                continue

            with open(os.path.join(current_dir, filename), 'r', encoding='utf-8') as src:
                keyword_lines = src.read().splitlines()

            recognized = perform_ner(keyword_lines)

            # Create the destination folder lazily, only once a file needs it.
            os.makedirs(mirrored_dir, exist_ok=True)
            entity_file = os.path.join(mirrored_dir, os.path.splitext(filename)[0] + '_entities.txt')
            with open(entity_file, 'w', encoding='utf-8') as dst:
                dst.writelines(f"{entity} ({label})\n" for entity, label in recognized)


In [25]:

# NER stage: read the *_keywords.txt files under 'keywords' and write the
# recognized entities to *_entities.txt files under 'NER'.
source_directory = 'keywords'
target_directory = 'NER'

process_keyword_files_for_ner(source_directory, target_directory)


In [17]:
from summa import keywords as summa_keywords

def extract_keywords_textrank(text, top_n=10):
    """Return up to `top_n` keywords of `text` using summa's TextRank.

    summa is first asked for exactly `top_n` keywords. If that raises
    `IndexError`, it is called again without a keyword count and the result
    is truncated to `top_n`.

    Args:
        text: Text to extract keywords from.
        top_n: Maximum number of keywords returned.

    Returns:
        Keywords ordered from highest to lowest score.
    """
    def _best(scored_keywords):
        # Highest score first, then drop the scores and cap the length.
        ranked = sorted(scored_keywords, key=lambda pair: pair[1], reverse=True)
        return [keyword for keyword, _score in ranked][:top_n]

    try:
        return _best(summa_keywords.keywords(text, words=top_n, split=True, scores=True))
    except IndexError:
        return _best(summa_keywords.keywords(text, split=True, scores=True))

def main(file_path='/Users/saicharangankidi/Desktop/shruthiProject/Sentences/afm162/transcripts/transcript01.txt'):
    """Print the top 10 TextRank keywords of one transcript file.

    Args:
        file_path: UTF-8 text file to analyse. The default is the path that
            used to be hard-coded in the body; it is an absolute path on the
            original author's machine, so pass your own path when running
            this elsewhere.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text_input = file.read()

    extracted_keywords = extract_keywords_textrank(text_input, top_n=10)

    print("Extracted Keywords:")
    for keyword in extracted_keywords:
        print(keyword)

# Inside a notebook kernel __name__ is "__main__", so this runs when the cell
# is executed.
if __name__ == "__main__":
    main()


Extracted Keywords:
beauty
beautiful
beautifully
civil
civilization
civilized
sexuality
sexual
sexually
united
