<a href="https://colab.research.google.com/github/Sahil1694/AI-TY-Project-Text-Summarization/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install PyPDF2
!pip install nltk
!pip install scikit-learn
!pip install moviepy
!pip install SpeechRecognition
!pip install ipywidgets
!pip install pydub

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.4-py2.py3-none-any.whl.metadata (28 kB)
Downloading SpeechRecognition-3.10.4-py2.py3-none-any.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.10.4
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Using cached jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: jedi
Successfully installed jedi-0.19.1
Collecting pydub
  Downloa

In [2]:
# Import necessary libraries
import nltk
import numpy as np
import speech_recognition as sr
import moviepy.editor as mp
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import ipywidgets as widgets
from IPython.display import display
from google.colab import files
from pydub import AudioSegment
import PyPDF2


In [3]:
# Download the NLTK resources used by sent_tokenize/word_tokenize and the
# stop-word filter. Recent NLTK releases load the Punkt models from the
# 'punkt_tab' package instead of 'punkt', so fetch both to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Function to preprocess text
def preprocess_text(text):
    """Split ``text`` into sentences and build a filtered copy of each one.

    Args:
        text: Raw input string.

    Returns:
        A ``(sentences, cleaned_sentences)`` tuple of equal-length lists.
        ``sentences`` is the output of ``sent_tokenize``; each entry of
        ``cleaned_sentences`` is the matching sentence lower-cased, tokenized,
        stripped of English stop words and single-character punctuation
        tokens, and re-joined with single spaces.
    """
    original_sentences = sent_tokenize(text)
    ignored_tokens = set(stopwords.words('english') + list(punctuation))

    def _strip_noise(sentence):
        # Lower-case before tokenizing so the stop-word lookup matches.
        tokens = word_tokenize(sentence.lower())
        return ' '.join(token for token in tokens if token not in ignored_tokens)

    return original_sentences, [_strip_noise(sentence) for sentence in original_sentences]

# Function to calculate word frequencies
def calculate_word_frequency(text):
    """Count occurrences of every informative token in ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``; English stop
    words and single-character punctuation tokens are skipped.

    Args:
        text: Raw input string.

    Returns:
        A ``collections.Counter`` mapping each remaining token to its count.
    """
    tokens = word_tokenize(text.lower())
    ignored_tokens = set(stopwords.words('english') + list(punctuation))
    counts = Counter()
    for token in tokens:
        if token in ignored_tokens:
            continue
        counts[token] += 1
    return counts

# Function to calculate cosine similarity
def cosine_similarity_matrix(cleaned_sentences):
    """Return pairwise cosine similarities between sentences.

    Args:
        cleaned_sentences: List of preprocessed sentence strings.

    Returns:
        The result of ``cosine_similarity`` applied to the TF-IDF matrix that a
        fresh ``TfidfVectorizer`` fits on ``cleaned_sentences``.
    """
    tfidf_vectors = TfidfVectorizer().fit_transform(cleaned_sentences)
    return cosine_similarity(tfidf_vectors)

# Function to score sentences
def score_sentences(sentences, cleaned_sentences, word_freq, similarity_matrix, alpha=0.5):
    """Score each sentence by word frequency and similarity to the others.

    For sentence ``i`` the raw score is
    ``alpha * freq_score + (1 - alpha) * sim_score`` where ``freq_score`` is
    the sum of ``word_freq`` counts over the whitespace-separated words of
    ``cleaned_sentences[i]`` and ``sim_score`` is the mean of row ``i`` of
    ``similarity_matrix``. Scores are then divided by the largest score.

    Args:
        sentences: Original sentences; used as the keys of the result.
        cleaned_sentences: Preprocessed sentences, aligned with ``sentences``.
        word_freq: Mapping from word to count (a ``Counter`` or plain dict);
            words that are absent count as 0.
        similarity_matrix: Indexable by row; ``similarity_matrix[i]`` holds the
            similarities of sentence ``i`` to every sentence.
        alpha: Weight of the frequency term; the similarity term gets
            ``1 - alpha``.

    Returns:
        Dict mapping sentence text to its normalised score. Identical
        sentences share one key, so the last occurrence's score wins.
    """
    sentence_scores = {}
    for i, cleaned in enumerate(cleaned_sentences):
        # .get keeps this working for plain dicts, not only Counter.
        freq_score = sum(word_freq.get(word, 0) for word in cleaned.split())
        sim_score = np.mean(similarity_matrix[i])
        sentence_scores[sentences[i]] = alpha * freq_score + (1 - alpha) * sim_score

    # Normalise to a maximum of 1. Skip when the maximum is not positive:
    # dividing by zero would turn every score into NaN (or raise).
    max_score = max(sentence_scores.values(), default=0)
    if max_score > 0:
        for sentence in sentence_scores:
            sentence_scores[sentence] /= max_score

    return sentence_scores

# Function to generate summary based on length preference
def generate_summary(text, length_mode="medium"):
    """Build an extractive summary of ``text``.

    Sentences are scored with ``score_sentences`` and the highest-scoring ones
    are kept. The kept sentences are returned in the order they appear in the
    source text so the summary reads coherently.

    Args:
        text: Raw input string.
        length_mode: ``"low"`` keeps one fifth of the sentences, ``"medium"``
            one third; any other value keeps one half. At least one sentence
            is always kept when the text contains any.

    Returns:
        The selected sentences joined with single spaces, or ``""`` when no
        sentence could be extracted from ``text``.
    """
    sentences, cleaned_sentences = preprocess_text(text)
    if not sentences:
        # Nothing to rank (e.g. empty upload or a PDF without extractable text);
        # the TF-IDF step would otherwise fail on an empty corpus.
        return ""

    word_freq = calculate_word_frequency(text)
    try:
        similarity_matrix = cosine_similarity_matrix(cleaned_sentences)
    except ValueError:
        # TfidfVectorizer raises ValueError when it ends up with an empty
        # vocabulary (e.g. only stop words); fall back to "no similarity".
        size = len(cleaned_sentences)
        similarity_matrix = np.zeros((size, size))
    sentence_scores = score_sentences(sentences, cleaned_sentences, word_freq, similarity_matrix)

    ranked_sentences = sorted(sentence_scores.items(), key=lambda item: item[1], reverse=True)

    # Fraction of sentences to keep: 1/5 (low), 1/3 (medium), otherwise 1/2.
    divisor = {"low": 5, "medium": 3}.get(length_mode, 2)
    top_n = max(1, len(sentences) // divisor)

    chosen = {sentence for sentence, _ in ranked_sentences[:top_n]}

    # Emit the chosen sentences in document order, each at most once.
    summary_sentences = []
    for sentence in sentences:
        if sentence in chosen:
            summary_sentences.append(sentence)
            chosen.discard(sentence)

    return ' '.join(summary_sentences)

# Function to extract audio from video
def extract_audio_from_video(video_path, audio_path="extracted_audio.wav"):
    """Write the audio track of a video file to ``audio_path``.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination of the extracted audio. Defaults to
            ``"extracted_audio.wav"`` in the current working directory.

    Returns:
        ``audio_path``.

    Raises:
        ValueError: If the clip exposes no audio track (``clip.audio`` is None).
    """
    video = mp.VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Always release the clip's underlying reader resources.
        video.close()
    return audio_path

# Function to convert audio to text
def audio_to_text(audio_path):
    """Transcribe an audio file with ``Recognizer.recognize_google``.

    Args:
        audio_path: Path of the audio file to open with ``sr.AudioFile``.

    Returns:
        The recognised text. Recognition failures are not raised: an
        ``sr.UnknownValueError`` or ``sr.RequestError`` is reported by
        returning a human-readable message string instead.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(audio_path) as clip:
        recording = engine.record(clip)

    try:
        transcript = engine.recognize_google(recording)
    except sr.UnknownValueError:
        return "Speech Recognition could not understand the audio"
    except sr.RequestError as err:
        return f"Could not request results from Google Speech Recognition service; {err}"
    else:
        return transcript

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines. Pages for which
        ``extract_text()`` yields nothing are skipped, so the result is ``""``
        when no page has extractable text.
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()
            # Skip None/empty results instead of failing on string concatenation.
            if page_text:
                page_texts.append(page_text)
    # A newline between pages keeps the last word of one page from fusing
    # with the first word of the next.
    return "\n".join(page_texts)

# Function to handle text/PDF file upload
def upload_text_file(b):
    """Button callback: upload text/PDF files and show a summary.

    Prompts for an upload via ``files.upload()``, reads each uploaded ``.txt``
    or ``.pdf`` file (extension matched case-insensitively) and writes the
    result to the module-level ``summary_output`` widget, using the length
    mode currently selected in ``summary_length_slider``. When several files
    are uploaded, the widget ends up showing the result for the last one.

    Args:
        b: The clicked button, as passed by ``Button.on_click``; unused.
    """
    uploaded = files.upload()
    for filename in uploaded.keys():
        lowered = filename.lower()
        if lowered.endswith('.txt'):
            with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
                text = file.read()
        elif lowered.endswith('.pdf'):
            text = extract_text_from_pdf(filename)
        else:
            # Tell the user instead of silently ignoring the file.
            summary_output.value = f"Unsupported file type: {filename} (expected .txt or .pdf)"
            continue

        if not text or not text.strip():
            # e.g. an empty file or a scanned PDF without a text layer.
            summary_output.value = f"No text could be extracted from {filename}"
            continue

        summary_output.value = generate_summary(text, length_mode=summary_length_slider.value)

# Function to handle video file upload
def upload_video_file(b):
    """Button callback: upload video files, transcribe them and summarise.

    For each file returned by ``files.upload()`` the audio is extracted,
    converted to text, summarised with the length mode selected in the
    module-level ``summary_length_slider`` and written to ``summary_output``.

    Args:
        b: The clicked button, as passed by ``Button.on_click``; unused.
    """
    for video_name in files.upload():
        transcript = audio_to_text(extract_audio_from_video(video_name))
        summary_output.value = generate_summary(
            transcript, length_mode=summary_length_slider.value
        )

# Function to create the GUI
def create_gui():
    """Assemble the summarizer widgets and display them.

    Side effects:
        Binds the module-level names ``summary_output`` (a ``Textarea``) and
        ``summary_length_slider`` (a ``RadioButtons`` group, despite the name),
        wires the two upload buttons to ``upload_text_file`` and
        ``upload_video_file``, and passes the resulting ``VBox`` to ``display``.
    """
    global summary_output, summary_length_slider

    def _upload_button(label, style):
        # Both upload buttons share the same size; only label and colour differ.
        return widgets.Button(
            description=label,
            button_style=style,
            layout={'width': '90%', 'height': '50px'}
        )

    # Static title and one-line usage hint.
    title_html = widgets.HTML(
        value="<h2 style='text-align: center;'>Enhanced Text Summarization Tool</h2>"
    )
    intro_html = widgets.HTML(
        value="<p style='text-align: center;'>Upload a text, PDF, or video file to generate a summary.</p>"
    )

    # Text area the upload callbacks write the summary into.
    summary_output = widgets.Textarea(
        value='',
        placeholder='Summary will appear here',
        description='Summary:',
        layout={'width': '90%', 'height': '200px'},
        style={'description_width': 'initial'}
    )

    # Length selector; the option values are the length_mode strings.
    length_choices = [
        ('Low (short summary)', 'low'),
        ('Medium (moderate summary)', 'medium'),
        ('High (detailed summary)', 'high'),
    ]
    summary_length_slider = widgets.RadioButtons(
        options=length_choices,
        value='medium',
        description='Summary Length:',
        layout={'width': '90%'},
        style={'description_width': 'initial'}
    )

    text_button = _upload_button("Upload Text/PDF File", 'info')
    video_button = _upload_button("Upload Video File", 'success')

    # Hook each button up to its upload callback.
    for button, handler in ((text_button, upload_text_file), (video_button, upload_video_file)):
        button.on_click(handler)

    # Stack everything vertically and render it.
    display(widgets.VBox([
        title_html,
        intro_html,
        summary_length_slider,
        text_button,
        video_button,
        summary_output,
    ]))

# Build and render the summarizer UI. This call also binds the module-level
# `summary_output` and `summary_length_slider` widgets that the upload
# callbacks read, so it must run before either button is clicked.
create_gui()

VBox(children=(HTML(value="<h2 style='text-align: center;'>Enhanced Text Summarization Tool</h2>"), HTML(value…