In [1]:
# Run this cell to import necessary libraries
import os
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\biswa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\biswa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\biswa\AppData\Roaming\nltk_data...


True

In [4]:
import os
import pandas as pd

# Define the main directory path
path = r"E:\Infosys project\BBC News Summary\News Articles"

data = []

# Loop through each subdirectory in the main directory
for folder in os.listdir(path):
    folder_path = os.path.join(path, folder)
    if os.path.isdir(folder_path):  # Check if it's a directory
        # Loop through each file in the subdirectory
        for filename in os.listdir(folder_path):
            if filename.endswith(".txt"):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
                    content = file.read()
                    data.append({"filename": filename, "content": content})

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Display the DataFrame
df.head()


Unnamed: 0,filename,content
0,001.txt,Ad sales boost Time Warner profit\n\nQuarterly...
1,002.txt,Dollar gains on Greenspan speech\n\nThe dollar...
2,003.txt,Yukos unit buyer faces loan claim\n\nThe owner...
3,004.txt,High fuel prices hit BA's profits\n\nBritish A...
4,005.txt,Pernod takeover talk lifts Domecq\n\nShares in...


In [5]:
# Check the number of documents and view a sample
print("Number of documents:", len(df))
print("Sample content:\n", df['content'].iloc[0])


Number of documents: 2225
Sample content:
 Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner int

In [6]:
# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Define cleaning functions
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove single characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

def remove_stop_words(text):
    tokens = word_tokenize(text)
    return ' '.join([word for word in tokens if word not in stop_words])

def lemmatize_text(text):
    tokens = word_tokenize(text)
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens])

# Apply the functions
df['cleaned_content'] = df['content'].apply(clean_text).apply(remove_stop_words).apply(lemmatize_text)
df[['filename', 'cleaned_content']].head()


Unnamed: 0,filename,cleaned_content
0,001.txt,ad sale boost time warner profit quarterly pro...
1,002.txt,dollar gain greenspan speech dollar hit highes...
2,003.txt,yukos unit buyer face loan claim owner embattl...
3,004.txt,high fuel price hit ba profit british airway b...
4,005.txt,pernod takeover talk lift domecq share uk drin...


In [7]:
# Tokenize into sentences for summarization at the sentence level
df['sentences'] = df['cleaned_content'].apply(sent_tokenize)
df[['filename', 'sentences']].head()


Unnamed: 0,filename,sentences
0,001.txt,[ad sale boost time warner profit quarterly pr...
1,002.txt,[dollar gain greenspan speech dollar hit highe...
2,003.txt,[yukos unit buyer face loan claim owner embatt...
3,004.txt,[high fuel price hit ba profit british airway ...
4,005.txt,[pernod takeover talk lift domecq share uk dri...


In [8]:
# Convert cleaned text to TF-IDF matrix
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['cleaned_content'])
print("TF-IDF Matrix shape:", tfidf_matrix.shape)


TF-IDF Matrix shape: (2225, 26243)


In [9]:
# Save the DataFrame to a CSV file for later use
df.to_csv("processed_bbc_news_summary.csv", index=False)


In [1]:
import pandas as pd

# Load preprocessed data
df = pd.read_csv("processed_bbc_news_summary.csv")
print(df.head())


  filename                                            content  \
0  001.txt  Ad sales boost Time Warner profit\n\nQuarterly...   
1  002.txt  Dollar gains on Greenspan speech\n\nThe dollar...   
2  003.txt  Yukos unit buyer faces loan claim\n\nThe owner...   
3  004.txt  High fuel prices hit BA's profits\n\nBritish A...   
4  005.txt  Pernod takeover talk lifts Domecq\n\nShares in...   

                                     cleaned_content  \
0  ad sale boost time warner profit quarterly pro...   
1  dollar gain greenspan speech dollar hit highes...   
2  yukos unit buyer face loan claim owner embattl...   
3  high fuel price hit ba profit british airway b...   
4  pernod takeover talk lift domecq share uk drin...   

                                           sentences  
0  ['ad sale boost time warner profit quarterly p...  
1  ['dollar gain greenspan speech dollar hit high...  
2  ['yukos unit buyer face loan claim owner embat...  
3  ['high fuel price hit ba profit british airway...

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust `max_features` based on your dataset
tfidf_matrix = vectorizer.fit_transform(df['content'])

# Check the shape of the TF-IDF matrix
print("TF-IDF Matrix shape:", tfidf_matrix.shape)

TF-IDF Matrix shape: (2225, 5000)


In [7]:
from nltk.tokenize import sent_tokenize

df['sentences'] = df['content'].apply(sent_tokenize)
print(df['sentences'][0])  # Example: print sentences from the first document


['Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.', 'The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales.', 'TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.', 'Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.', 'Time Warner said on Friday that it now owns 8% of search-engine Google.', 'But its own internet business, AOL, had has mixed fortunes.', 'It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters.', "However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.", "It hopes to increase subscribers by offering the online service free to TimeWarner internet customers a

In [8]:
import numpy as np

def sentence_scores(content, vectorizer, tfidf_matrix):
    scores = []
    sentences = sent_tokenize(content)
    tfidf_array = tfidf_matrix.toarray()
    for sentence in sentences:
        words = sentence.split()
        score = np.mean([vectorizer.vocabulary_.get(word.lower(), 0) for word in words])
        scores.append((sentence, score))
    return sorted(scores, key=lambda x: x[1], reverse=True)

# Example: Calculate scores for the first document
first_doc_scores = sentence_scores(df['content'][0], vectorizer, tfidf_matrix[0])
print(first_doc_scores[:5])  # Top 5 scored sentences


[('It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters.', 2867.0), ('The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m.', 2774.8333333333335), ('It will now book the sale of its stake in AOL Europe as a loss on the value of that stake.', 2700.5238095238096), ('Time Warner said on Friday that it now owns 8% of search-engine Google.', 2667.3076923076924), ("It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband.", 2572.4814814814813)]


In [9]:
def extract_summary(content, vectorizer, tfidf_matrix, n_sentences=3):
    scored_sentences = sentence_scores(content, vectorizer, tfidf_matrix)
    top_sentences = sorted(scored_sentences[:n_sentences], key=lambda x: content.index(x[0]))
    summary = " ".join([sentence for sentence, score in top_sentences])
    return summary

# Example: Summarize the first document
summary = extract_summary(df['content'][0], vectorizer, tfidf_matrix[0], n_sentences=3)
print("Summary:", summary)


Summary: It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. The company said it was unable to estimate the amount it needed to set aside for legal reserves, which it previously set at $500m. It will now book the sale of its stake in AOL Europe as a loss on the value of that stake.


In [10]:
df['summary'] = df['content'].apply(lambda x: extract_summary(x, vectorizer, tfidf_matrix, n_sentences=3))
print(df[['content', 'summary']].head())


                                             content  \
0  Ad sales boost Time Warner profit\n\nQuarterly...   
1  Dollar gains on Greenspan speech\n\nThe dollar...   
2  Yukos unit buyer faces loan claim\n\nThe owner...   
3  High fuel prices hit BA's profits\n\nBritish A...   
4  Pernod takeover talk lifts Domecq\n\nShares in...   

                                             summary  
0  It lost 464,000 subscribers in the fourth quar...  
1  Dollar gains on Greenspan speech\n\nThe dollar...  
2  Legal experts said Rosneft's purchase of Yugan...  
3  However, it said sales would be better than pr...  
4  Reports in the Wall Street Journal and the Fin...  


In [11]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (pyproject.toml): started
  Building wheel for rouge-score (pyproject.toml): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24970 sha256=1efab56e5985adf8c083b0dd3ef1244dd3ab202bb32eab85caaab1a862ee58fa
  Stored in directory: c:\users\biswa\appdata\local\pip\cache\wheels\1e\19\43\8a442dc83660ca25e16


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
df.to_csv("extractive_summaries.csv", index=False)

In [None]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
for i in range(len(df)):
    scores = scorer.score(df['reference_summary'][i], df['summary'][i])
    print(scores)


In [19]:
# Reload the updated CSV with reference summaries
df = pd.read_csv("extractive_summaries.csv")


In [21]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("extractive_summaries.csv")

# Display the first few rows
print(df.head())

# Check if the `summary` column contains meaningful data
print("Summary column examples:")
for i in range(5):
    print(f"Original: {df['content'][i]}")
    print(f"Summary: {df['summary'][i]}")
    print("-" * 50)


  filename                                            content  \
0  001.txt  Ad sales boost Time Warner profit\n\nQuarterly...   
1  002.txt  Dollar gains on Greenspan speech\n\nThe dollar...   
2  003.txt  Yukos unit buyer faces loan claim\n\nThe owner...   
3  004.txt  High fuel prices hit BA's profits\n\nBritish A...   
4  005.txt  Pernod takeover talk lifts Domecq\n\nShares in...   

                                     cleaned_content  \
0  ad sale boost time warner profit quarterly pro...   
1  dollar gain greenspan speech dollar hit highes...   
2  yukos unit buyer face loan claim owner embattl...   
3  high fuel price hit ba profit british airway b...   
4  pernod takeover talk lift domecq share uk drin...   

                                           sentences  \
0  ['Ad sales boost Time Warner profit\n\nQuarter...   
1  ['Dollar gains on Greenspan speech\n\nThe doll...   
2  ['Yukos unit buyer faces loan claim\n\nThe own...   
3  ["High fuel prices hit BA's profits\n\nBritis

In [22]:
# Check word counts
df['content_word_count'] = df['content'].apply(lambda x: len(str(x).split()))
df['summary_word_count'] = df['summary'].apply(lambda x: len(str(x).split()))

# Display word counts
print(df[['content_word_count', 'summary_word_count']].head())


   content_word_count  summary_word_count
0                 421                  62
1                 384                  71
2                 264                  65
3                 406                  47
4                 265                  72


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(original, summarized):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([original, summarized])
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

# Calculate similarity for the first 5 rows
for i in range(5):
    similarity = calculate_similarity(df['content'][i], df['summary'][i])
    print(f"Similarity for row {i}: {similarity}")


Similarity for row 0: 0.6257514562660176
Similarity for row 1: 0.7361349546279353
Similarity for row 2: 0.6497413573733606
Similarity for row 3: 0.3738777690819517
Similarity for row 4: 0.6446343057040622


In [39]:
import os

# Directory path
directory_path =  r"E:\Infosys project\BBC News Summary\Summaries"

# Count the number of text files in the directory
file_count = sum(len(files) for _, _, files in os.walk(directory_path))

# Compare with the number of rows in the CSV
csv_row_count = len(df)

print(f"Files in directory: {file_count}")
print(f"Rows in CSV: {csv_row_count}")

if file_count == csv_row_count:
    print("All files successfully processed!")
else:
    print(f"Mismatch: {file_count - csv_row_count} files were not processed.")


Files in directory: 2225
Rows in CSV: 2225
All files successfully processed!


In [40]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Evaluate summaries against reference summaries
if 'reference_summary' in df.columns:
    for i in range(5):  # Adjust for more rows
        scores = scorer.score(df['reference_summary'][i], df['summary'][i])
        print(f"ROUGE scores for row {i}: {scores}")
else:
    print("Reference summaries not found. ROUGE evaluation skipped.")


Reference summaries not found. ROUGE evaluation skipped.


In [26]:
# Save validation results to a file
validation_results = {
    "files_in_directory": file_count,
    "rows_in_csv": csv_row_count,
    "status": "Success" if file_count == csv_row_count else "Mismatch",
}

# Save as a JSON file
import json
with open("validation_results.json", "w") as log_file:
    json.dump(validation_results, log_file)

print("Validation results saved!")


Validation results saved!


In [31]:
import pandas as pd

# Load the saved dataset
data = pd.read_csv("extractive_summaries.csv")

def summarize_text(input_text):
    """
    Function to summarize text using the preprocessed dataset.
    """
    # Implement similarity or TF-IDF ranking logic here, or use the saved summaries
    # For simplicity, we fetch summaries directly from the CSV.
    # You can replace this with your summarization algorithm.
    matched_row = data[data['content'].str.contains(input_text[:100], na=False)]
    if not matched_row.empty:
        return matched_row.iloc[0]['summary']
    return "No relevant summary found."

# Example usage
input_text = "Type or paste a piece of text here for summarization..."
summary = summarize_text(input_text)
print(f"Summary:\n{summary}")


Summary:
No relevant summary found.


In [1]:
import tkinter as tk
from tkinter import Text

def summarize():
    input_text = text_input.get("1.0", "end-1c")
    summary = summarize_text(input_text)
    result_label.config(text=f"Summary:\n{summary}")

# Create the GUI
root = tk.Tk()
root.title("Text Summarization")

# Input Text Area
text_input = Text(root, height=10, width=50)
text_input.pack(pady=10)

# Summarize Button
summarize_button = tk.Button(root, text="Summarize", command=summarize)
summarize_button.pack(pady=5)

# Output Label
result_label = tk.Label(root, text="", wraplength=400, justify="left")
result_label.pack(pady=10)

root.mainloop()


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\biswa\AppData\Local\Programs\Python\Python311\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\biswa\AppData\Local\Temp\ipykernel_28044\612476297.py", line 6, in summarize
    summary = summarize_text(input_text)
              ^^^^^^^^^^^^^^
NameError: name 'summarize_text' is not defined
Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\biswa\AppData\Local\Programs\Python\Python311\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\biswa\AppData\Local\Temp\ipykernel_28044\612476297.py", line 6, in summarize
    summary = summarize_text(input_text)
              ^^^^^^^^^^^^^^
NameError: name 'summarize_text' is not defined
Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\biswa\AppData\Local\Programs

In [33]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.40.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Downloading cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting protobuf<6,>=3.20 (from streamlit)
  Downloading protobuf-5.29.0-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-18.1.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting rich<14,>=10.14.0 (from streamlit)
  Downloading rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-win_amd64.whl.metadata (44 kB)
Collecting gi


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
import streamlit as st
import pandas as pd

# Load data
data = pd.read_csv("extractive_summaries.csv")

def summarize_text(input_text):
    matched_row = data[data['content'].str.contains(input_text[:100], na=False)]
    if not matched_row.empty:
        return matched_row.iloc[0]['summary']
    return "No relevant summary found."

# GUI Layout
st.title("Text Summarization App")
st.subheader("Enter the text below:")

input_text = st.text_area("Your Text:")
if st.button("Summarize"):
    summary = summarize_text(input_text)
    st.subheader("Summary:")
    st.write(summary)


2024-11-28 13:10:52.475 
  command:

    streamlit run C:\Users\biswa\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-11-28 13:10:52.484 Session state does not function when running a script without `streamlit run`


In [35]:
[theme]
primaryColor = "#4CAF50"
backgroundColor = "#FFFFFF"
secondaryBackgroundColor = "#F5F5F5"
textColor = "#000000"
font = "sans serif"


NameError: name 'theme' is not defined

In [38]:
import os
import pandas as pd

# Load system-generated summaries
system_summaries_path = "extractive_summaries.csv"
system_df = pd.read_csv(system_summaries_path)

# Path to reference summaries
reference_summaries_path = r"E:\Infosys project\BBC News Summary\Summaries"

# Read all reference summaries
reference_data = []
for root, _, files in os.walk(reference_summaries_path):
    for file in files:
        file_path = os.path.join(root, file)
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
            reference_data.append({"filename": file, "reference_summary": content})

# Create a DataFrame for reference summaries
reference_df = pd.DataFrame(reference_data)

In [41]:
# Ensure filenames in system_df match those in reference_df
system_df['filename'] = system_df['filename'].str.strip()
reference_df['filename'] = reference_df['filename'].str.strip()

# Merge both DataFrames based on the filename
merged_df = pd.merge(system_df, reference_df, on="filename")


In [42]:
pip install rouge-score

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [43]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Evaluate ROUGE scores for each summary pair
results = []
for _, row in merged_df.iterrows():
    system_summary = row['summary']
    reference_summary = row['reference_summary']

    # Compute ROUGE scores
    scores = scorer.score(reference_summary, system_summary)
    results.append(scores)

# Convert results to a DataFrame for analysis
rouge_results_df = pd.DataFrame(results)

# Display results
print(rouge_results_df.head())


                                              rouge1  \
0  (0.5714285714285714, 0.23529411764705882, 0.33...   
1   (0.2857142857142857, 0.18556701030927836, 0.225)   
2  (0.42857142857142855, 0.1323529411764706, 0.20...   
3  (0.2857142857142857, 0.1875, 0.22641509433962265)   
4  (0.3333333333333333, 0.0707070707070707, 0.116...   

                                              rouge2  \
0  (0.3225806451612903, 0.13157894736842105, 0.18...   
1  (0.03225806451612903, 0.020833333333333332, 0....   
2  (0.04838709677419355, 0.014778325123152709, 0....   
3                                    (0.0, 0.0, 0.0)   
4  (0.03225806451612903, 0.006756756756756757, 0....   

                                              rougeL  
0  (0.2857142857142857, 0.11764705882352941, 0.16...  
1   (0.1746031746031746, 0.1134020618556701, 0.1375)  
2  (0.2698412698412698, 0.08333333333333333, 0.12...  
3  (0.15873015873015872, 0.10416666666666667, 0.1...  
4  (0.25396825396825395, 0.05387205387205387, 0.0..

In [46]:
import pandas as pd

# Example of how `rouge_results_df` might look
rouge_results_df = pd.DataFrame({
    "rouge-1": [(0.5, 0.4, 0.6), (0.6, 0.5, 0.7)],
    "rouge-2": [(0.3, 0.2, 0.4), (0.4, 0.3, 0.5)],
    "rouge-L": [(0.4, 0.3, 0.5), (0.5, 0.4, 0.6)]
})

# Convert tuples/lists to their mean values
flattened_df = rouge_results_df.applymap(
    lambda x: x[0] if isinstance(x, (tuple, list)) else x
)

# Now calculate the average scores
average_scores = flattened_df.mean()
print("Average ROUGE Scores:")
print(average_scores)


Average ROUGE Scores:
rouge-1    0.55
rouge-2    0.35
rouge-L    0.45
dtype: float64


  flattened_df = rouge_results_df.applymap(


In [4]:
import streamlit as st
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the CSV file
@st.cache_data
def load_csv(file_path):
    return pd.read_csv(file_path)

# Load pre-trained model and tokenizer
@st.cache_resource
def load_model():
    model = BartForConditionalGeneration.from_pretrained("path_to_your_model")
    tokenizer = BartTokenizer.from_pretrained("path_to_your_model")
    return model, tokenizer

# Function to generate summaries
def generate_summary(text, model, tokenizer, max_length=130, min_length=30, length_penalty=2.0, num_beams=4):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Streamlit app layout
st.title("Text Summarization Tool")
st.write("View summaries from the dataset or generate new ones!")

# Sidebar for navigation
option = st.sidebar.selectbox("Choose an action:", ["View Summaries", "Generate Summary"])

# Load the model and tokenizer
model, tokenizer = load_model()

if option == "View Summaries":
    # Load the saved CSV file
    df = load_csv("extractive_summaries.csv")

    # Display summaries
    st.subheader("Summaries from the Dataset")
    st.dataframe(df)

    # Search by filename or content
    search_text = st.text_input("Search summaries by filename or content:")
    if search_text.strip():
        filtered_df = df[df['content'].str.contains(search_text, case=False, na=False) | df['filename'].str.contains(search_text, case=False, na=False)]
        st.write(f"Found {len(filtered_df)} matching results:")
        st.dataframe(filtered_df)

elif option == "Generate Summary":
    # Text input for new summary
    st.subheader("Generate a New Summary")
    input_text = st.text_area("Enter the text to summarize", height=200)
    
    if st.button("Summarize"):
        if input_text.strip():
            st.write("Generating summary...")
            summary = generate_summary(input_text, model, tokenizer)
            st.subheader("Generated Summary:")
            st.write(summary)
        else:
            st.warning("Please enter some text to summarize.")



  from .autonotebook import tqdm as notebook_tqdm
2024-12-13 10:02:53.278 
  command:

    streamlit run C:\Users\biswa\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-12-13 10:02:53.291 Session state does not function when running a script without `streamlit run`


ImportError: 
BartForConditionalGeneration requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFBartForConditionalGeneration".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [3]:
pip install transformers


Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.26.5-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp311-none-win_amd64.whl.metadata (3.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.24.0->transformers)
  Downloading fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
   ---------------------------------------- 0.0/10.1 MB ? eta -:--:--
   --------- ------------------------------ 2.4/10.1 MB 13.4 MB/s eta 0:00:01
   ------------------- -------------------- 5.0/10.1 MB 12.6 MB/s eta

In [5]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.5.1%2Bcpu-cp311-cp311-win_amd64.whl (205.5 MB)
     ---------------------------------------- 0.0/205.5 MB ? eta -:--:--
     --------------------------------------- 1.6/205.5 MB 10.5 MB/s eta 0:00:20
      -------------------------------------- 3.9/205.5 MB 11.2 MB/s eta 0:00:19
     - ------------------------------------- 6.8/205.5 MB 12.0 MB/s eta 0:00:17
     - ------------------------------------- 9.4/205.5 MB 12.0 MB/s eta 0:00:17
     -- ----------------------------------- 11.8/205.5 MB 11.9 MB/s eta 0:00:17
     -- ----------------------------------- 12.3/205.5 MB 11.9 MB/s eta 0:00:17
     -- ----------------------------------- 15.2/205.5 MB 11.0 MB/s eta 0:00:18
     --- ---------------------------------- 17.6/205.5 MB 11.1 MB/s eta 0:00:17
     --- ---------------------------------- 19.9/205.5 MB 11.1 MB/s eta 0:00:17
     ---- -----------------

In [6]:
import torch
print(torch.__version__)


2.5.1+cpu
