In [None]:
!pip install transformers

In [None]:
pip install ipywidgets

In [None]:
pip install neattext

In [None]:
pip install accelerate>=0.26.0

In [4]:
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
import neattext as nt

# List of CSV file paths
csv_paths = [
    '/data_sets/Hp Pavilion 14-inch.csv',
    '/data_sets/Lenoevo Ideapad slim3.csv',
    '/data_sets/Macbook Air M1.csv'
]

# Function to preprocess text
def preprocess_with_neattext(text):
    text_frame = nt.TextFrame(text)
    text_frame.remove_multiple_spaces()
    text_frame.remove_html_tags()
    text_frame.remove_stopwords()
    text_frame.remove_non_ascii()
    text_frame.remove_userhandles()
    text_frame.remove_hashtags()
    text_frame.remove_emojis()
    return text_frame.text

# Initialize the summarization pipeline
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=0)

# Process each CSV file
for i, csv_path in enumerate(tqdm(csv_paths, desc="Processing CSV files")):
    # Load the CSV file
    mbook_df = pd.read_csv(csv_path)
    
    # Combine all rows of the 'review_text' column into a single string
    full_desc = ' '.join(mbook_df['review_text'].dropna().astype(str))
    
    # Preprocess the text
    clean_full_desc = preprocess_with_neattext(full_desc)
    clean_full_desc = clean_full_desc.replace('.', '.<eos>')
    clean_full_desc = clean_full_desc.replace('?', '?<eos>')
    clean_full_desc = clean_full_desc.replace('!', '!<eos>')
    
    # Split text into chunks for summarization
    max_chunk = 500
    sentences = clean_full_desc.split('<eos>')
    current_chunk = 0
    chunks = []
    
    for sentence in tqdm(sentences, desc=f"Creating chunks for CSV {i+1}"):
        if len(chunks) == current_chunk + 1:
            if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                chunks[current_chunk].extend(sentence.split(' '))
            else:
                current_chunk += 1
                chunks.append(sentence.split(' '))
        else:
            chunks.append(sentence.split(' '))
    
    # Join words in each chunk to form sentences
    for chunk_id in tqdm(range(len(chunks)), desc=f"Joining chunks for CSV {i+1}"):
        chunks[chunk_id] = ' '.join(chunks[chunk_id])
    
    # Summarize each chunk
    res = []
    for chunk in tqdm(chunks, desc=f"Summarizing chunks for CSV {i+1}"):
        summary = summarizer(chunk, max_length=140, min_length=30, do_sample=False)
        res.extend(summary)
    
    # Combine all summaries into a single text
    summary_text = ' '.join([summ['summary_text'] for summ in res])
    
    # Summarize the combined summary text for a final concise summary
    final_summary = summarizer(summary_text, max_length=300, min_length=100, do_sample=False)
    
    # Save the final summary to a text file
    output_file = f'txt{i+1}.txt'
    with open(output_file, 'w') as f:
        f.write(final_summary[0]['summary_text'])
    
    print(f"Summary for {csv_path} saved to {output_file}")
    print("---------------------------")

Creating chunks for CSV 1: 100%|██████████| 626/626 [00:00<00:00, 895815.18it/s]
Joining chunks for CSV 1: 100%|██████████| 12/12 [00:00<00:00, 52211.25it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (1766 > 1024). Running this sequence through the model will result in indexing errors
Summarizing chunks for CSV 1: 100%|██████████| 12/12 [00:36<00:00,  3.08s/it]
Processing CSV files:  33%|███▎      | 1/3 [00:41<01:22, 41.23s/it]

Summary for /Users/pardhasaradhichukka/Desktop/Project/Reboot/pracrtise/data_sets/Hp Pavilion 14-inch.csv saved to txt1.txt


Creating chunks for CSV 2: 100%|██████████| 315/315 [00:00<00:00, 948460.70it/s]
Joining chunks for CSV 2: 100%|██████████| 6/6 [00:00<00:00, 48865.68it/s]
Summarizing chunks for CSV 2: 100%|██████████| 6/6 [00:15<00:00,  2.63s/it]
Processing CSV files:  67%|██████▋   | 2/3 [01:01<00:28, 28.92s/it]

Summary for /Users/pardhasaradhichukka/Desktop/Project/Reboot/pracrtise/data_sets/Lenoevo Ideapad slim3.csv saved to txt2.txt


Creating chunks for CSV 3: 100%|██████████| 441/441 [00:00<00:00, 578569.93it/s]
Joining chunks for CSV 3: 100%|██████████| 7/7 [00:00<00:00, 45449.11it/s]
Summarizing chunks for CSV 3: 100%|██████████| 7/7 [00:17<00:00,  2.45s/it]
Processing CSV files: 100%|██████████| 3/3 [01:23<00:00, 27.67s/it]

Summary for /Users/pardhasaradhichukka/Desktop/Project/Reboot/pracrtise/data_sets/Macbook Air M1.csv saved to txt3.txt



