In [5]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the BERT tokenizer and encoder from one shared pretrained checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# Function to extract sentence embedding using BERT
def get_sentence_embedding(sentence):
    """Embed ``sentence`` by averaging the encoder's final hidden states.

    Args:
        sentence: Text to embed (expected to be a single string). It is
            tokenized with the module-level ``tokenizer`` and truncated to at
            most 512 tokens.

    Returns:
        A 1-D NumPy array: ``last_hidden_state`` of the module-level ``model``
        averaged over dim 1, then flattened.
    """
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Inference only: disable autograd so no computation graph is recorded
    # (and kept in memory) for every sentence that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()  # Flatten to remove extra dimensions

# Function for extractive summarization
def extractive_summary(text, ratio=0.7):
    """Summarize ``text`` by keeping its highest-scoring sentences.

    The text is split on '. ', every sentence is embedded with
    ``get_sentence_embedding``, and each sentence is scored by the sum of its
    cosine similarities to all sentences (itself included). The top-scoring
    sentences are kept and re-joined in their original order.

    Args:
        text: Document to summarize. Non-string values (for example the NaN
            that pandas produces for an empty CSV cell, or None) and blank
            strings yield ''.
        ratio: Fraction of the sentences to keep. At least one sentence is
            always kept.

    Returns:
        The selected sentences joined with '. ', or '' when there is nothing
        to summarize.
    """
    # Missing CSV cells arrive as float NaN; calling .split on them would raise.
    if not isinstance(text, str):
        return ''

    # Drop blank fragments (e.g. produced by a trailing '. ') so they are
    # neither embedded nor selected into the summary.
    sentences = [fragment for fragment in text.split('. ') if fragment.strip()]
    if not sentences:
        return ''
    if len(sentences) == 1:
        # A lone sentence is always the summary; skip the model call entirely.
        return sentences[0]

    sentence_embeddings = [get_sentence_embedding(sentence) for sentence in sentences]

    # Compute cosine similarity between sentences
    similarities = cosine_similarity(sentence_embeddings)

    # Sum the similarities for each sentence
    sentence_scores = similarities.sum(axis=1)

    # Determine the number of sentences to select based on the ratio
    top_n = max(1, int(len(sentences) * ratio))

    # Select the highest-scoring sentences, then restore document order
    top_sentence_indices = np.sort(np.argsort(sentence_scores)[-top_n:])

    # Join the selected sentences to form the summary
    return '. '.join(sentences[i] for i in top_sentence_indices)

# Read data from CSV file
import os  # only needed below to make sure the output folder exists

csv_input_file = './Datasets/combined_datasets.csv'  # Path to your input CSV file
csv_output_file = './preprocessed/testing.csv'  # Path to save the summarized output
df = pd.read_csv(csv_input_file)

# Create the output folder before the slow summarization loop, so a missing
# directory cannot make to_csv fail after all rows have been processed.
os.makedirs(os.path.dirname(csv_output_file) or '.', exist_ok=True)

# The content to be summarized is read from the 3rd column (index 2).
# If the content lives in a different column, adjust the index accordingly.
content_values = df.iloc[:, 2]

# Summarize every row. Cells that are not strings (e.g. NaN for an empty CSV
# field) get an empty summary instead of crashing inside the summarizer.
summaries = [
    extractive_summary(content, ratio=0.7) if isinstance(content, str) else ''
    for content in content_values
]

# Add the summaries as a new column to the dataframe
# NOTE(review): if the input file already has a 'Summary' column, this
# assignment replaces it in the saved output — confirm that is intended.
df['Summary'] = summaries

# Save the summarized data to a new CSV file
df.to_csv(csv_output_file, index=False)

# Print the summarized data
print(f"Summarized data has been saved to {csv_output_file}")
print(df[['Summary']].head())

Summarized data has been saved to ./preprocessed/testing.csv
                                             Summary
0  Bitcoin (BTC) baru-baru ini mencapai titik ter...
1  Lonjakan nilai koin ini telah memungkinkan ban...
2  Institut Keuangan Korsel, yang diwakili oleh p...
3  Peringkat ini menyoroti kehadiran signifikan D...
4  Harga Ripple (XRP) saat ini mengalami trenbear...


In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and encoder for the IndoBERT checkpoint
# (used in place of the 'bert-base-uncased' BertTokenizer/BertModel pair)
indobert_checkpoint = "indolem/indobert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(indobert_checkpoint)
model = AutoModel.from_pretrained(indobert_checkpoint)

# Function to extract sentence embedding using BERT
def get_sentence_embedding(sentence):
    """Embed ``sentence`` by averaging the encoder's final hidden states.

    Args:
        sentence: Text to embed (expected to be a single string). It is
            tokenized with the module-level ``tokenizer`` and truncated to at
            most 512 tokens.

    Returns:
        A 1-D NumPy array: ``last_hidden_state`` of the module-level ``model``
        averaged over dim 1, then flattened.
    """
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Inference only: disable autograd so no computation graph is recorded
    # (and kept in memory) for every sentence that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()  # Flatten to remove extra dimensions

# Function for extractive summarization
def extractive_summary(text, ratio=0.7):
    """Summarize ``text`` by keeping its highest-scoring sentences.

    The text is split on '. ', every sentence is embedded with
    ``get_sentence_embedding``, and each sentence is scored by the sum of its
    cosine similarities to all sentences (itself included). The top-scoring
    sentences are kept and re-joined in their original order.

    Args:
        text: Document to summarize. Non-string values (for example the NaN
            that pandas produces for an empty CSV cell, or None) and blank
            strings yield ''.
        ratio: Fraction of the sentences to keep. At least one sentence is
            always kept.

    Returns:
        The selected sentences joined with '. ', or '' when there is nothing
        to summarize.
    """
    # Missing CSV cells arrive as float NaN; calling .split on them would raise.
    if not isinstance(text, str):
        return ''

    # Drop blank fragments (e.g. produced by a trailing '. ') so they are
    # neither embedded nor selected into the summary.
    sentences = [fragment for fragment in text.split('. ') if fragment.strip()]
    if not sentences:
        return ''
    if len(sentences) == 1:
        # A lone sentence is always the summary; skip the model call entirely.
        return sentences[0]

    sentence_embeddings = [get_sentence_embedding(sentence) for sentence in sentences]

    # Compute cosine similarity between sentences
    similarities = cosine_similarity(sentence_embeddings)

    # Sum the similarities for each sentence
    sentence_scores = similarities.sum(axis=1)

    # Determine the number of sentences to select based on the ratio
    top_n = max(1, int(len(sentences) * ratio))

    # Select the highest-scoring sentences, then restore document order
    top_sentence_indices = np.sort(np.argsort(sentence_scores)[-top_n:])

    # Join the selected sentences to form the summary
    return '. '.join(sentences[i] for i in top_sentence_indices)

# Read data from CSV file
import os  # only needed below to make sure the output folder exists

csv_input_file = './Datasets/combined_datasets.csv'  # Path to your input CSV file
csv_output_file = './preprocessed/testing(2).csv'  # Path to save the summarized output
df = pd.read_csv(csv_input_file)

# Create the output folder before the slow summarization loop, so a missing
# directory cannot make to_csv fail after all rows have been processed.
os.makedirs(os.path.dirname(csv_output_file) or '.', exist_ok=True)

# Ensure that the columns for 'title', 'date', and 'content' are correctly identified
# Adjust the column names if they are different in your dataset
title_column = 'Title'   # Column name for title
date_column = 'Date'  # Column name for date
content_column = 'Summary'  # Column name for content to be summarized

# Summarize every row. Cells that are not strings (e.g. NaN for an empty CSV
# field) get an empty summary instead of crashing inside the summarizer.
summaries = [
    extractive_summary(content, ratio=0.7) if isinstance(content, str) else ''
    for content in df[content_column]
]

# Keep only title and date from the input and attach the new summaries
df_summary = df[[title_column, date_column]].copy()
df_summary['Summary'] = summaries

# Save the summarized data to a new CSV file
df_summary.to_csv(csv_output_file, index=False)

# Print the summarized data
print(f"Summarized data has been saved to {csv_output_file}")
print(df_summary.head())

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Summarized data has been saved to ./preprocessed/testing(2).csv
                                               Title          Date  \
0  Michael Saylor Tetap Optimis Meski Harga BTC M...  23 June 2024   
1  Masa Depan Toncoin: Harga TON di Ambang Kebang...  23 June 2024   
2  Think Tank Korea Selatan Ragukan ETF Bitcoin, ...  24 June 2024   
3  Dogwifhat (WIF) Keluar dari 50 Besar, Harga An...  24 June 2024   
4  XRP Ripple Terus Berkonsolidasi, Kapan Harga A...  24 June 2024   

                                             Summary  
0  Bitcoin (BTC) baru-baru ini mencapai titik ter...  
1  Lonjakan nilai koin ini telah memungkinkan ban...  
2  Institut Keuangan Korsel, yang diwakili oleh p...  
3  Dalam peristiwa dramatis,memecoinberbasis Sola...  
4  Harga Ripple (XRP) saat ini mengalami trenbear...  
