In [None]:
import pandas as pd
import openai
import numpy as np
import os
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV

# Set your OpenAI API key
openai.api_key ="OPENAI_API_KEY"

# Load your data from CSV into a DataFrame
df = pd.read_csv('data\broken_summary_new_data.csv')

# Extract text data from the 'text' column
text_data = df['text'].tolist()

# Initialize an empty list to store embeddings
all_embeddings = []

# Define maximum chunk size (adjust as needed)
max_chunk_size = 500  # You can adjust this value based on your requirements

# Generate embeddings for each chunk of text
for summary in text_data:
    # Split the summary into chunks of maximum size
    chunks = [summary[i:i + max_chunk_size] for i in range(0, len(summary), max_chunk_size)]

    # Generate embeddings for each chunk
    chunk_embeddings = []
    for chunk in chunks:
        # Generate embeddings for the chunk using Ada-002
        embedding = openai.Embed("ada-002", texts=chunk)
        chunk_embeddings.append(embedding)
    
    # Aggregate embeddings of chunks (e.g., by averaging)
    aggregated_embedding = np.mean(chunk_embeddings, axis=0)
    
    # Append the aggregated embedding to the list of all embeddings
    all_embeddings.append(aggregated_embedding)

# Convert the list of embeddings to a numpy array
embeddings = np.array(all_embeddings)

# Create Birch clustering object
birch = Birch()

# Define hyperparameters for tuning
param_grid = {
    'threshold': [0.1, 0.5, 1.0],
    'branching_factor': [50, 100, 200]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(birch, param_grid, cv=5, scoring='silhouette')
grid_search.fit(embeddings)

# Get best parameters and best silhouette score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Silhouette Score:", best_score)

# Fit Birch clustering with best parameters
best_birch = Birch(**best_params)
best_birch.fit(embeddings)

# Predict clusters
labels = best_birch.predict(embeddings)


In [None]:
import pandas as pd
import requests

# Step 1: Parse CSV file into Whisper API format
def parse_transcripts_to_whisper_format(csv_file):
    # Read CSV file into DataFrame
    df = pd.read_csv(csv_file)
    
    # Group text by speaker
    grouped_text = df.groupby('speaker_label')['text'].apply(lambda x: ' '.join(x)).reset_index()
    
    # Concatenate text for each speaker into a single string
    conversations = grouped_text['text'].tolist()
    
    # Format conversations for Whisper API input
    whisper_input = {
        "input": "\n".join(conversations)
    }
    
    return whisper_input

# Step 2: Use the Whisper API to generate a summary
def generate_summary_with_whisper(whisper_input):
    # API endpoint for Whisper summarization
    api_endpoint = "https://api.openai.com/v1/agents/whisper/summaries"
    
    # Set your OpenAI API key
    api_key = 'your_openai_api_key'
    
    # Set request headers
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    
    # Send request to Whisper API
    response = requests.post(api_endpoint, json=whisper_input, headers=headers)
    
    # Check if request was successful
    if response.status_code == 200:
        summary = response.json()['summary']
        return summary
    else:
        print("Error:", response.text)
        return None

# Step 3: Retrieve and process the summary generated by the Whisper API
def main(csv_file):
    # Step 1: Parse CSV file into Whisper API format
    whisper_input = parse_transcripts_to_whisper_format(csv_file)
    
    # Step 2: Use the Whisper API to generate a summary
    summary = generate_summary_with_whisper(whisper_input)
    
    # Step 3: Process and print the summary
    if summary:
        print("Summary:", summary)
    else:
        print("Failed to generate summary.")

# Replace 'your_openai_api_key' with your actual OpenAI API key
main('conversation_transcripts.csv')


In [None]:
from transformers import pipeline

# Load an extractive summarization pipeline
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Input text you want to summarize
text = """Your long text here"""

# Generate summary
summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]

# Print the summary
print(summary)


In [None]:
import pandas as pd
from transformers import pipeline

# Load an extractive summarization pipeline
summarizer_1 = pipeline("summarization", model="facebook/bart-large-mnli")
summarizer_2 = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


# Load your CSV file
data = pd.read_csv("data\Shark_Tank_US_Top_3_Products_For_Gamers_1.csv")

# Extract the text from transcript
transcript_text = " ".join(data['text'].tolist())    # Replace "column_name" with your actual column name


# Generate summary
summary_1 = summarizer_1(transcript_text, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]
summary_2 = summarizer_2(transcript_text, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]

# Print the summary
print("summary_1: ", summary_1)
print("summary_2: ", summary_2)


In [1]:
from transformers import BartTokenizer, BartModel
import pandas as pd
import re
from transformers import pipeline

In [39]:
summarizer = pipeline("summarization", model="t5-small", max_length=1024, min_length=30, do_sample=True)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [40]:
# Load your CSV file
data = pd.read_csv("data\Shark_Tank_US_Top_3_Products_For_Gamers_1.csv", index_col=0)
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
display(data.head())

Unnamed: 0,speaker_label,start_time,end_time,text
0,spk_0,0.088483,0.13665,Into the tank is an accessory for the tech gen...
1,spk_1,0.318983,2.75615,Hi. My name is Joe Croft. Im from San Clemente...
2,spk_2,2.756333,2.864483,So yellow light is more calming and better for...
3,spk_1,2.864667,3.019983,much better and why blue light which is very c...
4,spk_0,3.02465,3.098317,Tell us about your sales. Theres gonna be a lo...


In [41]:
# Extract the text from transcript
transcript_text = " ".join(data['text'].tolist())    # Replace "column_name" with your actual column name

newlines_reg = r"\n|\r|\r\n"  # Match `\n`, `\r`, or `\r\n`
transcript_text = re.sub(newlines_reg, " ", transcript_text)

print(transcript_text)


Into the tank is an accessory for the tech generation. Hi. My name is Joe Croft. Im from San Clemente California and Im here asking for $750,000 in exchange for 5% of my company, Gunner Optics Sharks. We live in a world dominated by screens, phones, computers, tablets, P CS. Youre probably staring at a screen 6 to 8 hours a day. At the end of a long day. Your eyes might feel a little bit like this, like theyre covered in sand or perhaps theyre dry and irritated because they take a pounding or maybe youve got a burning sensation from all that harsh blue light or maybe you just have a headache that wont go away. Computer eye strain has become the number one health related office complaint in the United States outstripping carpal tunnel by 40 to 1. Now, all those screens arent going away anytime soon. So, whats the solution? Gunner optics that um I work for a digital age. Our patented optical technology is the only clinically proven solution for relieving all four major symptoms of comput

In [42]:
import nltk  # For sentence segmentation
nltk.download('punkt')
print(nltk.data.path)  # Prints the NLTK data directory path

['C:\\Users\\41222/nltk_data', 'c:\\GitHub_Repos\\SartajBhuvaji\\Resonate\\Streamlit_App\\.venv\\nltk_data', 'c:\\GitHub_Repos\\SartajBhuvaji\\Resonate\\Streamlit_App\\.venv\\share\\nltk_data', 'c:\\GitHub_Repos\\SartajBhuvaji\\Resonate\\Streamlit_App\\.venv\\lib\\nltk_data', 'C:\\Users\\41222\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\41222\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
def chunk_text_preserving_sentences(text, max_chunk_size):
    """Chunks a given text while respecting sentence boundaries and combining
    sentences to meet the chunk size limit.

    Args:
        text (str): The input text to be chunked.
        max_chunk_size (int): The maximum allowed size for each chunk in characters.

    Returns:
        list: A list of text chunks, each respecting sentence boundaries
                and potentially consisting of combined sentences.
    """

    chunks = []
    current_chunk = ""
    sentence_tokenizer = nltk.sent_tokenize

    # Tokenize the text into sentences
    sentences = sentence_tokenizer(text)

    for sentence in sentences:
        # Calculate remaining space in the current chunk
        remaining_space = max_chunk_size - len(current_chunk)

        # If sentence fits within remaining space, add it directly
        if len(sentence) <= remaining_space:
            current_chunk += " " + sentence  # Add space between sentences
        else:
            # Sentence doesn't fit, check if combined with next sentence fits
            combined_sentence = current_chunk + " " + sentence
            if len(combined_sentence) <= max_chunk_size:
                # Combined sentence fits, add and clear current chunk
                chunks.append(combined_sentence)
                current_chunk = ""
            else:
                # Neither sentence nor combined version fit, add current chunk and
                # start a new one with the current sentence
                chunks.append(current_chunk)
                current_chunk = sentence

    # Add any remaining sentences or the final chunk if not empty
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

# Example usage
max_chunk_size = 1024

chunks = chunk_text_preserving_sentences(transcript_text, max_chunk_size)
display(chunks)


[' Into the tank is an accessory for the tech generation. Hi. My name is Joe Croft. Im from San Clemente California and Im here asking for $750,000 in exchange for 5% of my company, Gunner Optics Sharks. We live in a world dominated by screens, phones, computers, tablets, P CS. Youre probably staring at a screen 6 to 8 hours a day. At the end of a long day. Your eyes might feel a little bit like this, like theyre covered in sand or perhaps theyre dry and irritated because they take a pounding or maybe youve got a burning sensation from all that harsh blue light or maybe you just have a headache that wont go away. Computer eye strain has become the number one health related office complaint in the United States outstripping carpal tunnel by 40 to 1. Now, all those screens arent going away anytime soon. So, whats the solution? Gunner optics that um I work for a digital age. Our patented optical technology is the only clinically proven solution for relieving all four major symptoms of com

In [44]:
# Process each chunk and combine the results
summaries = []
for chunk in chunks:
    summary = summarizer(chunk, )[0]['summary_text']
    summaries.append(summary)


Your max_length is set to 1024, but your input_length is only 253. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=126)
Your max_length is set to 1024, but your input_length is only 241. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=120)
Your max_length is set to 1024, but your input_length is only 248. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=124)
Your max_length is set to 1024, but your input_length is only 244. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_leng

In [45]:
display(summaries)

['Into the tank is an accessory for the tech generation . we live in a world dominated by screens, phones, computers, tablets, P CS . our patented optical technology is the only clinically proven solution for relieving all four major symptoms of computer eye strain .',
 'gunner can help prevent dry eyes, reduce muscular, eye fatigue fight against glare . weve scientifically measured it, we block 65% of the blue light thats emitted from those screens .',
 'red light which is very close to the UV can disrupt your sleep patterns . this year we sold $6.5 million worth of eyewear . we are approaching 60% gross margin, which is great .',
 'we have a patent, weve got battle tested . we have gone toe to toe with some of the largest people in the industry out there . i spent a formative part of my career at Oakley, which dominates sports .',
 '6.5 the year prior we were 8 million, 8 million . and your prior was about the same . we need to get the exposure out .',
 'youve raised $9 million for a