In [6]:
import numpy as np
import pandas as pd

print("Numpy version:", np.__version__)
print("Pandas version:", pd.__version__)


Numpy version: 2.3.3
Pandas version: 2.3.2


In [2]:
import sys
print(sys.executable)


/home/sachin/Desktop/AI-Report-Gen/my_py311_env/bin/python


In [7]:
import pandas as pd

df = pd.read_csv('sents_dataset.csv')
print(df.columns.tolist())

['sentence', 'embedding', 'filler_count', 'filler_similarity_score', 'word_count', 'char_count', 'avg_word_length', 'stopword_ratio', 'starts_with_filler', 'interjection_count', 'adverb_count', 'label']


In [8]:
# Filter rows with label == 0 and take the first 10
to_label = df[df['label'] == 0].copy().reset_index(drop=True).head(10)

# Initialize manual_label column if not present
if 'manual_label' not in to_label.columns:
    to_label['manual_label'] = None

# Start manual labeling loop
for idx, row in to_label.iterrows():
    if pd.notnull(row['manual_label']):
        continue  # Skip already labeled

    print(f"\n🔢 Record {idx + 1} / {len(to_label)}")
    print(f"📝 Sentence: {row['sentence']}")
    print(f"🧮 Features — Filler Count: {row['filler_count']}, Filler Similarity: {row['filler_similarity_score']:.3f}, Stopword Ratio: {row['stopword_ratio']:.2f}")

    while True:
        user_input = input("➡️ Enter label (0 = Not Filler, 1 = Filler, s = skip, q = quit): ").strip()
        if user_input == 'q':
            print("✅ Quitting...")
            break
        elif user_input == 's':
            print("⏭️ Skipped")
            break
        elif user_input in ['0', '1']:
            to_label.at[idx, 'manual_label'] = int(user_input)
            print(f"✅ Saved label: {user_input}")
            break
        else:
            print("❌ Invalid input. Enter 0, 1, s, or q.")

    # Save progress after each entry
    to_label.to_csv("manually_labeled_top10.csv", index=False)

    if user_input == 'q':
        break

print("\n🏁 Labeling session complete.")



🔢 Record 1 / 10
📝 Sentence: It's I love it or hate it, right?
🧮 Features — Filler Count: 1, Filler Similarity: 0.140, Stopword Ratio: 0.50
✅ Saved label: 1

🔢 Record 2 / 10
📝 Sentence: Like I see a lot of competitors of vendors using it.
🧮 Features — Filler Count: 1, Filler Similarity: 0.149, Stopword Ratio: 0.36
✅ Saved label: 0

🔢 Record 3 / 10
📝 Sentence: I was going in the direction here, which was thinking you know a lot of and then I called them key iterations the sort of specific things.
🧮 Features — Filler Count: 3, Filler Similarity: 0.202, Stopword Ratio: 0.56
✅ Saved label: 0

🔢 Record 4 / 10
📝 Sentence: Well, it was just me just as you were offensive.
🧮 Features — Filler Count: 1, Filler Similarity: 0.264, Stopword Ratio: 0.80
✅ Saved label: 0

🔢 Record 5 / 10
📝 Sentence: And then, for, if the vendor builds, like, builds themselves as a platform, then we should specifically call out that they have zero.
🧮 Features — Filler Count: 1, Filler Similarity: 0.020, Stopword Ratio

In [4]:
import pandas as pd
import requests
import time
import os

# === CONFIG ===
MODEL_NAME = "llama3.2:3b"
CHUNK_SIZE = 10
INPUT_FILE = "sents_dataset.csv"
OUTPUT_FILE = "llm_relabelled.csv"
LABEL_COLUMN = "llm_label"

# === LOAD DATA ===
df = pd.read_csv(INPUT_FILE)
assert 'sentence' in df.columns, "❌ 'sentence' column missing in input file"

# Filter only label == 0 (you want to relabel these)
to_label = df[df['label'] == 0].copy()

# Optional: warn or deduplicate if sentence duplicates exist
if to_label['sentence'].duplicated().any():
    print("⚠️ Warning: Duplicate sentences found in input. Consider deduplicating.")
    # to_label.drop_duplicates(subset='sentence', inplace=True)

# Exit early if nothing to label
if to_label.empty:
    print("✅ No sentences to relabel (label == 0). Exiting.")
    exit()

# Load already labeled records if output file exists
if os.path.exists(OUTPUT_FILE):
    labeled_df = pd.read_csv(OUTPUT_FILE)

    # Merge to ensure progress resumes
    to_label = to_label.merge(
        labeled_df[['sentence', LABEL_COLUMN]],
        on='sentence',
        how='left',
        suffixes=('', '_existing')
    )

    # Fill from existing if available
    to_label[LABEL_COLUMN] = to_label[LABEL_COLUMN].combine_first(to_label[LABEL_COLUMN + '_existing'])
    to_label.drop(columns=[LABEL_COLUMN + '_existing'], inplace=True)

# Initialize label column if not present
if LABEL_COLUMN not in to_label.columns:
    to_label[LABEL_COLUMN] = None

# Ensure label column is numeric
to_label[LABEL_COLUMN] = pd.to_numeric(to_label[LABEL_COLUMN], errors='coerce')




In [5]:
# === LLM Labeling Function ===
def label_with_llm(sentence):
    prompt = f"""
You are a strict sentence classifier.

Rules:
- Return `1` if the sentence is a filler: vague, hesitant, or not meaningful alone
- Return `0` if it's meaningful: clear, self-contained, specific

Examples:

Sentence: "Like, I guess it could work."
Label: 1

Sentence: "They’re moving the release to Q4 next year."
Label: 0

Sentence: "Well, you know, it’s kind of tricky."
Label: 1

Sentence: "The competitors are launching in Asia next month."
Label: 0

Now classify:

Sentence: "{sentence}"
Reply with only: 0 or 1
"""

    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": MODEL_NAME,
                "prompt": prompt,
                "temperature": 0.0,
                "stream": False,
                "max_tokens": 1  # Optional but helps truncate unwanted explanations
            }
        )
        response.raise_for_status()
        result = response.json().get("response", "").strip()

        # Strict match
        if result in ['0', '1']:
            return int(result)

        # Fallback: try extracting 0 or 1 from any part of the response
        for token in result.split():
            if token.strip() in ['0', '1']:
                return int(token.strip())

        print(f"⚠️ Unexpected output: {result}")
        return None

    except Exception as e:
        print(f"❌ Error: {e}")
        return None


In [7]:
failed_labels = []
# Filter rows that are not labeled yet
unlabeled = to_label[to_label[LABEL_COLUMN].isna()].copy()

# Start labeling
for i, (idx, row) in enumerate(unlabeled.iterrows(), start=1):
    sentence = row['sentence']
    print(f"\n🔢 Record {i}/{len(unlabeled)}")
    print(f"📝 Sentence: {sentence}")

    predicted_label = label_with_llm(sentence)
    if predicted_label is None:
        print("❌ Skipping due to error or invalid response.")
        failed_labels.append((idx, sentence))
        continue

    print(f"✅ LLM Label: {predicted_label}")
    to_label.at[idx, LABEL_COLUMN] = predicted_label

    # Optional delay
    time.sleep(0.3)

    # Save every CHUNK_SIZE
    if i % CHUNK_SIZE == 0 or i == len(unlabeled):
        print(f"💾 Saving after {i} records...")
        to_label[['sentence', LABEL_COLUMN]].to_csv(OUTPUT_FILE, index=False)

# Log failures
if failed_labels:
    print(f"\n⚠️ {len(failed_labels)} sentences failed labeling. Logging to 'failed_labels.csv'")
    pd.DataFrame(failed_labels, columns=['idx', 'sentence']).to_csv("failed_labels.csv", index=False)



🔢 Record 1/596
📝 Sentence: It's I love it or hate it, right?
✅ LLM Label: 1

🔢 Record 2/596
📝 Sentence: Like I see a lot of competitors of vendors using it.
✅ LLM Label: 0

🔢 Record 3/596
📝 Sentence: I was going in the direction here, which was thinking you know a lot of and then I called them key iterations the sort of specific things.
✅ LLM Label: 0

🔢 Record 4/596
📝 Sentence: Well, it was just me just as you were offensive.
✅ LLM Label: 1

🔢 Record 5/596
📝 Sentence: And then, for, if the vendor builds, like, builds themselves as a platform, then we should specifically call out that they have zero.
✅ LLM Label: 0

🔢 Record 6/596
📝 Sentence: For the purposes of front end working on it because front end was going to work on it without a mockup.
✅ LLM Label: 0

🔢 Record 7/596
📝 Sentence: So it may end up Cindy being you and I just picked the one that we want to do and then they can back us up.
✅ LLM Label: 0

🔢 Record 8/596
📝 Sentence: Can't believe this is always a little bit tough an

New Dataset

In [23]:
import pandas as pd

# Step 1: Load the labeled CSV
df = pd.read_csv("llm_relabelled.csv")

# Preview the first few rows
print(df.head())


                                            sentence  llm_label
0                  It's I love it or hate it, right?        1.0
1  Like I see a lot of competitors of vendors usi...        0.0
2  I was going in the direction here, which was t...        0.0
3   Well, it was just me just as you were offensive.        1.0
4  And then, for, if the vendor builds, like, bui...        0.0


In [24]:
df = df.rename(columns={'llm_label': 'label'})


In [25]:
fillers = pd.read_csv("fillers.txt", header=None, names=['sentence'])

fillers['label'] = 1

df = pd.concat([df, fillers], ignore_index=True)



In [27]:
df = df.sample(frac=1).reset_index(drop=True)


In [29]:
from sentence_transformers import SentenceTransformer
device = 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2',device = device)
embeddings = model.encode(df['sentence'].tolist(), show_progress_bar=True)

df['embedding'] = embeddings.tolist()

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 29/29 [00:03<00:00,  8.13it/s]


In [31]:
filler_words = [
    "um", "uh", "like", "you know", "i mean", "so", "well", "actually", "basically",
    "literally", "sort of", "kind of", "you see", "right", "hmm"
]
def count_filler_words(sentence, filler_list):
    sentence = sentence.lower()
    count = 0
    for filler in filler_list:
        count += sentence.count(filler)
    return count
df['filler_count'] = df['sentence'].apply(lambda x: count_filler_words(x, filler_words))
df

Unnamed: 0,sentence,label,embedding,filler_count
0,"No, sorry.",1.0,"[-0.07825806736946106, 0.004383323714137077, 0...",1
1,Like Windows 11 or anybody who's rolling stuff...,0.0,"[-0.04591212421655655, -0.019967254251241684, ...",1
2,"Yadda, yadda, yadda.",1.0,"[0.04486488550901413, 0.06306871771812439, 0.0...",0
3,"So it looks like maybe a platform on reinvent,...",1.0,"[-0.10639108717441559, -0.09424474835395813, -...",2
4,I was really I don't did you all did anybody m...,1.0,"[0.031144753098487854, -0.027020610868930817, ...",0
...,...,...,...,...
900,"So that's, it's not going to be as hard to upd...",1.0,"[-0.11802166700363159, -0.03177763894200325, 0...",2
901,This one I couldn't pick.,1.0,"[-0.1180206760764122, -0.0005945406155660748, ...",0
902,It's being less good.,1.0,"[0.07134339958429337, -0.012571004219353199, 0...",0
903,"You know, it's pretty cool.",0.0,"[0.043975986540317535, -0.04274951294064522, 0...",1


In [34]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
filler_phrases = [
    "um", "uh", "like", "you know", "i mean", "so", "well", 
    "actually", "basically", "literally", "sort of", "kind of"
]

filler_reference_text = " ".join(filler_phrases)
filler_embedding = model.encode(filler_reference_text)
sentence_embeddings = np.array(df['embedding'].tolist())

similarities = cosine_similarity(sentence_embeddings, [filler_embedding]).flatten()

df['filler_similarity_score'] = similarities
df['word_count'] = df['sentence'].apply(lambda x: len(x.split()))
df['char_count'] = df['sentence'].apply(len)
df['avg_word_length'] = df['char_count'] / df['word_count']


In [35]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

df['stopword_ratio'] = df['sentence'].apply(
    lambda x: sum(1 for word in x.lower().split() if word in stop_words) / len(x.split())
)
filler_starts = ('so', 'well', 'actually', 'like', 'i mean', 'you know')

df['starts_with_filler'] = df['sentence'].str.lower().apply(
    lambda x: any(x.startswith(f) for f in filler_starts)
).astype(int)


In [36]:
import spacy
nlp = spacy.load("en_core_web_sm")  # Lightweight English model

def count_pos_tags(sentence, pos_tag):
    doc = nlp(sentence)
    return sum(1 for token in doc if token.pos_ == pos_tag)


In [37]:
df['interjection_count'] = df['sentence'].apply(lambda x: count_pos_tags(x, 'INTJ'))
df['adverb_count'] = df['sentence'].apply(lambda x: count_pos_tags(x, 'ADV'))
cols = [col for col in df.columns if col != 'label'] + ['label']
df = df[cols]
df

Unnamed: 0,sentence,embedding,filler_count,filler_similarity_score,word_count,char_count,avg_word_length,stopword_ratio,starts_with_filler,interjection_count,adverb_count,label
0,"No, sorry.","[-0.07825806736946106, 0.004383323714137077, 0...",1,-0.038258,2,10,5.000000,0.000000,0,1,0,1.0
1,Like Windows 11 or anybody who's rolling stuff...,"[-0.04591212421655655, -0.019967254251241684, ...",1,0.108919,33,174,5.272727,0.424242,1,0,3,0.0
2,"Yadda, yadda, yadda.","[0.04486488550901413, 0.06306871771812439, 0.0...",0,0.142957,3,20,6.666667,0.000000,0,0,0,1.0
3,"So it looks like maybe a platform on reinvent,...","[-0.10639108717441559, -0.09424474835395813, -...",2,0.124158,17,89,5.235294,0.411765,1,0,2,1.0
4,I was really I don't did you all did anybody m...,"[0.031144753098487854, -0.027020610868930817, ...",0,0.121458,11,50,4.545455,0.727273,0,0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
900,"So that's, it's not going to be as hard to upd...","[-0.11802166700363159, -0.03177763894200325, 0...",2,0.091977,32,165,5.156250,0.562500,1,0,5,1.0
901,This one I couldn't pick.,"[-0.1180206760764122, -0.0005945406155660748, ...",0,-0.033761,5,25,5.000000,0.600000,0,0,0,1.0
902,It's being less good.,"[0.07134339958429337, -0.012571004219353199, 0...",0,0.079306,4,21,5.250000,0.500000,0,0,1,1.0
903,"You know, it's pretty cool.","[0.043975986540317535, -0.04274951294064522, 0...",1,0.269240,5,27,5.400000,0.400000,1,0,1,0.0


In [38]:
df_orig = pd.read_csv("sents_dataset.csv")
df_combined = pd.concat([df_orig, df], ignore_index=True)
df_combined = df_combined.drop_duplicates(subset='sentence', keep='first').reset_index(drop=True)



In [39]:
print(f"Original dataset rows: {len(df_orig)}")
print(f"Current DataFrame rows: {len(df)}")
print(f"Combined rows after removing duplicates: {len(df_combined)}")


Original dataset rows: 1497
Current DataFrame rows: 905
Combined rows after removing duplicates: 1493


In [41]:
df_combined['label'] = df_combined['label'].astype(int)

ML MODELS



In [48]:
df_ml = df_combined_final
df_ml = df.drop(columns=['sentence'])

In [49]:
print(df_ml.columns)

Index(['embedding', 'filler_count', 'filler_similarity_score', 'word_count',
       'char_count', 'avg_word_length', 'stopword_ratio', 'starts_with_filler',
       'interjection_count', 'adverb_count', 'label'],
      dtype='object')


In [52]:
X_embed = np.array(df_ml['embedding'].tolist())
X_data = df_ml.drop(columns = ['embedding','label']).values
X = np.hstack([X_embed,X_data])
Y = df_ml['label'].values

In [54]:
print("X_embed shape:", X_embed.shape)  # Should be (854, 384)
print("X_data shape:", X_data.shape)    # Should be (854, <number_of_handcrafted_features>)


X_embed shape: (905, 384)
X_data shape: (905, 9)


LLM's Model processing

In [None]:
def split_text_into_chunks(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap  # Move forward but keep overlap
    return chunks


In [None]:
chunks = split_text_into_chunks(text, chunk_size=1000, overlap=200)

# Optional: print a few chunks
for i, chunk in enumerate(chunks):
    print(f"--- Chunk {i+1} ---")
    print(chunk)
    print()


--- Chunk 1 ---
can record. And we don't have a ton of items to get to. And I might be able to do one that might be fun if we have a little bit of time. So corporate events, they, I think I saw a little, I put this in Slack and I saw a little bit of kind of noise around it, which was good. The nutshell here is as we've kind of restructured and tried different things. The event support that we need isn't as nailed down as it needs to be. So the current tactic that we're going with is go-to-market team, signs up and kind of sponsors that event. So you support as a PMM, your campaign manager does the campaigns for that event, etc, etc. I don't see anyone in the, maybe there are comments in the issue. I don't see the header updated yet. I thought we had in Slack sort of farm to each one of them out. So I guess the next, so it looks like tie put in some folks. It looks like this looks good so that let's see needs support from GTM teams. So I guess the ask would be to work with your GTM team

In [None]:
import requests

def summarize_chunk(chunk, model="llama3.2:3b"):
    prompt = f"Summarize the following meeting transcript:\n\n{chunk}"
    
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": False
            }
        )
        response.raise_for_status()
        return response.json().get("response", "⚠️ No response received.")
    
    except Exception as e:
        return f"❌ Error: {str(e)}"


In [None]:
summaries = []

for idx, chunk in enumerate(chunks):
    print(f"--- Summarizing Chunk {idx + 1} ---")
    summary = summarize_chunk(chunk)
    print(summary)
    print()
    summaries.append(summary)


--- Summarizing Chunk 1 ---
The meeting transcript appears to be a summary of a discussion regarding corporate events and event support. The key points are:

* There is still uncertainty about the specific event support needed.
* The current approach is for the go-to-market team to sponsor and sign up for events, with other teams providing support (e.g., campaign management).
* It seems that there was an attempt to assign a specific task to each person involved, but this hasn't been fully completed.
* The next step is for the event support to be provided by the GTM teams, which would require collaboration and support from those teams.

Overall, it seems like the team is still in the process of figuring out the best approach for corporate events and gathering support for the necessary tasks.

--- Summarizing Chunk 2 ---
This appears to be a meeting transcript about GTM (Growth Team Management) teams and their support for a project. Here's a summary:

- The speaker is discussing next ste

In [None]:
import json

# Create a list of chunk summary entries
data = []

for i, summary in enumerate(summaries, start=1):
    data.append({
        "chunk": f"Chunk {i}",
        "summary": summary
    })

# Save to JSON file
with open("chunk_summaries.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("✅ Summaries saved to 'chunk_summaries.json'")


✅ Summaries saved to 'chunk_summaries.json'


Action Items

In [None]:
def extract_action_items(summaries):
    context = "\n\n".join([f"{s['chunk']}: {s['summary']}" for s in summaries])
    
    prompt = (
        "From the following meeting summaries, extract a list of clear, actionable tasks "
        "or to-do items discussed. Include who is responsible if mentioned.\n\n"
        f"{context}\n\n"
        "List the tasks in bullet points or numbered list."
    )

    return summarize_chunk(prompt)

In [None]:
import json

# Load summaries
with open("chunk_summaries.json", "r", encoding="utf-8") as f:
    summaries = json.load(f)

# Extract action items
action_items = extract_action_items(summaries)

print("📌 Action Items / TODOs:")
print(action_items)


📌 Action Items / TODOs:
Here are the tasks mentioned in the transcript:

1. **Create an industry resource page** comparing DevOps tools
2. **Design a new website** with a fresh design concept
3. **Develop a software factory concept**
4. **Brainstorm slogans or taglines for GitLab** (including "secure the software factory" phrase)
5. **Present ideas and work on messaging framework** for a blog post
6. **Create marketing materials** (e.g., slogans, taglines) for GitLab
7. **Develop a new product or idea**
8. **Participate in a NASCAR partnership discussion** with GitLab
9. **Collaborate on bike racing analogies and speed optimization strategies**
10. **Plan fun activities and outings** (including movie nights)
11. **Prioritize tasks and focus on speed over risk**

Note that some of these tasks may be ongoing or iterative, rather than single-point tasks. Additionally, the transcript does not provide a comprehensive list of all tasks, but rather highlights specific discussions and ideas re

In [None]:
with open("action_items.txt", "w", encoding="utf-8") as f:
    f.write(action_items)
