In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
from sklearn.decomposition import TruncatedSVD

In [2]:
# Sample meeting transcripts
meeting_transcripts = pd.read_csv("test.csv")["text"]

# Preprocess the text (you may need more advanced preprocessing based on your data)
# Here, we are using a simple approach without stemming or lemmatization
processed_transcripts = [" ".join(text.split()) for text in meeting_transcripts]

In [3]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_transcripts)

In [4]:
# Mini-Batch K-Means Clustering
num_clusters = 3  # You can adjust this based on your data or use techniques to find optimal clusters
kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(tfidf_matrix)

In [5]:
# Add the cluster labels to the original data
data = {"Transcript": meeting_transcripts, "Cluster": clusters}
df = pd.DataFrame(data)

In [6]:
# Example of querying using natural language
user_query = "What was discussed in the project meeting?"

# Remove TruncatedSVD and use the original TF-IDF matrix for the query vector
query_vector = tfidf_vectorizer.transform([user_query])
from sklearn.preprocessing import normalize

# Transform query vector using TruncatedSVD (LSA) for dimensionality reduction
# Reduce the number of components in TruncatedSVD

# Normalize TF-IDF matrix
tfidf_matrix_normalized = normalize(tfidf_matrix)

# Apply TruncatedSVD
lsa = TruncatedSVD(n_components=min(100, tfidf_matrix.shape[1]), random_state=42)
query_vector_lsa = lsa.fit_transform(query_vector)

# Calculate cosine similarity between query and cluster centroids
cluster_centers = kmeans.cluster_centers_
# similarities = linear_kernel(query_vector_lsa, cluster_centers)

# Calculate cosine similarity between query and cluster centroids
similarities = linear_kernel(query_vector, cluster_centers)

# Get the most similar cluster
most_similar_cluster = similarities.argmax()

# Retrieve transcripts from the most similar cluster
relevant_transcripts = df[df["Cluster"] == most_similar_cluster]["Transcript"]

# Display results
print("Query:", user_query)
print("Most relevant cluster:", most_similar_cluster)
print("Transcripts in the most relevant cluster:")
print(relevant_transcripts)

Query: What was discussed in the project meeting?
Most relevant cluster: 0
Transcripts in the most relevant cluster:
0                             Yeah, the universal one.
2         But, but 25 you need a lot of neat features.
3    Yeah. Yeah. Uh cause I mean, what €25 thats ab...
Name: Transcript, dtype: object


  self.explained_variance_ratio_ = exp_var / full_var


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Assume new_transcript is the new meeting transcript
new_transcript = "Discussion on the upcoming project deadline."

# Preprocess the new transcript
processed_new_transcript = " ".join(new_transcript.split())

# Vectorize the new transcript using the same TF-IDF vectorizer
new_transcript_vector = tfidf_vectorizer.transform([processed_new_transcript])

# Transform the new transcript vector using TruncatedSVD if needed
new_transcript_vector_lsa = lsa.transform(new_transcript_vector)

# Calculate cosine similarity between the new transcript and existing cluster centroids
similarities_to_clusters = cosine_similarity(new_transcript_vector_lsa, cluster_centers)

# Set a similarity threshold below which a new cluster will be created
similarity_threshold = 0.7

# Find the cluster with the highest similarity
most_similar_cluster_index = similarities_to_clusters.argmax()
highest_similarity = similarities_to_clusters[0, most_similar_cluster_index]

# Check if the highest similarity is below the threshold
if highest_similarity < similarity_threshold:
    # Create a new cluster
    new_cluster_index = len(cluster_centers)
    cluster_centers = np.vstack([cluster_centers, new_transcript_vector_lsa])
    df = df.append({'Transcript': new_transcript, 'Cluster': new_cluster_index}, ignore_index=True)
    print(f"New cluster created for the transcript: {new_transcript}")
else:
    # Add the new transcript to the most similar existing cluster
    df = df.append({'Transcript': new_transcript, 'Cluster': most_similar_cluster_index}, ignore_index=True)
    print(f"Added to existing cluster: {most_similar_cluster_index}")

# Display updated clusters
print("Updated Clusters:")
print(df)
