In [1]:
# Step 1: Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import emoji
import numpy as np

# Step 2: Load the raw chat export
df = pd.read_csv("chat_logs.csv")

# Normalise message_text in a single chained pass:
#   1. missing values become empty strings,
#   2. text is lower-cased,
#   3. emoji characters are replaced with '' (i.e. removed).
df['message_text'] = (
    df['message_text']
    .fillna('')
    .str.lower()
    .apply(lambda message: emoji.replace_emoji(message, replace=''))
)

# Instantiate the 'all-MiniLM-L6-v2' sentence encoder once so every call reuses it
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embedding helper
def get_embedding(text):
    """Encode ``text`` with the module-level ``model``.

    Thin wrapper: ``text`` is forwarded unchanged to ``model.encode`` and the
    value that call produces is returned as-is.
    """
    embedding = model.encode(text)
    return embedding

# Embed the messages.
# model.encode accepts a list of sentences and batches them internally, so the
# whole column is encoded in one call instead of one Python-level call per row
# (which is what Series.apply(get_embedding) did).
message_embeddings = model.encode(df['message_text'].tolist())

# Store one vector per row: list(...) splits the 2-D result into 1-D rows so
# pandas keeps them as an object column of NumPy arrays.
df['embedding'] = list(message_embeddings)

# Display the first 8 rows.
# Only the last expression of a cell is rendered automatically, so a mid-cell
# preview needs an explicit display() call to actually show up.
display(df.head(8))

# No separate "convert to NumPy" step is needed here: encode() returns NumPy
# arrays by default, so every entry of df['embedding'] is already an ndarray.

# Mean-pool the message vectors of each conversation:
# the vectors of one conversation_id are stacked into a 2-D array and averaged
# over axis 0, leaving one vector per conversation.
aggregated_df = (
    df.groupby('conversation_id')['embedding']
    .apply(lambda vectors: np.stack(vectors.tolist()).mean(axis=0))
    .reset_index()
)

# Preview the pooled, per-conversation frame
aggregated_df.head()



Unnamed: 0,conversation_id,embedding
0,C001,"[-0.09594757, -0.03010091, 0.01569206, -0.0161..."
1,C002,"[0.018403964, -0.005646229, 0.050169148, 0.029..."
2,C003,"[-0.048605897, -0.050081164, 0.01672544, -0.03..."
3,C004,"[-0.020637227, 0.0013028742, 0.038435906, 0.03..."
4,C005,"[-0.045043435, -0.017297855, 0.035637997, 0.03..."


In [3]:
# Optional: save the output to a CSV file for practice or modeling.
# A CSV cell can only hold text, and to_csv would otherwise write each ndarray
# using NumPy's string form (whitespace-separated, wrapped over several lines,
# rounded to print precision), which json/ast cannot parse back. Converting
# every vector to a plain Python list first makes the cell a
# "[v0, v1, ...]" literal that json.loads / ast.literal_eval can read.
# aggregated_df itself is left untouched; only the exported copy is converted.
aggregated_df.assign(
    embedding=aggregated_df["embedding"].map(lambda vec: np.asarray(vec).tolist())
).to_csv("processed_logs.csv", index=False)

# Display the final aggregated DataFrame.
# Kept as the last expression of the cell so the notebook actually renders it
# (a bare expression followed by another statement is silently discarded).
aggregated_df.head()


In [4]:
# Step 10: Spread each pooled vector over one numeric column per dimension.
# The width is read from the first row's vector (384 in the output shown below),
# and the columns are named emb_0 .. emb_{dim-1}.
embedding_dim = len(aggregated_df["embedding"].iloc[0])
embedding_cols = [f"emb_{i}" for i in range(embedding_dim)]
expanded_embeddings = pd.DataFrame(
    data=aggregated_df["embedding"].tolist(),
    columns=embedding_cols,
)

# Step 11: Put conversation_id in front of the expanded embedding columns
final_df = pd.concat(
    objs=[aggregated_df["conversation_id"], expanded_embeddings],
    axis="columns",
)

# Step 12: Write the flat table to Parquet (fastparquet engine, no index column)
final_df.to_parquet(path="processed_logs.parquet", engine="fastparquet", index=False)

# Step 13: Confirm the write and preview the result
print("✅ Saved to processed_logs.parquet")
final_df.head()


✅ Saved to processed_logs.parquet


Unnamed: 0,conversation_id,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,C001,-0.095948,-0.030101,0.015692,-0.016129,-0.025472,0.03928,0.012482,-0.019484,0.00916,...,-0.010374,-0.006328,0.014908,-0.015783,0.01339,0.012218,0.028181,-0.012919,0.012862,-0.060929
1,C002,0.018404,-0.005646,0.050169,0.029457,0.039306,-0.046451,0.019339,-0.037919,0.074993,...,0.012952,-0.034305,0.00437,0.009989,0.017085,-0.003597,0.055872,-0.05952,-0.017476,-0.028828
2,C003,-0.048606,-0.050081,0.016725,-0.03318,0.038236,0.041966,-0.020837,-0.073376,-0.057875,...,0.004208,0.092576,0.063713,-0.013102,0.031588,0.055216,0.053282,0.056095,-0.006826,0.043345
3,C004,-0.020637,0.001303,0.038436,0.036454,0.036631,-0.055382,-0.031457,-0.02913,-0.022596,...,0.008629,0.01006,-0.015715,-0.030612,0.037554,0.037731,0.028285,-0.051292,-0.028181,0.011046
4,C005,-0.045043,-0.017298,0.035638,0.032618,0.054167,-0.029294,0.029632,-0.006792,-0.005922,...,0.00747,-0.065273,-0.047404,-0.001722,-0.087354,-0.03507,0.06116,-0.071888,-0.052472,0.009798


In [5]:
# NOTE(review): this cell repeats the mean-pooling and the Parquet export that
# the earlier cells of this notebook already perform; it rebuilds aggregated_df
# and final_df and writes processed_logs.parquet again.

# Make sure every stored embedding is a NumPy array before pooling
df['embedding'] = df['embedding'].apply(np.array)

# Mean pooling: average all message vectors that share a conversation_id
aggregated_df = (
    df.groupby('conversation_id')['embedding']
    .apply(lambda vectors: np.mean(vectors.tolist(), axis=0))
    .reset_index()
)

# One numeric column per embedding dimension, named emb_0 .. emb_{dim-1}
embedding_dim = len(aggregated_df['embedding'].iloc[0])
embedding_cols = [f'emb_{i}' for i in range(embedding_dim)]
expanded_embeddings = pd.DataFrame(
    data=aggregated_df['embedding'].tolist(),
    columns=embedding_cols,
)

# conversation_id first, followed by the expanded embedding columns
final_df = pd.concat(
    objs=[aggregated_df['conversation_id'], expanded_embeddings],
    axis="columns",
)

# Persist the structured table as Parquet for downstream analysis
final_df.to_parquet(path="processed_logs.parquet", engine="fastparquet", index=False)

# Preview the final output to confirm the transformation
final_df.head()


Unnamed: 0,conversation_id,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,C001,-0.095948,-0.030101,0.015692,-0.016129,-0.025472,0.03928,0.012482,-0.019484,0.00916,...,-0.010374,-0.006328,0.014908,-0.015783,0.01339,0.012218,0.028181,-0.012919,0.012862,-0.060929
1,C002,0.018404,-0.005646,0.050169,0.029457,0.039306,-0.046451,0.019339,-0.037919,0.074993,...,0.012952,-0.034305,0.00437,0.009989,0.017085,-0.003597,0.055872,-0.05952,-0.017476,-0.028828
2,C003,-0.048606,-0.050081,0.016725,-0.03318,0.038236,0.041966,-0.020837,-0.073376,-0.057875,...,0.004208,0.092576,0.063713,-0.013102,0.031588,0.055216,0.053282,0.056095,-0.006826,0.043345
3,C004,-0.020637,0.001303,0.038436,0.036454,0.036631,-0.055382,-0.031457,-0.02913,-0.022596,...,0.008629,0.01006,-0.015715,-0.030612,0.037554,0.037731,0.028285,-0.051292,-0.028181,0.011046
4,C005,-0.045043,-0.017298,0.035638,0.032618,0.054167,-0.029294,0.029632,-0.006792,-0.005922,...,0.00747,-0.065273,-0.047404,-0.001722,-0.087354,-0.03507,0.06116,-0.071888,-0.052472,0.009798
