In [None]:
%pip install datasets
%pip install transformers
%pip install huggingface_hub
%pip install matplotlib seaborn

In [None]:
# Example: streaming mode for large datasets.
# Rows are fetched on demand instead of downloading the whole dataset first.

import json

from datasets import load_dataset
from tqdm import tqdm
import pandas as pd


def _conversation_key(conversation):
    """Return a hashable fingerprint of one conversation.

    Each 'conversation' cell is a nested, unhashable structure (a sequence of
    per-turn dicts), so pandas cannot deduplicate on the column directly.
    Serializing to JSON gives a string that compares equal exactly when the
    nested content is equal; `default=str` covers values that JSON cannot
    encode natively (e.g. datetimes).
    """
    return json.dumps(conversation, sort_keys=True, default=str)


# Streaming mode - data is loaded on demand, not all at once.
ds_streaming = load_dataset("allenai/WildChat-1M", split="train", streaming=True)

# Keep only English-language records (the filter is applied lazily while streaming).
ds_english_stream = ds_streaming.filter(lambda row: row['language'] == 'English')

# Oversample: scan up to `max_to_scan` English records, deduplicate them, and
# keep the first `target_unique` unique conversations.
target_unique = 1000
max_to_scan = 3000
ds_candidates = [
    item
    for item in tqdm(
        ds_english_stream.take(max_to_scan),
        total=max_to_scan,
        desc=f"Scanning {max_to_scan} English samples",
    )
]

ds_df = pd.DataFrame(ds_candidates)

# Deduplicate on the serialized conversation. Calling duplicated() or
# drop_duplicates() on the raw column would fail because its cells are unhashable.
is_duplicate = ds_df['conversation'].map(_conversation_key).duplicated(keep='first')
num_duplicates = is_duplicate.sum()

# Drop duplicate conversations (keeping the first occurrence) and cap the sample size.
ds_unique = ds_df[~is_duplicate].head(target_unique)

# Convert back to list-of-dicts for further use.
ds = ds_unique.to_dict(orient='records')

print("All rows have 'language' = 'English':", (pd.Series([row['language'] for row in ds]) == "English").all())
print(f"Number of duplicate 'conversation' rows removed in scan: {num_duplicates}")
print(f"Size of deduplicated ds: {len(ds)}")
if len(ds) < target_unique:
    print("Warning: Less than 1000 unique conversations found in scanned set!")



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import numpy as np

# Plot defaults: seaborn's white-grid style and a large default figure size.
sns.set_style("whitegrid")
plt.rc('figure', figsize=(15, 10))

# Never truncate the column list when a frame is printed.
pd.options.display.max_columns = None

# Materialize the deduplicated records as a DataFrame for analysis.
df = pd.DataFrame(ds)

# Quick overview: column names, row count, and a preview of the first rows.
print("Columns in the dataset:")
print(list(df.columns))
print(f"\nTotal records: {len(df)}")
print("\nFirst few rows:")
print(df.head())

In [None]:
from collections import defaultdict

# Preprocess data for analysis.

# Parse the timestamp column (a no-op when it is already datetime-typed) and
# derive the hour of day with the vectorized .dt accessor.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour_of_day'] = df['timestamp'].dt.hour


def _group_turns_by_identifier(conversation):
    """Group one conversation's turns by their turn identifier.

    Args:
        conversation: Iterable of turn dicts with 'turn_identifier', 'role'
            and 'content' keys, or a dict already produced by this function.

    Returns:
        A plain dict mapping turn_identifier -> {role: content}. An input that
        is already a dict is returned unchanged, so re-running this cell on
        the transformed column is harmless.
    """
    if isinstance(conversation, dict):
        return conversation
    grouped = defaultdict(dict)
    for turn in conversation:
        grouped[turn['turn_identifier']][turn['role']] = turn['content']
    # Plain dict: avoids the defaultdict wrapper in reprs and accidental key
    # creation on lookup.
    return dict(grouped)


# Record content and role for every turn in each conversation.
df['conversation'] = [_group_turns_by_identifier(conv) for conv in df['conversation']]
# Logging the first conversation.
print(f"First conversation: {df['conversation'].iloc[0]}")

# Check for missing values in the facets analysed below.
required_columns = ['model', 'turn', 'hour_of_day', 'state', 'country']
print("Missing values per column:")
print(df[required_columns].isnull().sum())
print(f"\nTotal records: {len(df)}")
print(f"Records with all required fields: {df[required_columns].notna().all(axis=1).sum()}")
unique_hours = df['hour_of_day'].nunique(dropna=True)
print(f"\nNumber of unique 'hour_of_day' values: {unique_hours}")
print(f"Unique values: {sorted(df['hour_of_day'].dropna().unique())}")


In [None]:
# Create distribution visualizations for each facet.


def _plot_top_counts(ax, counts, facet, top_n):
    """Draw a horizontal bar chart of value counts, limited to `top_n` bars.

    Args:
        ax: Matplotlib axes to draw on.
        counts: Series of counts indexed by category, ordered as it should be
            displayed (e.g. the output of `value_counts()`).
        facet: Facet name used in the panel title.
        top_n: Maximum number of bars. When `counts` has more entries, only
            the first `top_n` are drawn and the title is suffixed "(Top N)".

    Returns:
        The counts that were actually plotted (truncated when necessary).
    """
    truncated = len(counts) > top_n
    if truncated:
        counts = counts.head(top_n)
    positions = range(len(counts))
    ax.barh(positions, counts.values)
    ax.set_yticks(positions)
    ax.set_yticklabels(counts.index)
    ax.set_xlabel('Count')
    ax.set_title(f'{facet} Distribution (Top {top_n})' if truncated else f'{facet} Distribution')
    # Put the first (largest) entry at the top of the panel.
    ax.invert_yaxis()
    return counts


fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Distribution Analysis Across Facets', fontsize=16, fontweight='bold')

# 1. Model distribution
ax1 = axes[0, 0]
model_counts = _plot_top_counts(ax1, df['model'].value_counts(), 'Model', 20)

# 2. Turn (#turns) distribution
ax2 = axes[0, 1]
turn_counts = df['turn'].value_counts().sort_index()
ax2.bar(turn_counts.index, turn_counts.values, edgecolor='black', alpha=0.7)
ax2.set_xlabel('Number of Turns')
ax2.set_ylabel('Count')
ax2.set_title('Turn Distribution')
# Aim for roughly ten ticks across the observed range, never stepping by less than 1.
max_turn = int(turn_counts.index.max())
ax2.set_xticks(range(0, max_turn + 1, max(1, max_turn // 10)))

# 3. Time of day (hour) distribution
ax3 = axes[0, 2]
hour_counts = df['hour_of_day'].dropna().value_counts().sort_index()
ax3.bar(hour_counts.index, hour_counts.values, edgecolor='black', alpha=0.7, color='skyblue')
ax3.set_xlabel('Hour of Day')
ax3.set_ylabel('Count')
ax3.set_title('Timestamp Distribution (Hour of Day)')
ax3.set_xticks(range(0, 24, 2))
# Fixed limits so all 24 hours are visible even when some have no records.
ax3.set_xlim(-0.5, 23.5)

# 4. State distribution
ax4 = axes[1, 0]
state_counts = _plot_top_counts(ax4, df['state'].dropna().value_counts(), 'State', 15)

# 5. Country distribution
ax5 = axes[1, 1]
country_counts = _plot_top_counts(ax5, df['country'].dropna().value_counts(), 'Country', 15)

# 6. Summary statistics (text-only panel)
ax6 = axes[1, 2]
ax6.axis('off')
summary_text = f"""
Summary Statistics:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Total Records: {len(df):,}

Model: {df['model'].nunique()} unique models
Turn: {df['turn'].min():.0f} - {df['turn'].max():.0f} (mean: {df['turn'].mean():.1f})
Hour: {df['hour_of_day'].min():.0f}:00 - {df['hour_of_day'].max():.0f}:00
State: {df['state'].nunique()} unique states
Country: {df['country'].nunique()} unique countries

Missing Values:
  - Model: {df['model'].isnull().sum()}
  - Turn: {df['turn'].isnull().sum()}
  - Hour: {df['hour_of_day'].isnull().sum()}
  - State: {df['state'].isnull().sum()}
  - Country: {df['country'].isnull().sum()}
"""
ax6.text(0.1, 0.5, summary_text, fontsize=11, family='monospace',
         verticalalignment='center', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()

# Print top values for each facet
print("\n" + "="*60)
print("TOP VALUES FOR EACH FACET")
print("="*60)
for heading, column in [
    ("Top 10 Models", 'model'),
    ("Top 10 Turn Values", 'turn'),
    ("Hour Distribution (Top 10)", 'hour_of_day'),
    ("Top 10 States", 'state'),
    ("Top 10 Countries", 'country'),
]:
    print(f"\n📊 {heading}:")
    print(df[column].value_counts().head(10))


In [None]:
# Preview the first rows of df; as the cell's last expression it is rendered
# with the notebook's rich table display.
df.head()

In [None]:
# Columns kept for the export, mapped from their source name to a
# standardized display name.
selected_columns = dict(
    model='Model',
    conversation='Conversation',
    language='Language',
    toxic='Toxic',
    state='State',
    country='Country',
    hour_of_day='Hour of Day',
)

# Select those columns (in the order listed above) and rename them in one pass.
df_selected = df[[*selected_columns]].rename(columns=selected_columns)


In [None]:
# Add empty placeholder columns for the LLM-generated topic/facet and its embedding.
for placeholder_column in ('Topic', 'Topic_Embedding'):
    df_selected[placeholder_column] = ''

In [None]:
# Preview the first rows of the selected/renamed frame, including the new
# placeholder columns.
df_selected.head()

In [None]:
import json

# 'Conversation' cells hold nested dicts keyed by turn identifier. Written
# as-is, the CSV would contain their Python repr, which cannot be parsed back
# reliably. Serialize them to JSON instead (keys become strings, as JSON
# requires). Cells that are not dicts are written unchanged.
conversation_json = df_selected['Conversation'].map(
    lambda conv: json.dumps(
        {str(turn_id): turn for turn_id, turn in conv.items()},
        ensure_ascii=False,
        default=str,
    )
    if isinstance(conv, dict)
    else conv
)

# assign() returns a new frame, so df_selected itself keeps the dict objects.
df_selected.assign(Conversation=conversation_json).to_csv('selected_conversations.csv', index=False)