# Chats Data Exploration

This notebook loads `chats.json`, displays it with pandas, and produces a few simple matplotlib visualizations. It tries to automatically find the file and adapt to common JSON structures (flat lists, nested messages, or line-delimited JSON).

In [None]:
# Imports and notebook setup
%matplotlib inline
import json
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Raise pandas' display limits (up to 50 columns, 120 characters wide).
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 120)

# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship 'seaborn'. Use whichever one is installed rather
# than letting plt.style.use() raise OSError on an unknown name. If neither is
# available the default matplotlib style is kept.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break


In [None]:
# Locate the chats.json file
# Well-known locations, checked in order before falling back to a tree search.
candidates = [
    Path('..') / 'project' / 'chats.json',
    Path('.') / 'chats.json',
    Path('..') / 'chats.json',
    Path('.') / 'Moptimizer' / 'task_1_UI' / 'project' / 'chats.json',
]

# Take the first candidate that is a regular file. is_file() (rather than
# exists()) skips a directory that merely happens to be named chats.json.
data_path = next((p for p in candidates if p.is_file()), None)

if data_path is None:
    # Only search recursively when no explicit candidate matched: the walk can
    # be slow on large trees. glob() is lazy, so next() stops at the first hit.
    data_path = next(
        (p for p in Path('.').glob('**/chats.json') if p.is_file()),
        None,
    )

if data_path is None:
    raise FileNotFoundError('Could not find chats.json. Adjust the candidates or path as needed.')

print(f'Using chats.json at: {data_path.resolve()}')


In [None]:
# Load JSON into a pandas DataFrame (robust to common formats)
def load_chats_json(path: Path) -> pd.DataFrame:
    """Read the JSON file at *path* into a DataFrame.

    Three strategies are attempted in order, moving on whenever pandas
    rejects the input with ``ValueError``:

    1. ``pd.read_json(path)`` for a regular JSON document.
    2. ``pd.read_json(path, lines=True)`` for line-delimited JSON.
    3. ``json.load`` followed by ``pd.json_normalize``.

    Errors raised by the last strategy are not caught and propagate to the
    caller.
    """
    # Keyword variants for pd.read_json: plain document first, then JSON Lines.
    for reader_options in ({}, {'lines': True}):
        try:
            return pd.read_json(path, **reader_options)
        except ValueError:
            continue

    # Neither pandas reader accepted the file: parse it ourselves and flatten.
    with open(path, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return pd.json_normalize(parsed)

# Load the file at `data_path` into the notebook-wide `df` and report its shape.
df = load_chats_json(data_path)
print(f'DataFrame shape: {df.shape}')
# Bare trailing expression: Jupyter renders the first rows as the cell output.
df.head()


## Basic Summary

In [None]:
# Quick overview of columns and types
import inspect

# Show up to five random rows (fewer when the frame is smaller).
display(df.sample(min(5, len(df))))

# `datetime_is_numeric` was removed from DataFrame.describe in pandas 2.0
# (datetime columns are always summarised numerically there), so passing it
# unconditionally raises TypeError on current pandas. Forward it only when the
# installed version still accepts it.
describe_kwargs = {'include': 'all'}
if 'datetime_is_numeric' in inspect.signature(df.describe).parameters:
    describe_kwargs['datetime_is_numeric'] = True
display(df.describe(**describe_kwargs))


## Message-Level Analysis (if applicable)
This section attempts to detect a nested list of messages and computes simple statistics, such as the message-length distribution and the most frequent senders.

In [None]:
# Build `msg_df`: one row per nested message when a list-valued column is
# found, otherwise a copy of the top-level frame.
message_keys = ['messages', 'message', 'msgs', 'items', 'data']


def _is_list(cell):
    """Return True when a single cell value is a Python list."""
    return isinstance(cell, list)


# First known key that is a column holding at least one list value.
nested_key = next(
    (key for key in message_keys
     if key in df.columns and df[key].apply(_is_list).any()),
    None,
)

msg_df = None
if nested_key is not None:
    # One row per list element; rows whose cell was empty/missing are dropped.
    flattened = df[[nested_key]].explode(nested_key).dropna(subset=[nested_key])
    try:
        msg_df = pd.json_normalize(flattened[nested_key])
    except Exception:
        # Normalisation failed (e.g. elements are primitives, not dicts):
        # keep the raw elements in a single 'value' column instead.
        msg_df = flattened.rename(columns={nested_key: 'value'})

has_messages = msg_df is not None and not msg_df.empty
if not has_messages:
    print('No nested messages list detected; using top-level DataFrame for plots.')
    msg_df = df.copy()
else:
    print(f'Message records detected: {len(msg_df)} rows')
    display(msg_df.head())


In [None]:
# Choose a text column heuristically for length analysis
text_candidates = ['content', 'text', 'message', 'body', 'value']
# First candidate name that is actually a column of msg_df, if any.
text_col = next((c for c in text_candidates if c in msg_df.columns), None)

if text_col is None:
    # Fallback: first object-dtype column
    obj_cols = [c for c in msg_df.columns if msg_df[c].dtype == 'object']
    text_col = obj_cols[0] if obj_cols else None

if text_col is not None:
    # Drop missing values before measuring: astype(str) would otherwise turn
    # NaN/None into the strings 'nan'/'None' and count them as 3-4 character
    # messages, skewing the distribution.
    lengths = msg_df[text_col].dropna().astype(str).str.len()
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(lengths, bins=30, color='#4C78A8')
    ax.set_title(f'Message Length Distribution ({text_col})')
    ax.set_xlabel('Length (chars)')
    ax.set_ylabel('Count')
    plt.show()
else:
    print('Could not find a text-like column to analyze lengths.')


In [None]:
# Bar chart of the most frequent values in the first sender-like column found.
sender_candidates = ['sender', 'role', 'author', 'user', 'from', 'name']
sender_col = next(
    (name for name in sender_candidates if name in msg_df.columns),
    None,
)

if sender_col is None:
    print('No sender/role-like column found for bar chart.')
else:
    # Values are cast to str before counting; only the ten largest are kept.
    sender_labels = msg_df[sender_col].astype(str)
    top_counts = sender_labels.value_counts().head(10)

    fig, ax = plt.subplots(figsize=(8, 4))
    top_counts.plot(kind='bar', ax=ax, color='#72B7B2')
    ax.set_title(f'Top {len(top_counts)} by {sender_col}')
    ax.set_xlabel(sender_col)
    ax.set_ylabel('Count')

    # Slant the tick labels so longer names do not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
