In [1]:
!pip install sentence-transformers numpy pandas




In [2]:
 # Importing
import json
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the sentence-embedding model shared by every cell below
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
print("✓ Embedding model loaded")

# Create the on-disk layout: user profiles, interaction logs, cold-start seeds
for _data_dir in ('data/profiles', 'data/logs', 'data/cold_start'):
    os.makedirs(_data_dir, exist_ok=True)
print("✓ Directory structure created")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Embedding model loaded
✓ Directory structure created


In [3]:
# CELL 3: User Profile Management
class UserProfile:
    """Per-user preference profile stored as a unit-length embedding vector.

    A profile is seeded from sample texts (cold start) and then moved toward
    texts the user rates, using an exponentially weighted average (EWA).

    Attributes:
        user_id: Identifier; also used to build the default save/load path.
        embedding_dim: Length of the profile vector (768 by default).
        profile_vector: Current preference vector, or None before initialization.
        history: Feedback records (timestamp, text, rating, alpha), oldest first.
        metadata: Bookkeeping dict (created_at, update_count, cold_start, ...).
    """

    def __init__(self, user_id: str, embedding_dim: int = 768):
        self.user_id = user_id
        self.embedding_dim = embedding_dim
        self.profile_vector = None
        self.history = []
        self.metadata = {
            'created_at': datetime.now().isoformat(),
            'update_count': 0,
            'cold_start': True
        }

    @staticmethod
    def _unit(vector: np.ndarray) -> np.ndarray:
        """Scale `vector` to unit length; a zero vector is returned unchanged."""
        norm = np.linalg.norm(vector)
        return vector / norm if norm > 0 else vector

    def initialize_from_samples(self, sample_texts: List[str], embedding_model) -> None:
        """Initialize the profile from sample texts (cold start).

        Args:
            sample_texts: Seed texts. When empty, the profile becomes an
                all-zero vector of length `embedding_dim` and the profile
                stays flagged as cold-start.
            embedding_model: Object exposing `encode(list_of_texts)` that
                returns one embedding row per text.
        """
        if not sample_texts:
            # Neutral vector: no preference in any direction
            self.profile_vector = np.zeros(self.embedding_dim)
            return

        embeddings = np.asarray(embedding_model.encode(sample_texts))
        self.profile_vector = self._unit(np.mean(embeddings, axis=0))
        # Keep the recorded dimension in sync with what the model produced
        self.embedding_dim = int(self.profile_vector.shape[0])

        self.metadata['cold_start'] = False
        self.metadata['init_samples'] = len(sample_texts)
        print(f"✓ Profile initialized from {len(sample_texts)} samples")

    def update_with_feedback(
        self,
        text: str,
        embedding_model,
        rating: float = 1.0,
        alpha: float = 0.2
    ) -> None:
        """Update the profile with user feedback using EWA.

        Args:
            text: The text the user interacted with.
            embedding_model: Object exposing `encode(text)` returning a 1-D embedding.
            rating: User rating (0-1, where 1=liked, 0=disliked).
            alpha: Learning rate for EWA.

        The blend weight `alpha * rating` is clamped to [0, 1], so an
        out-of-range rating (e.g. a negative similarity score) can neither push
        the profile away from the text nor invert the existing profile.
        """
        new_embedding = np.asarray(embedding_model.encode(text))
        # Plain floats keep the history JSON-serializable even when callers
        # pass numpy scalars (e.g. np.float32 similarity scores).
        rating = float(rating)
        alpha = float(alpha)

        if self.profile_vector is None:
            # No profile yet: adopt the (normalized) embedding as the profile
            self.profile_vector = self._unit(new_embedding)
            self.embedding_dim = int(self.profile_vector.shape[0])
            self.metadata['cold_start'] = False
        else:
            # EWA update; low ratings shrink the step toward the new text
            effective_alpha = min(max(alpha * rating, 0.0), 1.0)
            blended = (
                (1 - effective_alpha) * self.profile_vector +
                effective_alpha * new_embedding
            )
            self.profile_vector = self._unit(blended)

        now = datetime.now().isoformat()
        self.history.append({
            'timestamp': now,
            'text': text,
            'rating': rating,
            'alpha': alpha
        })
        self.metadata['update_count'] = self.metadata.get('update_count', 0) + 1
        self.metadata['last_update'] = now

    def save(self, filepath: Optional[str] = None) -> None:
        """Save the profile to a JSON file.

        Args:
            filepath: Destination path; defaults to
                `data/profiles/profile_<user_id>.json`. Missing parent
                directories are created.

        Only the 50 most recent history entries are written.
        """
        if filepath is None:
            filepath = f'data/profiles/profile_{self.user_id}.json'

        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        data = {
            'user_id': self.user_id,
            'profile_vector': self.profile_vector.tolist() if self.profile_vector is not None else None,
            'metadata': self.metadata,
            'history': self.history[-50:]  # Keep the last 50 interactions only
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        print(f"✓ Profile saved to {filepath}")

    @classmethod
    def load(cls, user_id: str, filepath: Optional[str] = None) -> "UserProfile":
        """Load a profile from a JSON file.

        Args:
            user_id: User identifier.
            filepath: Source path; defaults to
                `data/profiles/profile_<user_id>.json`.

        Returns:
            The stored profile, or a fresh empty profile (profile_vector is
            None) when the file does not exist.

        Raises:
            json.JSONDecodeError: If the file exists but is not valid JSON.
        """
        if filepath is None:
            filepath = f'data/profiles/profile_{user_id}.json'

        if not os.path.exists(filepath):
            print(f"No existing profile found for {user_id}, creating new")
            return cls(user_id)

        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        profile = cls(user_id)
        stored_vector = data.get('profile_vector')
        if stored_vector:
            profile.profile_vector = np.array(stored_vector, dtype=float)
            profile.embedding_dim = int(profile.profile_vector.shape[0])
        # Merge so the default keys survive a partially written metadata dict
        profile.metadata.update(data.get('metadata') or {})
        profile.history = data.get('history', [])

        print(f"✓ Loaded profile for {user_id}")
        return profile

# Smoke test: build an empty profile and show its initial state
print("\n Testing UserProfile Class ")
test_profile = UserProfile(user_id='test_user_001')
print("Created profile for {}".format(test_profile.user_id))
print("Metadata: {}".format(test_profile.metadata))


 Testing UserProfile Class 
Created profile for test_user_001
Metadata: {'created_at': '2025-11-03T20:40:24.309048', 'update_count': 0, 'cold_start': True}


In [4]:
# Cold-start seed texts: three short lines per theme, used to initialize a
# profile before any user feedback exists.
COLD_START_SAMPLES = dict(
    nature=[
        "The gentle breeze rustles through autumn leaves",
        "Mountains stand silent beneath the morning sky",
        "Rivers flow endlessly toward the distant sea",
    ],
    love=[
        "Your smile brightens even the darkest days",
        "Two hearts beating as one in perfect harmony",
        "Love blooms like flowers in the spring",
    ],
    melancholy=[
        "Empty streets echo with memories of the past",
        "Shadows grow longer as the day fades away",
        "Silent tears fall like rain upon my heart",
    ],
    joy=[
        "Laughter rings out across the sunlit meadow",
        "Dancing freely under stars that shine so bright",
        "Every moment sparkles with pure delight",
    ],
)

def save_cold_start_samples(filepath: str = 'data/cold_start/theme_samples.json'):
    """Write COLD_START_SAMPLES to a JSON file.

    Args:
        filepath: Destination path. Missing parent directories are created so
            the function does not depend on an earlier setup cell having run.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(COLD_START_SAMPLES, f, indent=2)
    print(f"✓ Saved cold-start samples to {filepath}")

save_cold_start_samples()

def initialize_profile_with_theme(
    user_id: str,
    theme: str,
    embedding_model
) -> UserProfile:
    """
    Initialize a new user profile with theme-based cold-start.

    Args:
        user_id: User identifier
        theme: Theme category (a key of COLD_START_SAMPLES:
            nature, love, melancholy, joy)
        embedding_model: SentenceTransformer model

    Returns:
        Initialized UserProfile. For an unknown theme a warning is printed and
        the profile gets the neutral initialization of
        `UserProfile.initialize_from_samples([])`, so `profile_vector` is
        never left as None.
    """
    profile = UserProfile(user_id)

    samples = COLD_START_SAMPLES.get(theme)
    if samples is None:
        print(f"Warning: Unknown theme '{theme}', using neutral initialization")
        # Actually perform the neutral initialization the warning announces;
        # previously the vector stayed None and callers reading it crashed.
        profile.initialize_from_samples([], embedding_model)
    else:
        profile.initialize_from_samples(samples, embedding_model)
        profile.metadata['cold_start_theme'] = theme

    return profile

# Smoke test: build theme-seeded profiles and inspect the result
print("\n Testing Cold-Start Initialization ")
for theme in ('nature', 'love'):
    print("\n" + theme.upper() + ":")
    test_prof = initialize_profile_with_theme(
        user_id='user_' + theme,
        theme=theme,
        embedding_model=embedding_model,
    )
    print("  Profile vector shape: {}".format(test_prof.profile_vector.shape))
    print("  Cold-start: {}".format(test_prof.metadata['cold_start']))

✓ Saved cold-start samples to data/cold_start/theme_samples.json

 Testing Cold-Start Initialization 

NATURE:
✓ Profile initialized from 3 samples
  Profile vector shape: (768,)
  Cold-start: False

LOVE:
✓ Profile initialized from 3 samples
  Profile vector shape: (768,)
  Cold-start: False


In [5]:
class FeedbackLogger:
    """Append-only JSONL logger for user interactions and feedback.

    Each event is written as one JSON object per line to `log_file`.
    """

    def __init__(self, log_file: str = 'data/logs/feedback.jsonl'):
        self.log_file = log_file
        # A bare filename has no directory part; os.makedirs('') would raise
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

    @staticmethod
    def _json_default(value):
        """Convert numpy scalars/arrays to plain Python values for json.dumps."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    def log_interaction(
        self,
        user_id: str,
        session_id: str,
        event_type: str,
        data: Dict
    ) -> None:
        """
        Log a user interaction event.

        Args:
            user_id: User identifier
            session_id: Session identifier
            event_type: Type of event (generate, rank, select, edit, reject)
            data: Event-specific data; numpy scalars and arrays are converted
                to plain Python values before serialization
        """
        log_entry = {
            'timestamp': datetime.now().isoformat(),
            'user_id': user_id,
            'session_id': session_id,
            'event_type': event_type,
            'data': data
        }

        # Append one JSON object per line
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(json.dumps(log_entry, default=self._json_default) + '\n')

    def load_logs(self, user_id: Optional[str] = None) -> List[Dict]:
        """Load logs, optionally filtered by user_id.

        Returns an empty list when the log file does not exist. Blank lines
        are skipped.
        """
        if not os.path.exists(self.log_file):
            return []

        logs = []
        with open(self.log_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing lines in the JSONL file
                entry = json.loads(line)
                if user_id is None or entry.get('user_id') == user_id:
                    logs.append(entry)

        return logs

    def get_user_feedback_data(self, user_id: str) -> pd.DataFrame:
        """Get feedback data for a user as DataFrame.

        Only 'select', 'reject' and 'rate' events are included. Each row holds
        the event's timestamp and event_type plus the keys of its data payload.
        """
        feedback_records = []
        for log in self.load_logs(user_id):
            if log['event_type'] not in ('select', 'reject', 'rate'):
                continue
            payload = log.get('data') or {}  # tolerate a missing/None payload
            feedback_records.append({
                'timestamp': log['timestamp'],
                'event_type': log['event_type'],
                **payload
            })

        return pd.DataFrame(feedback_records)

# Smoke test: write a few events through the logger and read them back
print("\n Testing Feedback Logger ")
logger = FeedbackLogger()

# Identifiers shared by all simulated events
test_session = 'session_test_001'
test_user = 'user_001'

# (event_type, payload) pairs, logged in this order
_liked_line = 'Moonlight on the waves, silver ripples dance freely'
_simulated_events = [
    ('generate', {'theme': 'ocean waves', 'form': 'haiku', 'num_candidates': 10}),
    ('select', {'text': _liked_line, 'rank': 1, 'score': 0.87}),
    ('rate', {'text': _liked_line, 'rating': 0.9}),
]
for _event_type, _payload in _simulated_events:
    logger.log_interaction(
        user_id=test_user,
        session_id=test_session,
        event_type=_event_type,
        data=_payload,
    )

print(f"✓ Logged 3 interactions to {logger.log_file}")

# Read the entries back for this user and list them
logs = logger.load_logs(test_user)
print(f"\n✓ Loaded {len(logs)} log entries for {test_user}")
for log in logs:
    print("  {:10s} | {}".format(log['event_type'], log['timestamp'][:19]))


 Testing Feedback Logger 
✓ Logged 3 interactions to data/logs/feedback.jsonl

✓ Loaded 3 log entries for user_001
  generate   | 2025-11-03T20:41:53
  select     | 2025-11-03T20:41:53
  rate       | 2025-11-03T20:41:53


In [6]:
# Complete Workflow Simulation
# NOTE: cosine_similarity is imported in the notebook's top-level import cell,
# so later cells no longer depend on this cell having been executed first.

# Step 1: Create a new user with a theme-based cold start
user_id = 'alice_001'
session_id = f'session_{datetime.now().strftime("%Y%m%d_%H%M%S")}'
theme = 'nature'

print(f"\n1. Creating user profile for {user_id}")
user_profile = initialize_profile_with_theme(user_id, theme, embedding_model)

# Step 2: Generate candidates (hard-coded stand-ins for real generator output)
print("\n2. Generating candidates (simulated)")
candidates = [
    "Autumn leaves dance gently in the cooling breeze",
    "Mountains stand eternal, watching seasons change",
    "The forest whispers secrets only trees can know",
    "Rivers carve their paths through ancient stone",
    "Sunrise paints the sky in shades of gold"
]
logger.log_interaction(user_id, session_id, 'generate', {
    'theme': theme,
    'num_candidates': len(candidates)
})

# Step 3: Rank candidates (simulated - would use notebook 2)
print("\n3. Ranking candidates")

# Score each candidate by cosine similarity to the user's profile vector
candidate_embeddings = embedding_model.encode(candidates)
similarities = cosine_similarity(
    candidate_embeddings,
    user_profile.profile_vector.reshape(1, -1)
).flatten()

# Highest similarity first
ranked = sorted(zip(candidates, similarities), key=lambda x: x[1], reverse=True)

print("\nRanked results:")
for i, (text, score) in enumerate(ranked, 1):
    print(f"  {i}. [{score:.3f}] {text}")

logger.log_interaction(user_id, session_id, 'rank', {
    'method': 'preference_embedding',
    'top_score': float(ranked[0][1])  # plain float so the entry is JSON-serializable
})

# Step 4: User selects a candidate (simulated: always the top-ranked one)
print("\n4. User selects top candidate")
selected_text, selected_score = ranked[0]

print(f"Selected: {selected_text}")

logger.log_interaction(user_id, session_id, 'select', {
    'text': selected_text,
    'rank': 1,
    'score': float(selected_score)
})

# Step 5: Update profile with feedback
print("\n5. Updating user profile")
rating = 0.85  # User gave positive feedback

user_profile.update_with_feedback(
    selected_text,
    embedding_model,
    rating=rating,
    alpha=0.2
)

logger.log_interaction(user_id, session_id, 'rate', {
    'text': selected_text,
    'rating': rating
})

# Step 6: Save updated profile
print("\n6. Saving profile")
user_profile.save()

print("\n✓ Workflow complete!")
print(f"  Updates: {user_profile.metadata['update_count']}")
print(f"  Cold-start: {user_profile.metadata['cold_start']}")


1. Creating user profile for alice_001
✓ Profile initialized from 3 samples

2. Generating candidates (simulated)

3. Ranking candidates

Ranked results:
  1. [0.702] Mountains stand eternal, watching seasons change
  2. [0.656] Autumn leaves dance gently in the cooling breeze
  3. [0.530] Sunrise paints the sky in shades of gold
  4. [0.481] The forest whispers secrets only trees can know
  5. [0.411] Rivers carve their paths through ancient stone

4. User selects top candidate
Selected: Mountains stand eternal, watching seasons change

5. Updating user profile

6. Saving profile
✓ Profile saved to data/profiles/profile_alice_001.json

✓ Workflow complete!
  Updates: 1
  Cold-start: False


In [8]:
#Multi-Session Simulation
def simulate_session(
    user_id: str,
    theme: str,
    candidates: List[str],
    user_preference: str,
    logger: FeedbackLogger,
    embedding_model
) -> UserProfile:
    """Simulate a complete user session.

    Loads the user's profile (cold-starting it from `theme` when the user has
    no stored vector), scores `candidates` against the profile, lets a
    simulated user pick the candidate closest to `user_preference`, logs the
    events, then updates and saves the profile.

    Args:
        user_id: User identifier.
        theme: Cold-start theme used when no profile vector exists yet.
        candidates: Candidate texts to choose from; must be non-empty.
        user_preference: Free-text description of what the simulated user likes.
        logger: FeedbackLogger receiving the 'generate' and 'select' events.
        embedding_model: SentenceTransformer model.

    Returns:
        The updated (and saved) UserProfile.

    Raises:
        ValueError: If `candidates` is empty.
    """
    if not candidates:
        raise ValueError("simulate_session requires at least one candidate")

    session_id = f'session_{datetime.now().strftime("%Y%m%d_%H%M%S_%f")}'

    # Load the stored profile; an unreadable file is treated like a new user
    try:
        profile = UserProfile.load(user_id)
    except (OSError, ValueError, KeyError) as exc:
        print(f"Warning: could not load profile for {user_id} ({exc}); starting fresh")
        profile = UserProfile(user_id)

    # UserProfile.load returns an empty profile (vector None) for unknown
    # users rather than raising, so the cold start must be triggered here.
    if profile.profile_vector is None:
        profile = initialize_profile_with_theme(user_id, theme, embedding_model)

    logger.log_interaction(user_id, session_id, 'generate', {
        'theme': theme,
        'num_candidates': len(candidates)
    })

    # Encode the candidates once; both scorings below reuse the embeddings
    candidate_embs = embedding_model.encode(candidates)

    # Score by profile (random scores only if there is still no vector)
    if profile.profile_vector is not None:
        profile_scores = cosine_similarity(
            candidate_embs,
            profile.profile_vector.reshape(1, -1)
        ).flatten()
    else:
        profile_scores = np.random.random(len(candidates))

    # Simulated user picks the candidate closest to the stated preference
    pref_emb = np.asarray(embedding_model.encode(user_preference))
    pref_scores = cosine_similarity(
        candidate_embs,
        pref_emb.reshape(1, -1)
    ).flatten()

    selected_idx = int(np.argmax(pref_scores))
    selected_text = candidates[selected_idx]
    # Cosine similarity can be negative; ratings are defined on [0, 1].
    # float() also converts numpy float32 so the value is JSON-serializable.
    rating = min(max(float(pref_scores[selected_idx]), 0.0), 1.0)

    # Position of the chosen candidate in the profile-based ranking (1 = top)
    selected_rank = 1 + int(np.sum(profile_scores > profile_scores[selected_idx]))

    logger.log_interaction(user_id, session_id, 'select', {
        'text': selected_text,
        'rank': selected_rank,
        'score': float(profile_scores[selected_idx]),
        'rating': rating
    })

    profile.update_with_feedback(selected_text, embedding_model, rating=rating)
    profile.save()

    return profile

# Run several consecutive sessions for a single user
user_id = 'bob_002'

# One entry per session: theme, candidate lines, and the simulated user's taste
sessions_data = [
    dict(
        theme='nature',
        candidates=[
            "The mountain peak touches the morning clouds",
            "Green valleys stretch far beneath the sky",
            "Ancient trees stand guard over crystal streams",
        ],
        preference="I love descriptions of mountains and valleys",
    ),
    dict(
        theme='nature',
        candidates=[
            "Winter frost decorates every blade of grass",
            "The eagle soars high above rocky cliffs",
            "Wildflowers bloom across the meadow",
        ],
        preference="I prefer imagery with animals and wildlife",
    ),
    dict(
        theme='nature',
        candidates=[
            "The hawk circles slowly in the afternoon sun",
            "Deer graze peacefully in the quiet glade",
            "Rabbits dart between the shadowy ferns",
        ],
        preference="Wildlife in natural settings appeals to me",
    ),
]

print("Simulating {} sessions for {}".format(len(sessions_data), user_id))
for i, session_data in enumerate(sessions_data, start=1):
    print("\n Session " + str(i) + " ")
    profile = simulate_session(
        user_id=user_id,
        theme=session_data['theme'],
        candidates=session_data['candidates'],
        user_preference=session_data['preference'],
        logger=logger,
        embedding_model=embedding_model,
    )
    print("  Updates: {}".format(profile.metadata['update_count']))

print("\n✓ Multi-session simulation complete")

Simulating 3 sessions for bob_002

 Session 1 
✓ Profile initialized from 3 samples
✓ Profile saved to data/profiles/profile_bob_002.json
  Updates: 1

 Session 2 
✓ Loaded profile for bob_002
✓ Profile saved to data/profiles/profile_bob_002.json
  Updates: 2

 Session 3 
✓ Loaded profile for bob_002
✓ Profile saved to data/profiles/profile_bob_002.json
  Updates: 3

✓ Multi-session simulation complete


In [9]:

# Analyze each user's feedback history

def analyze_user_feedback(user_id: str, logger: FeedbackLogger) -> Optional[pd.DataFrame]:
    """Analyze feedback patterns for a user.

    Prints the number of feedback events, the breakdown by event type and,
    when ratings are present, their mean/std/min/max.

    Args:
        user_id: User identifier.
        logger: FeedbackLogger holding the interaction log.

    Returns:
        The user's feedback DataFrame, or None when the user has no feedback.
    """
    df = logger.get_user_feedback_data(user_id)

    if df.empty:
        print(f"No feedback data for {user_id}")
        return None

    print(f"\nUser: {user_id}")
    print(f"Total feedback events: {len(df)}")
    print("\nEvent type breakdown:")
    print(df['event_type'].value_counts())

    if 'rating' in df.columns:
        ratings = df['rating'].dropna()
        if len(ratings) > 0:
            print("\nRating statistics:")
            print(f"  Mean: {ratings.mean():.3f}")
            # The sample standard deviation is undefined for one value
            # (pandas returns NaN), so say so instead of printing "nan".
            if len(ratings) > 1:
                print(f"  Std:  {ratings.std():.3f}")
            else:
                print("  Std:  n/a (single rating)")
            print(f"  Min:  {ratings.min():.3f}")
            print(f"  Max:  {ratings.max():.3f}")

    return df

# Analyze feedback for our test users
for uid in ['alice_001', 'bob_002']:
    df = analyze_user_feedback(uid, logger)


User: alice_001
Total feedback events: 2

Event type breakdown:
event_type
select    1
rate      1
Name: count, dtype: int64

Rating statistics:
  Mean: 0.850
  Std:  nan
  Min:  0.850
  Max:  0.850

User: bob_002
Total feedback events: 4

Event type breakdown:
event_type
select    4
Name: count, dtype: int64

Rating statistics:
  Mean: 0.351
  Std:  0.150
  Min:  0.164
  Max:  0.472


In [10]:
# Profile Comparison
def compare_profiles(user_ids: List[str]) -> Optional[pd.DataFrame]:
    """Compare multiple user profiles.

    Loads each user's stored profile, prints the cosine similarity of every
    pair of profile vectors, and summarizes the profiles in a DataFrame.

    Args:
        user_ids: Users whose stored profiles should be compared.

    Returns:
        DataFrame with columns user_id, updates, cold_start (one row per
        usable profile), or None when fewer than two profiles have a vector.
    """
    profiles = []
    for uid in user_ids:
        # Skip profiles that cannot be read, but say so. A bare `except: pass`
        # would also hide programming errors and KeyboardInterrupt.
        try:
            prof = UserProfile.load(uid)
        except (OSError, ValueError, KeyError) as exc:
            print(f"Warning: skipping {uid}: could not load profile ({exc})")
            continue

        if prof.profile_vector is None:
            continue  # nothing to compare for users without a vector

        profiles.append({
            'user_id': uid,
            'vector': prof.profile_vector,
            'updates': prof.metadata.get('update_count', 0),
            'cold_start': prof.metadata.get('cold_start', True)
        })

    if len(profiles) < 2:
        print("Need at least 2 profiles to compare")
        return None

    # Pairwise similarities (upper triangle only; the measure is symmetric)
    print("\n Profile Similarity Matrix ")
    n = len(profiles)
    for i in range(n):
        for j in range(i + 1, n):
            vec_i = profiles[i]['vector'].reshape(1, -1)
            vec_j = profiles[j]['vector'].reshape(1, -1)
            sim = cosine_similarity(vec_i, vec_j)[0][0]
            print(f"{profiles[i]['user_id']} <-> {profiles[j]['user_id']}: {sim:.3f}")

    # Summary table without the raw vectors
    return pd.DataFrame([
        {
            'user_id': p['user_id'],
            'updates': p['updates'],
            'cold_start': p['cold_start']
        }
        for p in profiles
    ])

# Compare the two simulated users; the summary is only printed when at least
# two profiles with a vector could be loaded
comparison_df = compare_profiles(user_ids=['alice_001', 'bob_002'])
if not (comparison_df is None):
    print("\n", comparison_df)

✓ Loaded profile for alice_001
✓ Loaded profile for bob_002

 Profile Similarity Matrix 
alice_001 <-> bob_002: 0.988

      user_id  updates  cold_start
0  alice_001        1       False
1    bob_002        3       False


In [11]:
#  Export Summary Statistics
def export_summary_stats(logger: FeedbackLogger, output_file: str = 'data/logs/summary.json'):
    """Write per-user event statistics for the whole log to a JSON file.

    For every user the summary holds the total number of events, a count per
    event type, and the number of distinct sessions. Returns the summary
    dict, or None (after printing a notice) when the log is empty.
    """
    entries = logger.load_logs()
    if len(entries) == 0:
        print("No logs to summarize")
        return

    # Pass 1: tally events per user, collecting session ids in a set
    tallies = {}
    for entry in entries:
        tally = tallies.setdefault(entry['user_id'], {
            'total_events': 0,
            'event_types': {},
            'sessions': set()
        })
        tally['total_events'] += 1
        kind = entry['event_type']
        tally['event_types'][kind] = tally['event_types'].setdefault(kind, 0) + 1
        tally['sessions'].add(entry['session_id'])

    # Pass 2: replace each (non-JSON-serializable) session set by its size
    user_stats = {
        uid: {
            'total_events': tally['total_events'],
            'event_types': tally['event_types'],
            'num_sessions': len(tally['sessions'])
        }
        for uid, tally in tallies.items()
    }

    summary = {
        'total_users': len(user_stats),
        'total_events': len(entries),
        'users': user_stats
    }

    with open(output_file, 'w') as f:
        json.dump(summary, f, indent=2)

    print("✓ Exported summary to {}".format(output_file))
    print("\nTotal users: {}".format(summary['total_users']))
    print("Total events: {}".format(summary['total_events']))

    return summary

summary = export_summary_stats(logger)


✓ Exported summary to data/logs/summary.json

Total users: 3
Total events: 15
