In [4]:
import pandas as pd
import json
from typing import List, Dict
import random
import numpy as np

def convert_to_serializable(obj):
    """Convert numpy scalar and array values to plain Python types.

    Args:
        obj: Any value. numpy booleans, integers, floats and arrays are
            converted; every other value is returned unchanged.

    Returns:
        ``bool`` for ``np.bool_``, ``int`` for numpy integers, ``float`` for
        numpy floats, a list for ``np.ndarray``; otherwise ``obj`` itself.
    """
    # np.bool_ is not a subclass of np.integer, and json.dumps rejects it,
    # so it needs its own branch.
    if isinstance(obj, np.bool_):
        return bool(obj)
    # np.int64 / np.float64 are subclasses of np.integer / np.floating, so
    # the abstract base classes cover every width.
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj

def get_similar_videos(df: pd.DataFrame, current_video: pd.Series, num_samples: int = 3) -> List[Dict]:
    """
    Randomly pick up to ``num_samples`` videos related to ``current_video``.

    Candidates are chosen with progressively looser filters, always
    excluding rows whose 'Title' equals the current video's title:
      1. same 'Category' and same 'Duration Category';
      2. same 'Category' only;
      3. any other row.

    Returns a list of row dicts (values passed through
    ``convert_to_serializable``), or an empty list when there is nothing
    to sample.
    """
    same_category = df['Category'] == current_video['Category']
    same_duration = df['Duration Category'] == current_video['Duration Category']
    other_title = df['Title'] != current_video['Title']

    # Strictest filter first; relax it only while nothing matches.
    candidates = df[same_category & same_duration & other_title]
    if len(candidates) == 0:
        candidates = df[same_category & other_title]
    if len(candidates) == 0:
        candidates = df[other_title]

    # Never ask for more rows than are available.
    take = min(num_samples, len(candidates))
    if take <= 0:
        return []

    picked = candidates.sample(n=take).to_dict('records')
    return [
        {key: convert_to_serializable(value) for key, value in rec.items()}
        for rec in picked
    ]

def create_instruction_dataset(df: pd.DataFrame) -> List[Dict]:
    """
    Build one instruction-following sample per usable row of ``df``.

    For each row the sample contains:
      * 'instruction': a randomly chosen prompt template;
      * 'input': JSON text describing the current video plus the titles and
        categories of up to three rows that precede it in ``df``;
      * 'output': JSON text listing the recommendations returned by
        ``get_similar_videos`` with a short reason for each.

    Rows are skipped when no similar videos are found, when the payload is
    not JSON-serializable, or when any other error occurs (the error is
    printed and processing continues with the next row).

    Args:
        df: Frame with 'Title', 'Category', 'Duration', 'Duration Category'
            and 'Days Since Publication' columns.

    Returns:
        List of dicts with 'instruction', 'input' and 'output' string values.
    """
    instructions = []

    instruction_templates = [
        "Given this video viewing history, recommend similar videos:",
        "Based on this watched video, suggest what to watch next:",
        "Find videos similar to this one in terms of content and duration:",
        "Recommend videos for someone who enjoyed watching this:",
        "What other videos would interest someone who watched this video?"
    ]

    # iterrows() yields index *labels*, while iloc slices by *position*.
    # Track the position separately so the history window is correct even
    # when the index is not a default 0..n-1 range (e.g. after filtering).
    for position, (idx, row) in enumerate(df.iterrows()):
        try:
            # Up to three rows immediately preceding the current one.
            history = df.iloc[max(0, position - 3):position].to_dict('records')
            history = [{k: convert_to_serializable(v) for k, v in record.items()} for record in history]

            video_context = {
                'current_video': {
                    'title': row['Title'],
                    'category': row['Category'],
                    'duration': convert_to_serializable(row['Duration']),
                    'duration_category': convert_to_serializable(row['Duration Category']),
                    'days_since_publication': convert_to_serializable(row['Days Since Publication'])
                },
                'viewing_history': [{'title': v['Title'], 'category': v['Category']} for v in history]
            }

            similar_videos = get_similar_videos(df, row)

            if not similar_videos:
                continue

            output = {
                'recommendations': [
                    {
                        'title': video['Title'],
                        'category': video['Category'],
                        'reason': (f"Similar {video['Category']} content with {video['Duration Category']} duration"
                                 if video['Duration Category'] == row['Duration Category']
                                 else f"Similar {video['Category']} content")
                    }
                    for video in similar_videos
                ]
            }

            # Serialize once; a TypeError means the row holds a value JSON
            # cannot represent, so the row is dropped.
            try:
                input_json = json.dumps(video_context, indent=2)
                output_json = json.dumps(output, indent=2)
            except TypeError:
                continue

            instructions.append({
                'instruction': random.choice(instruction_templates),
                'input': input_json,
                'output': output_json
            })

        except Exception as e:
            # Best-effort: one bad row must not abort the whole dataset.
            print(f"Error processing row {idx}: {str(e)}")
            continue

    return instructions

def create_validation_samples(df: pd.DataFrame, num_samples: int = 50) -> List[Dict]:
    """Create up to ``num_samples`` validation samples from random rows.

    Each iteration draws one random row, asks ``get_similar_videos`` for up
    to two recommendations, and stores a dict with 'instruction', 'input'
    and 'output' (the latter two are JSON text). Rows are drawn
    independently, so the same video may appear more than once.

    Failed iterations (no similar videos, or any exception) count toward a
    budget of ``num_samples * 2`` attempts; once it is spent the samples
    collected so far are returned.

    Args:
        df: Frame with 'Title', 'Category' and 'Duration' columns (plus
            whatever ``get_similar_videos`` reads).
        num_samples: Target number of samples.

    Returns:
        List of sample dicts; may be shorter than ``num_samples``.
    """
    validation_samples = []

    # Sampling from an empty frame raises on every attempt; return early
    # instead of printing the same error max_attempts times.
    if len(df) == 0:
        return validation_samples

    attempts = 0
    max_attempts = num_samples * 2  # Allow some retry attempts

    while len(validation_samples) < num_samples and attempts < max_attempts:
        try:
            row = df.sample(1).iloc[0]

            similar_videos = get_similar_videos(df, row, num_samples=2)

            if not similar_videos:
                attempts += 1
                continue

            # json.dumps raises here if a value is not serializable, so no
            # separate verification pass over the finished sample is needed.
            sample = {
                'instruction': "Recommend similar videos based on this viewing:",
                'input': json.dumps({
                    'video': {
                        'title': row['Title'],
                        'category': row['Category'],
                        'duration': convert_to_serializable(row['Duration']),
                    }
                }, indent=2),
                'output': json.dumps({
                    'recommendations': [
                        {
                            'title': video['Title'],
                            'category': video['Category'],
                        }
                        for video in similar_videos
                    ]
                }, indent=2)
            }

        except Exception as e:
            # Best-effort: report the failure, spend one attempt, retry.
            print(f"Error creating validation sample: {str(e)}")
            attempts += 1
            continue

        validation_samples.append(sample)

    return validation_samples

def save_instruction_dataset(instructions: List[Dict], output_path: str):
    """Write ``instructions`` to ``output_path`` as JSONL.

    The file is opened for writing as UTF-8 (replacing any existing
    content) and receives one ``json.dumps`` line per element.
    """
    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.writelines(json.dumps(entry) + '\n' for entry in instructions)

def main():
    """Run the pipeline: load the CSV, build both datasets, save them.

    Reads 'cleaned_data_final.csv', writes 'train_instructions.jsonl' and
    'valid_instructions.jsonl', and prints progress, the sample counts and
    the first training sample. Raises ValueError when the CSV has no rows
    or when either dataset comes out empty. Any exception is printed and
    then re-raised.
    """
    try:
        print("Loading dataset...")
        dataset = pd.read_csv('cleaned_data_final.csv')
        if dataset.shape[0] == 0:
            raise ValueError("Empty dataset")

        print("Creating training instructions...")
        train_instructions = create_instruction_dataset(dataset)
        if not train_instructions:
            raise ValueError("No training instructions could be created")

        print("Creating validation instructions...")
        valid_instructions = create_validation_samples(dataset)
        if not valid_instructions:
            raise ValueError("No validation instructions could be created")

        print("Saving datasets...")
        targets = (
            (train_instructions, 'train_instructions.jsonl'),
            (valid_instructions, 'valid_instructions.jsonl'),
        )
        for samples, destination in targets:
            save_instruction_dataset(samples, destination)

        print(f"Created {len(train_instructions)} training samples")
        print(f"Created {len(valid_instructions)} validation samples")

        # Show the first training sample as a quick sanity check.
        print("\nExample instruction:")
        print(json.dumps(train_instructions[0], indent=2))

    except Exception as e:
        # Report the failure, then let it propagate to the caller.
        print(f"An error occurred: {str(e)}")
        raise

# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

Loading dataset...
Creating training instructions...
Creating validation instructions...
Saving datasets...
Created 130 training samples
Created 50 validation samples

Example instruction:
{
  "instruction": "Recommend videos for someone who enjoyed watching this:",
  "input": "{\n  \"current_video\": {\n    \"title\": \"Why is the Indian Economy bleeding talent? Socio-economic case study\",\n    \"category\": \"Education\",\n    \"duration\": 1227,\n    \"duration_category\": 5,\n    \"days_since_publication\": 0\n  },\n  \"viewing_history\": []\n}",
  "output": "{\n  \"recommendations\": [\n    {\n      \"title\": \"Is Modi Confident of a 2024 Win? | What's the Maths of INDIA Alliance? | Akash Banerjee & Adwaith\",\n      \"category\": \"Education\",\n      \"reason\": \"Similar Education content with 5 duration\"\n    }\n  ]\n}"
}
