In [7]:
import pandas as pd

In [13]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Load the two input tables from CSV files (paths are relative to the current
# working directory). Both names are rebound at notebook/module level.
# NOTE(review): presumably these files carry the columns the scoring functions
# below index into ('datetime'/'weekdate', 'artist_name', 'track_name',
# 'weighting', ...) -- confirm against the actual CSV headers.
listening_df = pd.read_csv("datasets/user_clean/ReRe_20250602_164123.csv")
charts_df = pd.read_csv("datasets/info_clean/info_charts_weighted.csv")

def calculate_chart_listening_points(listening_df, charts_df, window_days=30):
    """
    Calculate points for listening to chart songs within a rolling window.

    A listen earns the sum of the 'weighting' values of every chart row with
    the same artist_name/track_name whose 'weekdate' lies no more than
    ``window_days`` before or after the listen timestamp (bounds inclusive).
    The input frames are not modified.

    Parameters:
    - listening_df: DataFrame with columns ['datetime', 'artist_name', 'track_name']
    - charts_df: DataFrame with columns ['weekdate', 'artist_name', 'track_name', 'position', 'weighting']
    - window_days: Half-width of the window in days (default 30 for ~1 month)

    Returns:
    - DataFrame with one row per listening instance (input order) and the
      columns ['datetime', 'artist_name', 'track_name', 'points_awarded',
      'chart_weeks_matched', 'chart_weeks_detail']. The columns are present
      even when there are no listens at all.
    """
    result_columns = [
        'datetime', 'artist_name', 'track_name',
        'points_awarded', 'chart_weeks_matched', 'chart_weeks_detail',
    ]

    # Parse the timestamps into new objects so the callers' frames stay untouched.
    listen_times = pd.to_datetime(listening_df['datetime'])
    charts = charts_df.assign(weekdate=pd.to_datetime(charts_df['weekdate']))

    window = timedelta(days=window_days)

    # Split the chart table by song once, so each listen only inspects the
    # chart rows of its own song instead of rescanning the whole table.
    # groupby drops rows whose key is missing; such rows could never satisfy
    # an equality match either.
    charts_by_song = {
        song: song_charts
        for song, song_charts in charts.groupby(['artist_name', 'track_name'], sort=False)
    }

    results = []
    for listen_datetime, artist, track in zip(
        listen_times, listening_df['artist_name'], listening_df['track_name']
    ):
        total_points = 0
        weeks_matched = 0
        weeks_detail = []

        song_charts = charts_by_song.get((artist, track))
        if song_charts is not None:
            # Symmetric window: window_days before and after the listen.
            in_window = (
                (song_charts['weekdate'] >= listen_datetime - window)
                & (song_charts['weekdate'] <= listen_datetime + window)
            )
            chart_matches = song_charts[in_window]
            if not chart_matches.empty:
                total_points = chart_matches['weighting'].sum()
                weeks_matched = len(chart_matches)
                # Keep which chart weeks contributed, for later inspection.
                weeks_detail = chart_matches[
                    ['weekdate', 'position', 'weighting']
                ].to_dict('records')

        results.append({
            'datetime': listen_datetime,
            'artist_name': artist,
            'track_name': track,
            'points_awarded': total_points,
            'chart_weeks_matched': weeks_matched,
            'chart_weeks_detail': weeks_detail,
        })

    # Passing the column list guarantees the schema even for zero listens.
    points_df = pd.DataFrame(results, columns=result_columns)
    if points_df.empty:
        # Give the empty column a datetime dtype so `.dt` access still works.
        points_df['datetime'] = pd.to_datetime(points_df['datetime'])
    return points_df

def calculate_listener_summary(points_df):
    """
    Calculate summary statistics for the listener.

    Parameters:
    - points_df: DataFrame returned from calculate_chart_listening_points;
      the columns 'datetime' and 'points_awarded' are read. 'datetime' may
      hold datetimes or parseable strings.

    Returns:
    - Dictionary with the keys 'total_listening_instances',
      'chart_song_listens', 'chart_hit_rate', 'total_points',
      'average_points_per_listen', 'best_day_points', 'best_week_points'
      and 'best_month_points'. Every value is 0 when points_df has no rows.
    """

    total_listens = len(points_df)
    if total_listens == 0:
        # Nothing to aggregate. Returning early also covers a frame that has
        # no columns at all and avoids a NaN average.
        return {
            'total_listening_instances': 0,
            'chart_song_listens': 0,
            'chart_hit_rate': 0,
            'total_points': 0,
            'average_points_per_listen': 0,
            'best_day_points': 0,
            'best_week_points': 0,
            'best_month_points': 0
        }

    points = points_df['points_awarded']
    # Parse defensively: after a CSV round trip the column holds strings and
    # the `.dt` accessor would fail.
    timestamps = pd.to_datetime(points_df['datetime'])

    chart_listens = int((points > 0).sum())

    # Points by time period (calendar day, week and month of the listen)
    daily_points = points.groupby(timestamps.dt.date).sum()
    weekly_points = points.groupby(timestamps.dt.to_period('W')).sum()
    monthly_points = points.groupby(timestamps.dt.to_period('M')).sum()

    return {
        'total_listening_instances': total_listens,
        'chart_song_listens': chart_listens,
        'chart_hit_rate': chart_listens / total_listens,
        'total_points': points.sum(),
        'average_points_per_listen': points.mean(),
        # The groupings can still be empty when every timestamp is missing.
        'best_day_points': daily_points.max() if not daily_points.empty else 0,
        'best_week_points': weekly_points.max() if not weekly_points.empty else 0,
        'best_month_points': monthly_points.max() if not monthly_points.empty else 0
    }

def optimize_for_large_datasets(listening_df, charts_df, window_days=30):
    """
    Vectorized scorer for large datasets.

    Applies the same scoring rule as calculate_chart_listening_points: a
    listen earns the summed 'weighting' of all chart rows for the same
    artist_name/track_name whose 'weekdate' lies within ``window_days``
    before or after the listen (bounds inclusive). It does so with a single
    merge and group-by instead of per-row Python loops. The merge
    materializes one row per (listen, chart week) pair of the same song, so
    it requires sufficient RAM. The input frames are not modified.

    Parameters:
    - listening_df: DataFrame with columns ['datetime', 'artist_name', 'track_name']
    - charts_df: DataFrame with columns ['weekdate', 'artist_name', 'track_name', 'weighting']
    - window_days: Half-width of the window in days (default 30)

    Returns:
    - DataFrame with one row per listening instance, in input order, with the
      columns ['datetime', 'artist_name', 'track_name', 'points_awarded',
      'chart_weeks_matched']. Listens without a chart match get 0 for both
      numeric columns. The per-week detail column of
      calculate_chart_listening_points is not produced.
    """
    song_cols = ['artist_name', 'track_name']
    result_columns = ['datetime'] + song_cols + ['points_awarded', 'chart_weeks_matched']

    # Work on trimmed copies with a clean 0..n-1 index that doubles as listen id.
    listens = listening_df[['datetime'] + song_cols].reset_index(drop=True).copy()
    listens['datetime'] = pd.to_datetime(listens['datetime'])

    # Chart rows with a missing key can never match a listen; drop them so the
    # merge (which pairs missing keys with each other) cannot create matches.
    charts = charts_df[['weekdate', 'weighting'] + song_cols].dropna(subset=song_cols).copy()

    if listens.empty or charts.empty:
        # Nothing can match; still return the full schema.
        return listens.assign(points_awarded=0, chart_weeks_matched=0)[result_columns]

    charts['weekdate'] = pd.to_datetime(charts['weekdate'])

    # Pair every listen with every chart week of the same song ...
    pairs = listens.assign(_listen_id=listens.index).merge(charts, on=song_cols, how='inner')

    # ... and keep only the pairs inside the symmetric window.
    window = pd.Timedelta(days=window_days)
    in_window = (
        (pairs['weekdate'] >= pairs['datetime'] - window)
        & (pairs['weekdate'] <= pairs['datetime'] + window)
    )
    matched = pairs.loc[in_window].groupby('_listen_id')['weighting']

    # Re-align to all listens so non-matching listens are kept with 0 points.
    points = matched.sum().reindex(listens.index, fill_value=0)
    weeks = matched.size().reindex(listens.index, fill_value=0)

    listens['points_awarded'] = points.to_numpy()
    listens['chart_weeks_matched'] = weeks.to_numpy()

    return listens[result_columns]

# Example usage and data preparation functions
def prepare_sample_data():
    """
    Create sample data for testing the functions.

    Returns:
    - Tuple (listening_df, charts_df): five sample listens with the columns
      ['datetime', 'artist_name', 'track_name'] and nine sample chart rows
      with the columns ['weekdate', 'artist_name', 'track_name', 'position',
      'weighting']. Dates are left as strings.
    """

    # Sample listening data
    listening_data = {
        'datetime': [
            '2024-01-15 14:30:25.123',
            '2024-01-16 09:15:30.456',
            '2024-01-20 20:45:10.789',
            '2024-02-01 11:20:05.234',
            '2024-02-15 16:35:45.567'
        ],
        'artist_name': ['Taylor Swift', 'Ed Sheeran', 'Billie Eilish', 'Taylor Swift', 'Drake'],
        'track_name': ['Anti-Hero', 'Shape of You', 'Bad Guy', 'Shake It Off', 'God\'s Plan']
    }

    # Sample chart data
    chart_data = {
        'weekdate': [
            '2024-01-08', '2024-01-15', '2024-01-22',
            '2024-01-08', '2024-01-15', '2024-01-22',
            '2024-02-05', '2024-02-12', '2024-02-19'
        ],
        'artist_name': [
            'Taylor Swift', 'Taylor Swift', 'Taylor Swift',
            'Ed Sheeran', 'Ed Sheeran', 'Billie Eilish',
            'Taylor Swift', 'Drake', 'Drake'
        ],
        'track_name': [
            'Anti-Hero', 'Anti-Hero', 'Anti-Hero',
            'Shape of You', 'Shape of You', 'Bad Guy',
            'Shake It Off', 'God\'s Plan', 'God\'s Plan'
        ],
        'position': [1, 2, 5, 10, 15, 8, 25, 3, 7],
        'weighting': [50, 49, 46, 41, 36, 43, 26, 48, 44]
    }

    listening_df = pd.DataFrame(listening_data)
    charts_df = pd.DataFrame(chart_data)

    return listening_df, charts_df


# Main execution example
def main():
    """
    Run the scoring functions on the sample data and print a short report.

    Not called automatically: running this file or notebook cell only defines
    the function, so the data loaded at the top is left alone. Call main()
    explicitly to see the demonstration output.
    """
    # Load your data (replace with your actual data loading)
    # listening_df = pd.read_csv('your_listening_data.csv')
    # charts_df = pd.read_csv('your_chart_data.csv')

    # For demonstration, use sample data
    listening_df, charts_df = prepare_sample_data()

    print("Sample Listening Data:")
    print(listening_df.head())
    print("\nSample Chart Data:")
    print(charts_df.head())

    # Calculate points
    points_result = calculate_chart_listening_points(listening_df, charts_df, window_days=30)

    print("\nPoints Calculation Results:")
    print(points_result[['datetime', 'artist_name', 'track_name', 'points_awarded', 'chart_weeks_matched']])

    # Get summary statistics
    summary = calculate_listener_summary(points_result)

    print("\nListener Summary:")
    for key, value in summary.items():
        if isinstance(value, float):
            print(f"{key}: {value:.2f}")
        else:
            print(f"{key}: {value}")

    # Show detailed breakdown for chart hits
    chart_hits = points_result[points_result['points_awarded'] > 0]
    if not chart_hits.empty:
        print(f"\nDetailed Chart Hits ({len(chart_hits)} songs):")
        for idx, row in chart_hits.iterrows():
            print(f"\n{row['datetime'].strftime('%Y-%m-%d %H:%M')}: {row['artist_name']} - {row['track_name']}")
            print(f"  Points awarded: {row['points_awarded']}")
            print(f"  Chart weeks matched: {row['chart_weeks_matched']}")

In [14]:
# NOTE(review): `results` is only ever assigned as a local variable inside the
# functions defined above; nothing binds it at notebook level, which is why
# evaluating it here raises the NameError shown below.
results

NameError: name 'results' is not defined