## <span style="color:red">Generating Log File With Corrupted Data</span>

In [3]:
import os
import random
from datetime import datetime

def generate_log_file(file_path):
    """Write a four-line sample listening log to ``file_path``.

    Each well-formed line has the shape ``sng_id | user_id | country``.
    The file is deliberately corrupted so that parsers can be exercised:

    * lines 1 and 2 are well-formed;
    * line 3 repeats the ids of line 2 but omits the country field;
    * line 4 has an empty user id and reuses the country of line 1.

    Any existing file at ``file_path`` is overwritten.
    """
    countries = ["BL", "GN", "BN", "ST", "PW", "HK", "NL", "HM", "NR", "GT", "GU"]

    with open(file_path, 'w') as log:
        # Two valid entries: 8-digit song id, 7-digit user id, country code.
        first_song = random.randint(10000000, 99999999)
        first_user = random.randint(1000000, 9999999)
        first_country = random.choice(countries)

        second_song = random.randint(10000000, 99999999)
        second_user = random.randint(1000000, 9999999)
        second_country = random.choice(countries)

        # Song id used by the entry whose user id is left blank.
        orphan_song = random.randint(10000000, 99999999)

        log.writelines([
            f"{first_song} | {first_user} | {first_country}\n",
            f"{second_song} | {second_user} | {second_country}\n",
            # Corrupted: the country field is missing entirely.
            f"{second_song} | {second_user}\n",
            # Corrupted: the user id field is empty.
            f"{orphan_song} | | {first_country}\n",
        ])

def main():
    """Create the sample log folder and write the corrupted sample log into it."""
    log_dir = "sample_listen_custom"
    # exist_ok=True lets the cell be re-run without failing on the folder.
    os.makedirs(log_dir, exist_ok=True)

    # A single four-line log file is produced inside that folder.
    generate_log_file(os.path.join(log_dir, "listen-custom.log"))


if __name__ == "__main__":
    main()


## <span style="color:red">Log File Analysis - Top 50 Songs by Country</span>


In [4]:
# Import required libraries
import glob  # For file pattern matching
from collections import defaultdict  # For creating a nested defaultdict

# Function to process a single log file and count song streams per country
def process_log_file(log_file_path):
    """Count song streams per country in a single log file.

    Every line is expected to look like ``sng_id | user_id | country``.
    Whitespace around each field is ignored, so ``"1 | 2 | BL"`` and
    ``"1|2|BL"`` are counted under the same country key ``"BL"``.

    Rows are skipped (not raised on) when they:

    * do not contain exactly three ``|``-separated fields,
    * have a song id or user id that is not an integer,
    * have an empty country.

    Args:
        log_file_path: Path of the log file to read.

    Returns:
        A nested ``defaultdict`` mapping ``country -> sng_id -> stream count``,
        where ``sng_id`` is an ``int`` and ``country`` is the stripped string.
    """
    # Create a nested defaultdict to store song stream counts for each country
    streams_data = defaultdict(lambda: defaultdict(int))

    with open(log_file_path, 'r') as f:
        for line in f:
            # Strip every field individually: the separator in the logs is
            # " | ", so splitting on '|' alone leaves padding around values.
            fields = [field.strip() for field in line.split('|')]
            if len(fields) != 3:
                continue

            raw_sng_id, raw_user_id, country = fields
            if not country:
                # A row without a country cannot be attributed to any chart.
                continue

            try:
                sng_id = int(raw_sng_id)
                # The user id is not aggregated; it is only validated.
                int(raw_user_id)
            except ValueError:
                # Missing or non-numeric ids mark the row as corrupted.
                continue

            streams_data[country][sng_id] += 1

    return streams_data

# Function to compute the top 50 songs per country based on their stream counts
def compute_top_50(streams_data):
    """Return, for each country, its 50 most-streamed songs.

    Args:
        streams_data: Mapping ``country -> {sng_id: stream count}``.

    Returns:
        A dict mapping each country to a list of ``(sng_id, stream count)``
        tuples ordered by stream count, highest first, truncated to at most
        50 entries. Songs with equal counts keep the order in which they
        appear in the input mapping.
    """
    def stream_count(entry):
        # entry is a (sng_id, stream count) pair
        return entry[1]

    return {
        country: sorted(counts.items(), key=stream_count, reverse=True)[:50]
        for country, counts in streams_data.items()
    }

def main():
    """Aggregate recent listening logs and write the per-country top 50.

    Reads up to seven ``listen-*.log`` files from ``sample_listen_custom``,
    sums the stream counts per country and song across them, and writes one
    line per country to ``country_top50_custom/country_top50_<YYYYMMDD>.txt``
    in the form ``country|sng_id:streams,sng_id:streams,...``.
    """
    # Step 1: pick the log files to process.
    # File names are sorted in descending order and the first seven are kept;
    # fewer are used when fewer exist.
    pattern = os.path.join("sample_listen_custom", "listen-*.log")
    recent_logs = sorted(glob.glob(pattern), reverse=True)[:7]

    # Step 2: merge the per-file counts into one country -> song -> streams map.
    streams_data = defaultdict(lambda: defaultdict(int))
    for log_path in recent_logs:
        per_file_counts = process_log_file(log_path)
        for country, song_streams in per_file_counts.items():
            country_totals = streams_data[country]
            for song, streams in song_streams.items():
                country_totals[song] += streams

    # Step 3: rank the merged counts.
    top_50 = compute_top_50(streams_data)

    # Step 4: write the ranking, naming the file after today's date.
    out_dir = "country_top50_custom/"
    os.makedirs(out_dir, exist_ok=True)  # tolerate an already existing folder
    date_stamp = datetime.now().strftime('%Y%m%d')
    out_path = os.path.join(out_dir, f'country_top50_{date_stamp}.txt')
    with open(out_path, 'w') as out:
        for country, songs in top_50.items():
            ranking = ','.join(f'{sng_id}:{streams}' for sng_id, streams in songs)
            out.write(f"{country}|{ranking}\n")


if __name__ == "__main__":
    main()
