In [42]:
import os
import json
import pandas as pd
from datetime import datetime
from time import time
from hashlib import md5
from urllib.parse import urlparse

In [43]:
def load_json_file(file_path):
    with open(file_path) as f:
        records = [json.loads(line) for line in f]
    return records

In [44]:
def converted_timestamp(timestamp, unix=True):
    if pd.isna(timestamp):  
        return timestamp  
    if unix_formated:
        return timestamp
    else:
        return datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

In [45]:
def AS_dataframe(records, unix_formated):
    if not records:
        print("No records to transform.")
        return None
    
    df = pd.DataFrame(records)
    
    df_Out = pd.DataFrame({
        'web_browser': df['a'].astype(str).str.split('(', 1).str[0],
        'operating_sys': df['a'].astype(str).str.split('(', 1).str[1].str.split(';', 1).str[0], 
        
        
        'from_url': df['r'].str.split('/').str[2].str.replace('www.', ''),
        'to_url': df['u'].str.split('/').str[2].str.replace('www.', ''),
        
        
        
        'city': df['cy'],
        
        
        'latitude': df['ll'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else 0),
        'longitude': df['ll'].apply(lambda x: x[1] if isinstance(x, list) and len(x) > 1 else 0),
        
        
        
        'time_zone': df['tz'],
        'time_in': df['t'].apply(lambda x: converted_timestamp(x, unix_formated)), 
        'time_out': df['hc'].apply(lambda x: converted_timestamp(x, unix_formated)),  
    })
    
    df_Out.dropna(inplace=True)

    return df_Out

In [46]:
def save_to_csv(df, output_directory, file_name):
    
        os.makedirs(output_directory, exist_ok=True)
        file_path = os.path.join(output_directory, file_name)
        df.to_csv(file_path, index=False)

In [47]:
def check_duplicates(df):
    if df is not None and df.duplicated().any():
        print("Warning: Duplicates found in the DataFrame!")
        return True
    return False

In [48]:
def main(directory_path, unix_formated):
    if not os.path.isdir(directory_path):
        print(f"Error: Directory '{directory_path}' does not exist.")
        return
    
    json_files = [f for f in os.listdir(directory_path) if f.endswith('.json')]
    if not json_files:
        print("No JSON files found in the specified directory.")
        return
    
    start_time = time()  
    
    output_directory = os.path.join(directory_path, 'target')
    
    for json_file in json_files:
        file_path = os.path.join(directory_path, json_file)
        
        records = load_json_file(file_path)
        if records is None:
            continue 
        
        df_Out = AS_dataframe(records, unix_formated)
        
        if df_Out is not None:
            if check_duplicates(df_Out):
                df_Out.drop_duplicates(inplace=True)
            
            csv_name = os.path.splitext(json_file)[0] + '.csv'
            save_to_csv(df_Out, output_directory, csv_name)
    
    execution_time = time() - start_time
    print(f"Total execution time: {execution_time:.2f} seconds.")

directory_path = os.getcwd()  
unix_formated = False  
main(directory_path, unix_formated)

  'from_url': df['r'].str.split('/').str[2].str.replace('www.', ''),
  'to_url': df['u'].str.split('/').str[2].str.replace('www.', ''),
  'from_url': df['r'].str.split('/').str[2].str.replace('www.', ''),


Total execution time: 0.37 seconds.


  'to_url': df['u'].str.split('/').str[2].str.replace('www.', ''),
  'from_url': df['r'].str.split('/').str[2].str.replace('www.', ''),
  'to_url': df['u'].str.split('/').str[2].str.replace('www.', ''),
