In [None]:
# Necessary imports
import os
import pandas as pd
import json


# Description
We transform the JSON files (each JSON file stores the metadata of one Instagram post) into a single CSV file that stores the metadata for all the posts. Each row in the final CSV file corresponds to one post, and each column corresponds to one specific piece of metadata.

First we define two functions, and then we apply them to the JSON files.
1. Function ('extract_data'): Transforms each json file into a dictionary
2. Function ('convert_info_files_to_csv'): Iterates over all JSON files, applies 'extract_data' to each file, and combines the resulting dictionaries into a single CSV file

# Function definitions

In [None]:
# Function that extracts all the necessary data from a given json file
# Input: 'json_data' – the json object corresponding to the specific json file
# Input: 'file_name' – the file name (string) of the corresponding json file; it is stored in the 'file_name' field of the output
# Output: data – a dictionary containing all the relevant data of the input json file

def extract_data(json_data, file_name):
    """Flatten the metadata of one Instagram post into a single dictionary.

    Args:
        json_data: The parsed JSON object (a dict) of one post.
        file_name: Identifier of the source file; copied unchanged into the
            'file_name' field of the result.

    Returns:
        A dict with one entry per output column. Optional fields that are
        absent from ``json_data`` are stored as ``None``.

    Raises:
        KeyError: If a required field (e.g. 'owner', 'id',
            'taken_at_timestamp', or both comment keys) is missing.
        json.JSONDecodeError: If 'location.address_json' is a string that
            is not valid JSON.
    """
    # Required fields: a missing key raises KeyError on purpose.
    owner = json_data['owner']
    edge_media_preview_like = json_data['edge_media_preview_like']
    edge_media_to_tagged_user = json_data['edge_media_to_tagged_user']

    # The comment counter is stored under one of two key names.
    try:
        edge_media_to_parent_comment = json_data['edge_media_to_comment']
    except KeyError:
        edge_media_to_parent_comment = json_data['edge_media_to_parent_comment']

    edge_media_to_caption = json_data['edge_media_to_caption']

    # Optional fields default to None when missing.
    location = json_data.get('location', None)
    is_ad = json_data.get('is_ad', None)
    comments_disabled = json_data.get('comments_disabled', None)
    is_video = json_data.get('is_video', None)
    __typename = json_data.get('__typename', None)

    # The caption is the text of the first edge, if there is one.
    caption_text = None
    if edge_media_to_caption and edge_media_to_caption['edges']:
        caption_text = edge_media_to_caption['edges'][0]['node']['text']

    # 'address_json' holds a JSON document encoded as a string, so it must be
    # parsed with json.loads (json.load expects a file object and would fail).
    country_code = None
    address_json = location.get('address_json') if location else None
    if address_json:
        if isinstance(address_json, str):
            address = json.loads(address_json)
        else:
            # Already decoded; use it as is.
            address = address_json
        country_code = address.get('country_code')

    # Create a dictionary with the extracted fields
    data = {
        'file_name': file_name,
        'owner.username': owner['username'],
        'owner.full_name': owner['full_name'],
        'owner.is_verified': owner['is_verified'],
        'owner.id': owner['id'],
        'owner.is_private': owner['is_private'],
        'edge_media_preview_like.count': edge_media_preview_like['count'],
        'count(edge_media_to_tagged_user.edges)': len(edge_media_to_tagged_user['edges']),
        'location': location,
        'is_ad': is_ad,
        'id': json_data['id'],
        'edge_media_to_parent_comment.count': edge_media_to_parent_comment['count'],
        'taken_at_timestamp': json_data['taken_at_timestamp'],
        'comments_disabled': comments_disabled,
        'is_video': is_video,
        '__typename': __typename,
        'caption_text': caption_text,
        'country_code': country_code
    }

    return data

In [None]:
# Function that converts all the json files into a single csv file
# Input: input_dir (the directory path where all the json files are stored), output_file (the name of the output csv file),
# and start_index / end_index, which select the range of json files to convert
# Result: Writes the csv file (output_file)

def convert_info_files_to_csv(input_dir, output_file, start_index=0, end_index=100):
    """Combine the '.info' JSON files of a directory into one CSV file.

    Args:
        input_dir: Directory that contains the '.info' files.
        output_file: Path of the CSV file to write.
        start_index: Index of the first file to convert (inclusive), counted
            in the sorted list of '.info' file names.
        end_index: Index at which to stop (exclusive).

    Files that cannot be parsed as JSON are reported on stdout and skipped.
    Nothing is returned; the result is written to ``output_file``.
    """
    # os.listdir returns entries in arbitrary order, so sort the names to make
    # the [start_index:end_index] slice reproducible between runs.
    info_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.info'))

    # Keep only the requested range of files
    info_files = info_files[start_index:end_index]

    # Collect one dictionary per successfully parsed file
    all_data = []

    for index, file in enumerate(info_files):
        file_path = os.path.join(input_dir, file)

        # Keep the try body limited to reading and parsing the file
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
        except json.JSONDecodeError:
            print(f"JSONDecodeError at index {index + start_index} for file {file}")
            continue

        # Extract data and append to the list
        all_data.append(extract_data(json_data, file))

    # Convert the list of data into a pandas DataFrame
    df = pd.DataFrame(all_data)

    # Write the DataFrame to a CSV file
    df.to_csv(output_file, index=False)

# Applying the Functions

In [None]:
# Directory where all the JSON metadata files (the '.info' files, one per Instagram post) are stored
input_dir = '/Users/samuelpfisterer/Downloads/7zipmac/extracted_files/info'

In [None]:
# Name of the CSV file in which the combined metadata of all the Instagram post JSON files will be stored
output_file = 'csv_metadata.csv'

In [None]:
# Run the conversion that creates the CSV file with all the metadata.
# start_index=0 and end_index=600000 select the first 600,000 JSON files, i.e. Instagram posts.
convert_info_files_to_csv(
    input_dir,
    output_file,
    start_index=0,
    end_index=600000,
)