# Split Main Dataset

Main dataset: https://www.kaggle.com/datasets/johntukey/github-dataset

1. Import modules

In [None]:
import pandas as pd

2. Split the dataset into the desired number of splits

In [None]:
# Exported user fields, in output order, mapped to the value used when the
# input chunk has no column of that name.
_USER_FIELD_DEFAULTS = {
    'login': '',
    'id': '',
    'type': '',
    'created_at': '',
    'updated_at': '',
    'is_suspicious': False,
    'followers': 0,
    'following': 0,
    'commits': 0,
    'commit_list': [],
    'public_repos': 0,
    'follower_list': [],
    'following_list': [],
    'public_gists': 0,
    'location': '',
    'hirable': False,
    'company': '',
    'email': '',
    'bio': '',
    'blog': '',
    'repo_list': [],
}

# Rows missing any of these fields are dropped before writing.
_REQUIRED_USER_FIELDS = ['login', 'id', 'followers', 'commits']


def _extract_user_fields(rows):
    """Return a frame holding only the exported user fields of *rows*."""
    columns = {}
    for field, default in _USER_FIELD_DEFAULTS.items():
        if field in rows.columns:
            columns[field] = rows[field]
        else:
            # Fresh list per row so rows never share one mutable default.
            filler = [
                list(default) if isinstance(default, list) else default
                for _ in range(len(rows))
            ]
            columns[field] = pd.Series(filler, index=rows.index)
    return pd.DataFrame(columns)


def splitMainDataset(infile, chunk_size=1000, total_lines=8356231, percent=5):
    """Split a JSON-lines dataset into consecutive ``split<N>.json`` files.

    The input is streamed in chunks of ``chunk_size`` rows. Each output file
    receives ``int(total_lines * percent / 100)`` consecutive input rows,
    reduced to the exported user fields; rows with a missing ``login``,
    ``id``, ``followers`` or ``commits`` value are dropped after being
    counted. At most ``int(100 / percent)`` files are written, numbered from
    1, into the current working directory. Input rows beyond the last full
    split are not written anywhere.

    A chunk that straddles a split boundary is divided between the two
    files, so no row is lost at the boundary. Every split file is truncated
    on its first write, so output from an earlier run is never appended to.

    Args:
        infile: Path (or buffer) of the JSON-lines input, as accepted by
            ``pandas.read_json``.
        chunk_size: Number of input rows read per chunk.
        total_lines: Number of rows the input is expected to contain; only
            used to compute the size of each split.
        percent: Size of each split as a percentage of ``total_lines``.

    Raises:
        ValueError: If ``percent`` is not in ``(0, 100]`` or the resulting
            split size is smaller than one row.
    """
    if not 0 < percent <= 100:
        raise ValueError("percent must be greater than 0 and at most 100")

    lines_per_split = int(total_lines * (percent / 100))
    if lines_per_split < 1:
        raise ValueError("total_lines * percent / 100 must be at least 1")

    max_splits = int(100 / percent)
    curr = 1
    processed_lines = 0  # input rows already assigned to the current split
    first_write = True   # the current split file has not been opened yet

    with pd.read_json(infile, lines=True, chunksize=chunk_size) as reader:
        for chunk in reader:
            start = 0
            # One chunk can feed several splits when it crosses a boundary.
            while start < len(chunk):
                take = min(len(chunk) - start,
                           lines_per_split - processed_lines)
                users = _extract_user_fields(chunk.iloc[start:start + take])
                users = users.dropna(subset=_REQUIRED_USER_FIELDS)

                outfile = "split" + str(curr) + ".json"
                write_mode = 'w' if first_write else 'a'
                # Writing through our own handle avoids depending on the
                # `mode=` / `index=` keywords of to_json, which only newer
                # pandas releases accept for orient="records".
                with open(outfile, write_mode, encoding="utf-8",
                          newline="") as out:
                    if not users.empty:
                        text = users.to_json(orient="records", lines=True)
                        # Guarantee a line break so appended parts never
                        # fuse two records onto one line.
                        out.write(text if text.endswith("\n")
                                  else text + "\n")
                first_write = False

                processed_lines += take
                start += take

                if processed_lines >= lines_per_split:
                    curr += 1
                    processed_lines = 0
                    first_write = True
                    if curr > max_splits:
                        return  # all requested splits are complete

In [None]:
# Run the split on dataset.json, leaving chunk size, line count and percentage at their defaults.
splitMainDataset(infile="dataset.json")