In [1]:
# Read the parquet file in the data/siddharth-gandhi_refpred folder and print its first 5 rows
import pandas as pd

In [2]:
import pandas as pd
import numpy as np

def get_dataset(size, seed=None):
    """Build a synthetic DataFrame with ``size`` random rows for I/O benchmarks.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, a dedicated ``np.random.RandomState(seed)`` drives every
        draw, so repeated calls with the same seed return identical frames.
        When omitted, the global NumPy random state is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``size``, ``age``, ``team``, ``win``, ``date`` and ``prob``.
    """
    # RandomState offers the same choice/randint/uniform calls as the
    # np.random module, so one code path serves both seeded and unseeded use.
    rng = np.random if seed is None else np.random.RandomState(seed)
    dates = pd.date_range('2020-01-01', '2022-12-31')

    # Dict entries are evaluated top to bottom, so the random draws happen in
    # the same order as the columns are listed.
    return pd.DataFrame({
        'size': rng.choice(['big', 'medium', 'small'], size),
        'age': rng.randint(1, 50, size),
        'team': rng.choice(['red', 'blue', 'yellow', 'green'], size),
        'win': rng.choice(['yes', 'no'], size),
        'date': rng.choice(dates, size),
        'prob': rng.uniform(0, 1, size),
    })

def set_dtypes(df):
    """Convert the benchmark columns to compact dtypes and return the frame.

    ``size`` and ``team`` become categoricals, ``age`` becomes ``int16``,
    ``prob`` becomes ``float32`` and the ``'yes'``/``'no'`` strings in ``win``
    are mapped to ``True``/``False``. The frame passed in is modified in
    place; the returned object is that same frame.
    """
    win_lookup = {'yes': True, 'no': False}

    for label in ('size', 'team'):
        df[label] = df[label].astype('category')
    df['age'] = df['age'].astype('int16')
    df['win'] = df['win'].map(win_lookup)
    df['prob'] = df['prob'].astype('float32')
    return df

In [3]:
# JSON round-trip benchmark: build a fresh 5M-row typed frame, then time
# writing it to test.json and reading it back.
print('Reading and writing JSON')
df = get_dataset(5_000_000)
df = set_dtypes(df)
# %time is an IPython line magic that reports CPU and wall time for one statement.
%time df.to_json('test.json', orient='records')
# NOTE(review): the frame read back from JSON is stored as `df_csv`; the name
# looks like a leftover from a CSV experiment — confirm nothing else uses it before renaming.
%time df_csv = pd.read_json('test.json', orient='records')

Reading and writing JSON
CPU times: user 2.73 s, sys: 269 ms, total: 3 s
Wall time: 3.04 s
CPU times: user 9.39 s, sys: 47.8 s, total: 57.2 s
Wall time: 1min 20s


In [5]:
# Shell out to list test.json so its on-disk size is visible in the output.
!ls -GFlash test.json

902064 -rw-r--r--@ 1 siddharth  staff   440M Oct  3 11:40 test.json


In [6]:
# Parquet round-trip benchmark: build a fresh 5M-row typed frame, then time
# writing it to test.parquet and reading it back.
print('Reading and writing Parquet')
df = get_dataset(5_000_000)
df = set_dtypes(df)
# %time is an IPython line magic that reports CPU and wall time for one statement.
%time df.to_parquet('test.parquet')
%time df_parquet = pd.read_parquet('test.parquet')

Reading and writing Parquet
CPU times: user 356 ms, sys: 33.1 ms, total: 389 ms
Wall time: 413 ms
CPU times: user 127 ms, sys: 81 ms, total: 208 ms
Wall time: 121 ms


In [7]:
# Shell out to list test.parquet so its on-disk size is visible in the output.
!ls -GFlash test.parquet

72192 -rw-r--r--@ 1 siddharth  staff    35M Oct  3 11:42 test.parquet


In [1]:
import pstats

# Load the saved profiler dump and order its entries by cumulative time.
# sort_stats returns the Stats object itself, so the calls can be chained.
stats = pstats.Stats('kafka_scrape_local.prof').sort_stats('cumulative')

# Print the first 50 entries of the sorted listing
stats.print_stats(50)

Mon Oct  2 19:02:56 2023    kafka_scrape_local.prof

         45412888 function calls (45411436 primitive calls) in 2818.620 seconds

   Ordered by: cumulative time
   List reduced from 1296 to 50 due to restriction <50>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.009    0.009 2819.265 2819.265 /Users/siddharth/dev/ds/scrape_local.py:230(main)
        1    0.223    0.223 2819.256 2819.256 /Users/siddharth/dev/ds/scrape_local.py:182(scrape_repository)
     6728    3.119    0.000 2804.386    0.417 /Users/siddharth/dev/ds/scrape_local.py:95(process_commit)
   201999    3.887    0.000 2799.199    0.014 /Users/siddharth/dev/ds/scrape_local.py:33(run_command)
   201999   13.257    0.000 2795.152    0.014 /opt/homebrew/Caskroom/miniconda/base/envs/ds/lib/python3.9/subprocess.py:464(run)
   201999    2.444    0.000 2298.388    0.011 /opt/homebrew/Caskroom/miniconda/base/envs/ds/lib/python3.9/subprocess.py:1090(communicate)
   201999   11.136    0.000

<pstats.Stats at 0x1105abf10>

In [3]:
# Load one parquet file of commit data into `df`. The commented-out lines
# point at other repositories' files — presumably kept for quick switching.
# df = pd.read_parquet('./data/siddharth-gandhi_refpred/siddharth-gandhi_refpred_commit_data_0.parquet')
df = pd.read_parquet('./data/apache_kafka/apache_kafka_commit_data_0.parquet')
# df = pd.read_parquet('./data/facebook_react/facebook_react_commit_data_0.parquet', engine='fastparquet')

In [4]:
# Shell out to list the files in the apache_kafka data directory with their sizes.
!ls -GFlash ./data/apache_kafka/

total 1465384
     0 drwxr-xr-x@  9 siddharth  staff   288B Oct  2 18:29 [1m[36m.[m[m/
     0 drwxr-xr-x@ 11 siddharth  staff   352B Oct  2 17:09 [1m[36m..[m[m/
217936 -rw-r--r--@  1 siddharth  staff   106M Oct  2 17:15 apache_kafka_commit_data_0.parquet
221176 -rw-r--r--@  1 siddharth  staff   108M Oct  2 17:21 apache_kafka_commit_data_1.parquet
214640 -rw-r--r--@  1 siddharth  staff   105M Oct  2 17:28 apache_kafka_commit_data_2.parquet
237448 -rw-r--r--@  1 siddharth  staff   116M Oct  2 17:35 apache_kafka_commit_data_3.parquet
204856 -rw-r--r--@  1 siddharth  staff   100M Oct  2 17:41 apache_kafka_commit_data_4.parquet
203232 -rw-r--r--@  1 siddharth  staff    93M Oct  2 18:22 apache_kafka_commit_data_5.parquet
166096 -rw-r--r--@  1 siddharth  staff    81M Oct  2 18:29 apache_kafka_commit_data_6.parquet


In [5]:
# Preview the first five rows of the loaded commit data
df.head()

Unnamed: 0,owner,repo_name,commit_date,commit_id,commit_message,file_path,previous_commit_id,previous_file_content,cur_file_content,diff,status,is_merge_request
0,apache,kafka,2023-10-02 20:22:17+00:00,8f8dbad564ffd9be409bb85edadbc40659cd0a56,KAFKA-14595 ReassignPartitionsIntegrationTest ...,core/src/test/java/kafka/test/ClusterConfig.java,b6c7855475397166c4fbc79071b4aa93696519e5,/*  * Licensed to the Apache Software Foundati...,/*  * Licensed to the Apache Software Foundati...,"@@ -146,6 +146,11 @@ public class ClusterConfi...",modified,False
1,apache,kafka,2023-10-02 20:22:17+00:00,8f8dbad564ffd9be409bb85edadbc40659cd0a56,KAFKA-14595 ReassignPartitionsIntegrationTest ...,core/src/test/scala/integration/kafka/admin/Re...,b6c7855475397166c4fbc79071b4aa93696519e5,/*  * Licensed to the Apache Software Foundati...,,,deleted,False
2,apache,kafka,2023-10-02 20:22:17+00:00,8f8dbad564ffd9be409bb85edadbc40659cd0a56,KAFKA-14595 ReassignPartitionsIntegrationTest ...,tools/src/main/java/org/apache/kafka/tools/rea...,b6c7855475397166c4fbc79071b4aa93696519e5,/*  * Licensed to the Apache Software Foundati...,/*  * Licensed to the Apache Software Foundati...,"@@ -46,7 +46,7 @@ public final class VerifyAss...",modified,False
3,apache,kafka,2023-10-02 20:22:17+00:00,8f8dbad564ffd9be409bb85edadbc40659cd0a56,KAFKA-14595 ReassignPartitionsIntegrationTest ...,tools/src/test/java/org/apache/kafka/tools/Too...,b6c7855475397166c4fbc79071b4aa93696519e5,/*  * Licensed to the Apache Software Foundati...,/*  * Licensed to the Apache Software Foundati...,"@@ -16,12 +16,16 @@  */  package org.apache.k...",modified,False
4,apache,kafka,2023-10-02 20:22:17+00:00,8f8dbad564ffd9be409bb85edadbc40659cd0a56,KAFKA-14595 ReassignPartitionsIntegrationTest ...,tools/src/test/java/org/apache/kafka/tools/rea...,b6c7855475397166c4fbc79071b4aa93696519e5,,/*  * Licensed to the Apache Software Foundati...,,added,False


In [8]:
# Count the distinct commit ids present in the loaded frame
df['commit_id'].nunique()

885

In [7]:
# Summarize columns, non-null counts and dtypes; memory_usage='deep' asks
# pandas to inspect the actual contents when reporting memory use.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5948 entries, 0 to 5947
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   owner                  5948 non-null   string             
 1   repo_name              5948 non-null   string             
 2   commit_date            5948 non-null   datetime64[ns, UTC]
 3   commit_id              5948 non-null   string             
 4   commit_message         5948 non-null   string             
 5   file_path              5948 non-null   string             
 6   previous_commit_id     5948 non-null   string             
 7   previous_file_content  5159 non-null   string             
 8   cur_file_content       5860 non-null   string             
 9   diff                   5072 non-null   string             
 10  status                 5948 non-null   category           
 11  is_merge_request       5948 non-null   bool             

In [None]:
# NOTE(review): this repeats the df.info call from the previous cell, but the
# recorded output below reports 2434 rows instead of 5948 — presumably it was
# produced with a different parquet file loaded; re-run to confirm.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2434 entries, 0 to 2433
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   owner                  2434 non-null   string             
 1   repo_name              2434 non-null   string             
 2   commit_date            2434 non-null   datetime64[ns, UTC]
 3   commit_id              2434 non-null   string             
 4   commit_message         2434 non-null   string             
 5   file_path              2434 non-null   string             
 6   previous_commit_id     2430 non-null   string             
 7   previous_file_content  2172 non-null   string             
 8   cur_file_content       2425 non-null   string             
 9   diff                   2166 non-null   string             
 10  status                 2434 non-null   category           
 11  is_merge_request       2434 non-null   bool             