In [5]:
import sklearn as sk

In [6]:
import os
import time
import datetime
import json
import logging
import pandas as pd
import numpy as np
import itertools
import json
from pandas_profiling import ProfileReport
from tqdm.auto import tqdm

In [7]:
logging.basicConfig(
    format='%(levelname)s : %(message)s', 
    level=logging.ERROR, force=True)

In [8]:
# Emit one record per level to check which ones pass the configured threshold.
logging.log(logging.DEBUG, 'debug')
logging.log(logging.WARNING, 'warning')
logging.log(logging.ERROR, 'error')

ERROR : error


In [9]:
# Folder holding the raw json files.
data = './data/'
# Folder receiving the generated, ready-to-use data.
output_dir = './output'

In [10]:
# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which files are processed identical from one run / machine to the next.
file_names = sorted(os.listdir(data))
# One row per entry found in the input folder.
df = pd.DataFrame(file_names, columns=['name'])

In [11]:
# File names are parsed as "<repo_root>_<type>.json".
# - [A-Za-z] replaces [A-z], which also matched the characters [ \ ] ^ _ `
# - the dot is escaped and the pattern anchored so that only names really
#   ending in ".json" match
# Names that do not match get a missing value in both columns.
df['type'] = df['name'].str.extract(r'.*_([A-Za-z]+)\.json$', expand=False)
df['repo_root'] = df['name'].str.extract(r'(.*)_[A-Za-z]+\.json$', expand=False)

In [12]:
# Keep only the files whose parsed type is "commits", renumbered from 0.
df1 = df[df['type'] == "commits"].reset_index(drop=True)

In [13]:
def get_commit_from_json(json_file_name : str) -> list:
    """
    Extract date, author mail and author name from a commit json file.

    The file is expected to hold a JSON list of objects exposing
    ``commit -> author -> {date, email, name}``.

    Parameters
    ----------
    json_file_name : str
        Path of the json file to read.

    Returns
    -------
    list
        One dict per usable commit with the keys ``repository``, ``date``,
        ``author_mail`` and ``author_name``. ``repository`` is the base name
        of the file cut at its first dot.

    Notes
    -----
    * A commit whose author block is missing or malformed is skipped; all
      such failures are reported in a single ``logging.error`` call.
    * A commit with neither mail nor name is skipped; all such cases are
      reported in a single ``logging.warning`` call.
    * When only one of mail / name is missing, it is filled with the other.
    """
    logging.debug(f"Extract commits from {json_file_name}")
    repository = os.path.basename(json_file_name).split('.')[0]
    output = []
    failed_err = ""
    failed_warning = ""

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(json_file_name, encoding="utf-8") as fd:
        commit_list = json.load(fd)

    for commit in commit_list:
        try:
            author = commit['commit']['author']
            extract_commit = {
                'repository': repository,
                'date': author['date'],
                'author_mail': author['email'],
                'author_name': author['name'],
            }
        except (KeyError, TypeError, IndexError) as err:
            # Never keep a half-filled record: note the failure and move on.
            failed_err += f"Error in finding commit info : {type(err)}\n"
            continue

        # Empty string and null are both treated as "missing".
        mail_missing = not extract_commit['author_mail']
        name_missing = not extract_commit['author_name']

        if mail_missing and name_missing:
            failed_warning += f"No info about mail or name of the author in {repository}\n"
            continue
        if mail_missing:
            extract_commit['author_mail'] = extract_commit['author_name']
        elif name_missing:
            extract_commit['author_name'] = extract_commit['author_mail']

        output.append(extract_commit)

    logging.debug(f"{len(output)} commits extracted")
    if failed_err:
        logging.error(failed_err)
    if failed_warning:
        logging.warning(failed_warning)
    return output

In [14]:
# Collect the commits of every commit file in a plain list and build the
# frame once: calling pd.concat inside the loop re-copies all the rows
# accumulated so far at every iteration.
commit_records = []
for file_name in tqdm(df1["name"]):
    file_path = os.path.join(data, file_name)
    l_commits = get_commit_from_json(file_path)
    commit_records.extend(l_commits)

df_commits = pd.DataFrame.from_records(
    commit_records,
    columns=["repository", "date", "author_mail", "author_name"])
df_commits = df_commits.convert_dtypes()
# utc=True yields a timezone-aware datetime column in every case
# (including an empty frame or dates carrying different UTC offsets).
df_commits["date"] = pd.to_datetime(df_commits["date"], utc=True)
df_commits = df_commits.sort_values("date").reset_index(drop=True)

# Drop the commit(s) dated 1970. The filter is on the date itself rather than
# on row position 0, so the oldest legitimate commit is never removed when no
# such commit is present, and an empty frame does not raise.
epoch_cutoff = pd.Timestamp("1971-01-01", tz="UTC")
df_commits = df_commits.loc[~(df_commits["date"] < epoch_cutoff)].reset_index(drop=True)


  0%|          | 0/592 [00:00<?, ?it/s]

In [15]:
#df_commits.to_csv("./checkpoint_1_raw_commit_list.csv")

In [16]:
#profile = ProfileReport(df_commits, title="Pandas Profiling Report")
#profile.to_file("commits_profile.html")

In [17]:
#time_start = datetime.datetime.strptime("2018+0000", "%Y%z")
#dfc = df_commits.loc[df_commits.date > time_start]

In [18]:
def get_commit_grp_by_repo(time_window_duration: pd.Timedelta, 
                            smallest_time:int, 
                            df: pd.DataFrame):
    """
    Index the commits by repository and build the author encoding.

    Parameters
    ----------
    time_window_duration : pd.Timedelta
        Length of one time window; only used to log the number of columns of
        the sparse data.
    smallest_time : int
        Width, in seconds, of one time bin inside a window.
    df : pd.DataFrame
        Commits with at least the ``repository`` and ``author_mail`` columns.
        It is not modified.

    Returns
    -------
    tuple
        ``(dfg, dict_author)`` where ``dfg`` is ``df`` re-indexed by the
        sorted MultiIndex ``(repository, index)`` (``index`` being the former
        index of ``df``) and ``dict_author`` maps every author mail to an
        integer code, 0 being the author with the most commits.
    """
    # Number of bins of `smallest_time` seconds in one window
    # (e.g. 28 bins of 6 hours in one week).
    time_binning = int(time_window_duration.total_seconds() / smallest_time)

    # reset_index() already returns a new frame, the input is left untouched.
    df_indexed = df.reset_index()

    # Authors ordered by decreasing number of commits.
    authors = (df_indexed.groupby('author_mail')
                         .count()
                         .sort_values('index', ascending=False)
                         .index.to_list())

    # Encode each author as its rank in that ordering.
    dict_author = dict(zip(authors, range(len(authors))))

    # One column per (author, time bin) pair; only the count is needed here,
    # so the column names themselves are not materialised.
    logging.info(f"{len(authors) * time_binning} columns for the sparse data")

    # sort_index() is not in-place: its result has to be kept.
    dfg = df_indexed.set_index(['repository', 'index']).sort_index()

    return dfg, dict_author

In [33]:
def compile_input(dfg:pd.DataFrame, time_windows_duration:pd.Timedelta, time_delta, last_commit_date, dict_author, smallest_time):
    """
    Slide a time window over the commits of every repository and encode each
    non-empty window as a list of column indices.

    Inside a window, a commit is mapped to the column
    ``dict_author[author_mail] + len(dict_author) * bin`` where ``bin`` is
    the number of whole ``smallest_time`` periods between the window start
    and the commit date.

    Parameters
    ----------
    dfg : pd.DataFrame
        Commits indexed by a MultiIndex whose first level is the repository;
        the ``date`` and ``author_mail`` columns are read.
    time_windows_duration : pd.Timedelta
        Length of one time window.
    time_delta
        Shift applied to the window between two iterations.
    last_commit_date
        Windows ending after this date are not generated.
    dict_author : dict
        Maps an author mail to its integer code.
    smallest_time
        Width of one time bin, in seconds.

    Returns
    -------
    tuple
        ``(meta_list_sparce, metadata)``: the list of rows (each row is the
        list of distinct column indices of one window, in order of first
        appearance) and, for each row, a dict with its ``id``, ``repo``,
        ``start_time`` and ``end_time``.
    """
    # Repositories, taken from the first level of the MultiIndex
    list_repo = dfg.index.get_level_values(0).unique().tolist()
    nbr_author = len(dict_author)

    meta_list_sparce = []
    metadata = []

    pbar = tqdm(total=1, desc="Time window loop", leave=True, position=2)

    for repo in tqdm(list_repo, desc="Repository loop", leave=True, position=1):
        # extract data about only one repo
        logging.debug(f"Filling for {repo}")
        repo_commits = dfg.loc[repo]

        # get the first commit's date
        start_date_repo = repo_commits.date.min()
        logging.debug(f"start date = {start_date_repo} or {repo}")

        # get the first boundary for the time window
        time_start_boundary = start_date_repo
        time_end_boundary = start_date_repo + time_windows_duration

        # estimate the number of iterations of the while loop to size the progress bar
        tqdm_total = int((last_commit_date - time_start_boundary).total_seconds() / time_delta.total_seconds())
        pbar.reset(total=tqdm_total)
        while(time_end_boundary <= last_commit_date):

            list_index_of_1 = []
            # Column ids already stored for this window. Comparing with the
            # previous item only would let non-consecutive duplicates through.
            seen_columns = set()

            data_window = repo_commits.loc[(repo_commits.date >= time_start_boundary) & (repo_commits.date < time_end_boundary)]

            logging.debug(f"{len(data_window)} commits founds between {time_start_boundary} and {time_end_boundary} for {repo}")

            for row in data_window.itertuples():

                # Column id = author code + offset of the time bin of the commit
                time_bin = int((row.date - time_start_boundary).total_seconds() / smallest_time)
                id_col = dict_author[row.author_mail] + nbr_author * time_bin

                # don't save duplicate
                if id_col not in seen_columns:
                    seen_columns.add(id_col)
                    list_index_of_1.append(id_col)

            # Windows without any commit produce no row
            if list_index_of_1:
                meta_list_sparce.append(list_index_of_1)
                metadata.append({
                                'id':(len(meta_list_sparce)-1),
                                "repo":repo,
                                "start_time":time_start_boundary.strftime("%Y-%m-%dT%Hh%Mm%Ss"),
                                "end_time":time_end_boundary.strftime("%Y-%m-%dT%Hh%Mm%Ss"),
                                }
                )
            # Increment boundaries with the time delta to move the time window
            time_start_boundary += time_delta
            time_end_boundary += time_delta

            # update progression bar
            pbar.update(1)
    pbar.close()
    return meta_list_sparce, metadata



In [34]:
def save_data(meta_list_sparce, output_dir, name):
    """
    Write the sparse rows to ``<output_dir>/<name>.dat``.

    Each non-empty row becomes one line made of its items separated by a
    single space; empty rows are not written. ``output_dir`` is created when
    it does not exist yet.

    Returns
    -------
    str
        Path of the written file.
    """
    # An empty output_dir means "current directory": nothing to create.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    file_name = os.path.join(output_dir, f"{name}.dat")
    with open(file_name, 'w') as fd:
        for row in meta_list_sparce:
            if len(row) >= 1 :
                fd.write(' '.join(map(str, row)) + '\n')
    return file_name


In [35]:
def save_metadata(metadata, output_dir, name):
    """
    Dump ``metadata`` as JSON into ``<output_dir>/<name>.txt``.

    ``output_dir`` is created when it does not exist yet.

    Returns
    -------
    str
        Path of the written file.
    """
    # An empty output_dir means "current directory": nothing to create.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    file_name = os.path.join(output_dir, f"{name}.txt")
    with open(file_name, 'w') as fd:
        json.dump(metadata, fd)
    return file_name


In [36]:
def get_stat(file_name:str):
    """
    Count the rows of a sparse data file and find its largest index.

    The file holds one row per line, each row being integers separated by
    blanks. Blank lines are ignored.

    Parameters
    ----------
    file_name : str
        Path of the file to read.

    Returns
    -------
    tuple
        ``(nbr_line, maxi)``: number of non-blank lines and largest integer
        found in the file (0 when the file holds no integer).
    """
    maxi = 0
    nbr_line = 0
    with open(file_name, 'r') as fd:
        for line in fd:
            # split() without argument drops the newline (when there is one)
            # and any leading, trailing or repeated blank. Slicing off the
            # last character instead would cut a digit on a final line that
            # has no trailing newline.
            indices = line.split()
            if not indices:
                continue

            local_max = max(int(item) for item in indices)
            maxi = max(maxi, local_max)
            nbr_line += 1

    return nbr_line, maxi
            

In [37]:
def from_commit_repo_by_week(tmd_i, tmd_t, td_i, td_t, df, smallest_time, suffix:str = "", output_dir="./"):
    """
    Build one sparse data set from the commits of ``df`` and write it to disk.

    Parameters
    ----------
    tmd_i, tmd_t
        Value and unit of the time window duration (passed to pd.Timedelta).
    td_i, td_t
        Value and unit of the shift between two windows (passed to pd.Timedelta).
    df : pd.DataFrame
        Commits to encode; its ``date`` column gives the latest commit date.
    smallest_time
        Width of one time bin, in seconds.
    suffix : str
        Text appended to the generated file name.
    output_dir
        Folder receiving the data and metadata files.

    Returns
    -------
    tuple
        ``(nbr_row, nbr_feature, file_name, file_meta)`` where the first two
        items are the values returned by ``get_stat`` on the data file.
    """
    window = pd.Timedelta(tmd_i, tmd_t)
    shift = pd.Timedelta(td_i, td_t)

    grouped, author_codes = get_commit_grp_by_repo(
        time_window_duration=window,
        smallest_time=smallest_time,
        df=df,
    )

    last_commit_date = df.date.max()
    logging.info(f"Latest commit found at {last_commit_date}")

    sparse_rows, metadata = compile_input(
        dfg=grouped,
        time_windows_duration=window,
        time_delta=shift,
        last_commit_date=last_commit_date,
        smallest_time=smallest_time,
        dict_author=author_codes,
    )

    # The file name carries the generation time and every parameter used
    today = datetime.datetime.now().strftime("%Y-%m-%dT%Hh%Mm%Ss")
    name = f"github_cyber_{today}__{tmd_i}{tmd_t}_spaced_{td_i}{td_t}_smallest_{smallest_time/3600}"+suffix

    file_name = save_data(sparse_rows, output_dir, name)
    file_meta = save_metadata(metadata, output_dir, name)

    # Read the data file back to report what was actually written
    nbr_row, nbr_feature = get_stat(file_name)
    logging.info(f"{nbr_feature} features and {nbr_row} rows for {file_name}")

    return nbr_row, nbr_feature, file_name, file_meta

In [38]:
def get_sampling(data, nbr_of_sample=10,log=False, random_state=None):
    """
    Keep only the commits of a sample of repositories.

    Parameters
    ----------
    data : pd.DataFrame
        Commits with at least the ``repository`` and ``date`` columns.
    nbr_of_sample : int
        Number of repositories to keep.
    log : bool
        When True, repositories are ranked by decreasing number of commits
        and picked at the ranks returned by ``gen_log_space``; otherwise they
        are drawn uniformly at random.
    random_state
        Seed forwarded to ``Series.sample`` for the uniform draw. The default
        (None) keeps the draw unseeded; pass a value to make it reproducible.

    Returns
    -------
    pd.DataFrame
        The rows of ``data`` belonging to the selected repositories.
    """
    # Repository names ranked by decreasing number of commits, indexed 0..N-1
    repos = (data.groupby('repository')
                 .count()
                 .sort_values('date', ascending=False)
                 .index.to_series()
                 .reset_index(drop=True))
    if log:
        index = gen_log_space(repos.index.max(), nbr_of_sample)
        repos_name = repos.iloc[index]
    else:
        repos_name = repos.sample(nbr_of_sample, random_state=random_state)

    logging.debug(f"Sampling {repos_name}")
    
    ret = data.loc[data.repository.isin(repos_name)]
    
    logging.info(f"From {len(data)} to {len(ret)} samples")
    return ret
    

In [39]:
def gen_log_space(limit, n):
    """
    Generate ``n`` roughly log-spaced, 0-based integer indices.

    limit (int) : max number to have
    n : how many sample

    The values are built 1-based, starting at 1 and growing geometrically
    towards ``limit``, then rounded and shifted by -1. Whenever the geometric
    step would be smaller than 1 the previous value is incremented by 1
    instead, so ``n`` values are always produced; as a consequence they can
    exceed ``limit`` when ``n`` is larger than ``limit``.

    Returns an ``np.uint64`` array of length ``n`` (empty when ``n <= 0``).
    """
    if n <= 0:
        return np.array([], dtype=np.uint64)

    result = [1]
    remaining = n - len(result)
    if remaining > 0:  # just a check to avoid ZeroDivisionError
        ratio = (float(limit)/result[-1]) ** (1.0/remaining)
    while len(result) < n:
        next_value = result[-1]*ratio
        if next_value - result[-1] >= 1:
            # safe zone. next_value will be a different integer
            result.append(next_value)
        else:
            # problem! same integer. we need to find next_value by artificially incrementing previous value
            result.append(result[-1]+1)
            # recalculate the ratio so that the remaining values will scale correctly;
            # nothing to recalculate (and a division by zero) once the list is complete
            remaining = n - len(result)
            if remaining > 0:
                ratio = (float(limit)/result[-1]) ** (1.0/remaining)
    # round, re-adjust to 0 indexing (i.e. minus 1) and return np.uint64 array
    return np.array([round(value) - 1 for value in result], dtype=np.uint64)

In [40]:
# Console handler, limited to INFO and above
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)

# Reconfigure the root logger (force=True drops the handlers installed
# before): records go both to "my_log.log", rewritten at each run, and to
# the console.
logging.basicConfig(
    force=True,
    level=logging.INFO,
    format='%(asctime)s %(levelname)s : %(message)s',
    handlers=[
        logging.FileHandler("my_log.log", mode='w'),
        stream_handler,
    ],
)


# Duration of the time window: every value of tmd_i combined with every unit of tmd_t
tmd_i = [1]
tmd_t = ['w']

# Shift between two consecutive time windows: values of td_i, units of td_t
td_i = [1, 2]
td_t = ['d']

# Width of one time bin, in seconds: 6 hours and 24 hours
smallest_time = [6 * 3600, 24 * 3600]

# Number of repositories to sample, and whether the sampling is log-spaced
sampling = [50]
log = [False, True]

# Every combination of the parameters above, as tuples
# (tmd_i, tmd_t, td_i, td_t, smallest_time, sampling, log)
args = list(itertools.product(tmd_i, tmd_t, td_i, td_t, smallest_time, sampling, log))

output_dir = "./output"
# Timestamp used in the name of the report file
today = datetime.datetime.now().strftime("%Y-%m-%dT%Hh%Mm%Ss")

# The report file below is opened inside output_dir: make sure the folder
# exists so that a run from a fresh checkout does not fail.
os.makedirs(output_dir, exist_ok=True)

# Generate one data set per combination of parameters.
# arg = (tmd_i, tmd_t, td_i, td_t, smallest_time, sampling, log)
for arg in tqdm(args, desc="List of arguments", position=0):
    # Log sampling is meaningless when sampling is disabled (-1)
    if arg[5] == -1 and arg[6]:
        continue
        
    # NOTE: this rebinds the notebook-level name `df` to a copy of the commits
    df = df_commits.copy()
    
    logging.info(f"Arg : {arg}")
    rapport = f"{arg} : "
    suffix=""
    if arg[5] > 0 :
        # Restrict the commits to a sample of repositories and tag the file name
        dfs = get_sampling(df, arg[5], log=arg[6])
        suffix=f"_{arg[5]}"
        if arg[6]:
            suffix += "log"
        else:
            suffix += 'uni'  
    else :
        dfs = df.copy()
    # `rows` / `lines` receive the number of rows and the largest column index
    # returned by from_commit_repo_by_week
    rows, lines, file_name, file_meta = from_commit_repo_by_week(df=dfs,
                                         tmd_i=arg[0], 
                                         tmd_t=arg[1], 
                                         td_i=arg[2], 
                                         td_t=arg[3], 
                                         smallest_time=arg[4], 
                                         output_dir=output_dir ,
                                         suffix=suffix)
    rapport += f" rows={rows}, lines={lines}, data={file_name}, meta={file_meta}\n"
    logging.info(f"Rapport = {rapport}")
    # Appended after every combination so that a partial run still leaves a report
    with open(os.path.join(output_dir, f"rapport_{today}.txt"), 'a+') as fd:
        fd.write(rapport)

List of arguments:   0%|          | 0/8 [00:00<?, ?it/s]

2022-07-22 16:05:43,070 INFO : Arg : (1, 'w', 1, 'd', 21600, 50, False)
2022-07-22 16:05:43,301 INFO : From 599881 to 99165 samples
2022-07-22 16:05:43,406 INFO : 51716 columns for the sparse data
2022-07-22 16:05:43,509 INFO : Latest commit found at 2022-04-21 10:20:59+00:00


Time window loop:   0%|          | 0/1 [00:00<?, ?it/s]

Repository loop:   0%|          | 0/50 [00:00<?, ?it/s]

2022-07-22 16:07:50,529 INFO : 51715 features and 47948 rows for ./output/github_cyber_2022-07-22T16h07m49s__1w_spaced_1d_smallest_6.0_50uni.dat
2022-07-22 16:07:50,547 INFO : Rapport = (1, 'w', 1, 'd', 21600, 50, False) :  rows=47948, lines=51715, data=./output/github_cyber_2022-07-22T16h07m49s__1w_spaced_1d_smallest_6.0_50uni.dat, meta=./output/github_cyber_2022-07-22T16h07m49s__1w_spaced_1d_smallest_6.0_50uni.txt

2022-07-22 16:07:50,651 INFO : Arg : (1, 'w', 1, 'd', 21600, 50, True)
2022-07-22 16:07:50,938 INFO : From 599881 to 349190 samples
2022-07-22 16:07:51,261 INFO : 164584 columns for the sparse data
2022-07-22 16:07:51,651 INFO : Latest commit found at 2022-04-23 16:44:19+00:00


Time window loop:   0%|          | 0/1 [00:00<?, ?it/s]

Repository loop:   0%|          | 0/50 [00:00<?, ?it/s]

2022-07-22 16:10:57,271 INFO : 164570 features and 96390 rows for ./output/github_cyber_2022-07-22T16h10m55s__1w_spaced_1d_smallest_6.0_50log.dat
2022-07-22 16:10:57,342 INFO : Rapport = (1, 'w', 1, 'd', 21600, 50, True) :  rows=96390, lines=164570, data=./output/github_cyber_2022-07-22T16h10m55s__1w_spaced_1d_smallest_6.0_50log.dat, meta=./output/github_cyber_2022-07-22T16h10m55s__1w_spaced_1d_smallest_6.0_50log.txt

2022-07-22 16:10:57,464 INFO : Arg : (1, 'w', 1, 'd', 86400, 50, False)
2022-07-22 16:10:57,689 INFO : From 599881 to 44083 samples
2022-07-22 16:10:57,737 INFO : 8400 columns for the sparse data
2022-07-22 16:10:57,785 INFO : Latest commit found at 2022-04-21 12:46:47+00:00


Time window loop:   0%|          | 0/1 [00:00<?, ?it/s]

Repository loop:   0%|          | 0/50 [00:00<?, ?it/s]

2022-07-22 16:12:44,404 INFO : 8399 features and 30027 rows for ./output/github_cyber_2022-07-22T16h12m44s__1w_spaced_1d_smallest_24.0_50uni.dat
2022-07-22 16:12:44,415 INFO : Rapport = (1, 'w', 1, 'd', 86400, 50, False) :  rows=30027, lines=8399, data=./output/github_cyber_2022-07-22T16h12m44s__1w_spaced_1d_smallest_24.0_50uni.dat, meta=./output/github_cyber_2022-07-22T16h12m44s__1w_spaced_1d_smallest_24.0_50uni.txt

2022-07-22 16:12:44,506 INFO : Arg : (1, 'w', 1, 'd', 86400, 50, True)
2022-07-22 16:12:44,760 INFO : From 599881 to 349190 samples
2022-07-22 16:12:45,043 INFO : 41146 columns for the sparse data
2022-07-22 16:12:45,403 INFO : Latest commit found at 2022-04-23 16:44:19+00:00


Time window loop:   0%|          | 0/1 [00:00<?, ?it/s]

Repository loop:   0%|          | 0/50 [00:00<?, ?it/s]

2022-07-22 16:15:47,923 INFO : 41145 features and 96390 rows for ./output/github_cyber_2022-07-22T16h15m46s__1w_spaced_1d_smallest_24.0_50log.dat
2022-07-22 16:15:47,969 INFO : Rapport = (1, 'w', 1, 'd', 86400, 50, True) :  rows=96390, lines=41145, data=./output/github_cyber_2022-07-22T16h15m46s__1w_spaced_1d_smallest_24.0_50log.dat, meta=./output/github_cyber_2022-07-22T16h15m46s__1w_spaced_1d_smallest_24.0_50log.txt

2022-07-22 16:15:48,058 INFO : Arg : (1, 'w', 2, 'd', 21600, 50, False)
2022-07-22 16:15:48,270 INFO : From 599881 to 40377 samples
2022-07-22 16:15:48,317 INFO : 30548 columns for the sparse data
2022-07-22 16:15:48,360 INFO : Latest commit found at 2022-04-21 12:46:47+00:00


Time window loop:   0%|          | 0/1 [00:00<?, ?it/s]

Repository loop:   0%|          | 0/50 [00:00<?, ?it/s]

2022-07-22 16:16:37,621 INFO : 30538 features and 14860 rows for ./output/github_cyber_2022-07-22T16h16m37s__1w_spaced_2d_smallest_6.0_50uni.dat
2022-07-22 16:16:37,628 INFO : Rapport = (1, 'w', 2, 'd', 21600, 50, False) :  rows=14860, lines=30538, data=./output/github_cyber_2022-07-22T16h16m37s__1w_spaced_2d_smallest_6.0_50uni.dat, meta=./output/github_cyber_2022-07-22T16h16m37s__1w_spaced_2d_smallest_6.0_50uni.txt

2022-07-22 16:16:37,728 INFO : Arg : (1, 'w', 2, 'd', 21600, 50, True)
2022-07-22 16:16:37,989 INFO : From 599881 to 349190 samples
2022-07-22 16:16:38,299 INFO : 164584 columns for the sparse data
2022-07-22 16:16:38,659 INFO : Latest commit found at 2022-04-23 16:44:19+00:00


Time window loop:   0%|          | 0/1 [00:00<?, ?it/s]

Repository loop:   0%|          | 0/50 [00:00<?, ?it/s]

2022-07-22 16:18:10,254 INFO : 164570 features and 48215 rows for ./output/github_cyber_2022-07-22T16h18m09s__1w_spaced_2d_smallest_6.0_50log.dat
2022-07-22 16:18:10,284 INFO : Rapport = (1, 'w', 2, 'd', 21600, 50, True) :  rows=48215, lines=164570, data=./output/github_cyber_2022-07-22T16h18m09s__1w_spaced_2d_smallest_6.0_50log.dat, meta=./output/github_cyber_2022-07-22T16h18m09s__1w_spaced_2d_smallest_6.0_50log.txt

2022-07-22 16:18:10,378 INFO : Arg : (1, 'w', 2, 'd', 86400, 50, False)
2022-07-22 16:18:10,593 INFO : From 599881 to 52714 samples
2022-07-22 16:18:10,648 INFO : 13426 columns for the sparse data
2022-07-22 16:18:10,704 INFO : Latest commit found at 2022-04-19 22:04:06+00:00


Time window loop:   0%|          | 0/1 [00:00<?, ?it/s]

Repository loop:   0%|          | 0/50 [00:00<?, ?it/s]

2022-07-22 16:19:01,207 INFO : 13425 features and 15234 rows for ./output/github_cyber_2022-07-22T16h19m01s__1w_spaced_2d_smallest_24.0_50uni.dat
2022-07-22 16:19:01,219 INFO : Rapport = (1, 'w', 2, 'd', 86400, 50, False) :  rows=15234, lines=13425, data=./output/github_cyber_2022-07-22T16h19m01s__1w_spaced_2d_smallest_24.0_50uni.dat, meta=./output/github_cyber_2022-07-22T16h19m01s__1w_spaced_2d_smallest_24.0_50uni.txt

2022-07-22 16:19:01,326 INFO : Arg : (1, 'w', 2, 'd', 86400, 50, True)
2022-07-22 16:19:01,621 INFO : From 599881 to 349190 samples
2022-07-22 16:19:01,913 INFO : 41146 columns for the sparse data
2022-07-22 16:19:02,271 INFO : Latest commit found at 2022-04-23 16:44:19+00:00


Time window loop:   0%|          | 0/1 [00:00<?, ?it/s]

Repository loop:   0%|          | 0/50 [00:00<?, ?it/s]

2022-07-22 16:20:33,859 INFO : 41145 features and 48215 rows for ./output/github_cyber_2022-07-22T16h20m33s__1w_spaced_2d_smallest_24.0_50log.dat
2022-07-22 16:20:33,887 INFO : Rapport = (1, 'w', 2, 'd', 86400, 50, True) :  rows=48215, lines=41145, data=./output/github_cyber_2022-07-22T16h20m33s__1w_spaced_2d_smallest_24.0_50log.dat, meta=./output/github_cyber_2022-07-22T16h20m33s__1w_spaced_2d_smallest_24.0_50log.txt

