This script performs parsing of the aggregated git commit logs for Java projects mentioned [here.](https://github.com/Semiu/java-codesecurity/blob/main/data-scraping-curating/Generating%20commit%20history%20log%20for%20each%20of%20the%20projects.ipynb)

In [1]:
#Import the required libraries
import pandas as pd
import os
import re

In [3]:
#Initialize a commit dict 
def get_commit():
    commit = {}
    return commit

In [4]:
#This function parses commits given the output from readline op
#This function is based on the following code: https://github.com/johnkchiu/GitLogParser/blob/master/gitLogParser.py
def parseCommit(commitLines):
    #Initialize the list to store commit dictionaries
    commits = []
    # dict to store commit data
    commit = get_commit()
    # iterate lines and save
    for nextLine in commitLines:
        # ignore empty lines
        if nextLine == '' or nextLine == '\n':
            pass
        # commit xxxx
        elif bool(re.match('commit', nextLine, re.IGNORECASE)):
            ## new commit, so re-initialize
            if len(commit) != 0:
                #store the previous commit
                commits.append(commit)
                commit = get_commit()
            #store the hash... though i dont think we need this
            commit['hash']= re.match('commit (.*)', nextLine, re.IGNORECASE).group(1)
        #we dont fetch merges
        # Merge: xxxx xxxx
        elif bool(re.match('merge:', nextLine, re.IGNORECASE)):
            pass
        #fetch authors
        elif bool(re.match('author:', nextLine, re.IGNORECASE)):
            # Author: xxxx <xxxx@xxxx.com>
            m = re.compile('Author: (.*) <(.*)>').match(nextLine)
            commit['author'] = m.group(1)
            commit['email'] = m.group(2)
        elif bool(re.match('date:', nextLine, re.IGNORECASE)):
            # Date: xxx
            commit['date'] = nextLine.split('   ')[-1][:-1]
        elif bool(re.match('    ', nextLine, re.IGNORECASE)):
            # (4 empty spaces)
            if commit.get('message') is None:
                commit['message'] = nextLine
            else:
                commit['message'] += nextLine
        # changes to files        
        elif bool(re.match('[-0-9]*(\t)[-0-9]*(\t)*', nextLine, re.IGNORECASE)):
            try:
                addition,subtraction,file_name=nextLine.strip().split('\t')
            #if this happens then that meant there was an error in
            #output of the file and is just a
            #continuation of the message of the commit
            except:
                commit['message'] += nextLine
                continue
            #they put - line diff couldnt be computed    
            addition=int(addition.replace('-','0'))
            subtraction=int(subtraction.replace('-','0'))
            if commit.get('files') is None:
                commit['files']=[]
            commit['files'].append([addition,subtraction,file_name])
        else:
            commit['message'] += nextLine
            #print(commit)
            #print ('ERROR: Unexpected Line: ' + nextLine)
    #if there is only one commit
    if len(commits)==0 and commit.get('hash') is not None:
        commits.append(commit)
        
    return commits

In [5]:
#File path
COMMIT_PATH = 'F:\\Dataset\\githubrepo\\Java\\'

In [9]:
#Convert the whole commits to csv and save to local machine
commits_list = parseCommit(open(COMMIT_PATH+'java_raw_commits.txt', encoding='utf-8', errors='ignore').readlines())
#To dataframe
commit_dataFrame = pd.DataFrame(data = commits_list)
#Save as CSV
commit_dataFrame.to_csv(COMMIT_PATH+'full_commit_java.csv', encoding ='utf-8')

In [7]:
commits_list

[{'hash': 'd72820b1d0547874071533f2d5c73047087dc935',
  'author': 'CyC2018',
  'email': 'zhengyc101@163.com',
  'date': 'Fri Nov 20 01:55:16 2020 +0800',
  'message': '    auto commit\n'},
 {'hash': '68a522b872e1876886f54d2916caf71b970e209e',
  'author': 'CyC2018',
  'email': 'zhengyc101@163.com',
  'date': 'Fri Nov 20 01:43:00 2020 +0800',
  'message': '    auto commit\n'},
 {'hash': 'efc0bcd1d6199448c323549fdc45310750b48c85',
  'author': 'CyC2018',
  'email': 'zhengyc101@163.com',
  'date': 'Fri Nov 20 01:42:38 2020 +0800',
  'message': '    Merge pull request #1011 from qizhengzhong/master\n    \n    文本小错误更改\n'},
 {'hash': '5ee7c7c750e9ecffaea12f97d5b05b2445e7c007',
  'author': 'Chris Qi',
  'email': 'chris.qi@aciworldwide.com',
  'date': 'Wed Nov 18 20:24:58 2020 +0000',
  'message': '    Small content update\n'},
 {'hash': '600256bde2721df27b1d3b0856cc8269dc70f8db',
  'author': 'Chris Qi',
  'email': 'chris.qi@aciworldwide.com',
  'date': 'Wed Nov 18 20:22:37 2020 +0000',
  'messa

In [10]:
commit_dataFrame.head()

Unnamed: 0,hash,author,email,date,message
0,d72820b1d0547874071533f2d5c73047087dc935,CyC2018,zhengyc101@163.com,Fri Nov 20 01:55:16 2020 +0800,auto commit\n
1,68a522b872e1876886f54d2916caf71b970e209e,CyC2018,zhengyc101@163.com,Fri Nov 20 01:43:00 2020 +0800,auto commit\n
2,efc0bcd1d6199448c323549fdc45310750b48c85,CyC2018,zhengyc101@163.com,Fri Nov 20 01:42:38 2020 +0800,Merge pull request #1011 from qizhengzhong...
3,5ee7c7c750e9ecffaea12f97d5b05b2445e7c007,Chris Qi,chris.qi@aciworldwide.com,Wed Nov 18 20:24:58 2020 +0000,Small content update\n
4,600256bde2721df27b1d3b0856cc8269dc70f8db,Chris Qi,chris.qi@aciworldwide.com,Wed Nov 18 20:22:37 2020 +0000,Small content update\n


In [11]:
commit_dataFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541607 entries, 0 to 541606
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   hash     541607 non-null  object
 1   author   541607 non-null  object
 2   email    541607 non-null  object
 3   date     541607 non-null  object
 4   message  541177 non-null  object
dtypes: object(5)
memory usage: 20.7+ MB


In [17]:
#Extracts relevant information (hash and message) from the parsed commits
def get_information(commits):
    #obj for storing all the info
    info={}
      #hash
    info['sha']=[]
    #commit message
    info['message']=[]
    for commit in commits:
        info['sha'].append(commit['hash'])
        try:
            msgs=commit['message'].replace(',',' ')
            msgs=msgs.replace('\n',' ')
            info['message'].append(msgs)
        except:
            info['message'].append('')
    return info

In [18]:
#Call the get_information (commits_list, from parseCommit function is the argument)
info = get_information(commits_list)

In [14]:
info

{'message': ['    auto commit ',
  '    auto commit ',
  '    Merge pull request #1011 from qizhengzhong/master          文本小错误更改 ',
  '    Small content update ',
  '    Small content update ',
  "    Merge branch 'master' of github.com:CyC2018/CS-Notes ",
  '    auto commit ',
  '    auto commit ',
  '    Merge pull request #1017 from Xunzhuo/patch-4          修复ArrayList扩容机制的问题 ',
  '    修复ArrayList扩容机制的问题          对于奇数Old Capacity，右移操作：new Capacity = 1.5*Old Capacity - 0.5 ',
  '    auto commit ',
  '    Merge pull request #1015 from Xunzhuo/patch-1          补充switch的使用 ',
  "    Merge branch 'master' of github.com:CyC2018/CS-Notes ",
  '    auto commit ',
  '    Merge pull request #1016 from Xunzhuo/patch-2          fix markdown  errors ',
  '    auto commit ',
  '    fix markdown  errors ',
  '    补充switch的使用 ',
  '    auto commit ',
  '    auto commit ',
  '    auto commit ',
  "    Merge branch 'master' of github.com:CyC2018/CS-Notes ",
  '    auto commit ',
  '    auto commit ',

In [21]:
#To dataframe
info_dataFrame = pd.DataFrame(data = info)
#Save as CSV
info_dataFrame.to_csv(COMMIT_PATH+'java_commit_info.csv', encoding ='utf-8', index=False)

In [19]:
#To dataframe
info_dataFrame = pd.DataFrame(data = info)

In [20]:
info_dataFrame.head()

Unnamed: 0,sha,message
0,d72820b1d0547874071533f2d5c73047087dc935,auto commit
1,68a522b872e1876886f54d2916caf71b970e209e,auto commit
2,efc0bcd1d6199448c323549fdc45310750b48c85,Merge pull request #1011 from qizhengzhong...
3,5ee7c7c750e9ecffaea12f97d5b05b2445e7c007,Small content update
4,600256bde2721df27b1d3b0856cc8269dc70f8db,Small content update
