In [129]:
import pandas as pd
# Load data
df = pd.read_csv(
    'git_log.csv',
    sep=',',
    names=['hash', 'release', 'message', 'timestamp', 'author'],
    encoding='utf-16',
    parse_dates=['timestamp'],
    on_bad_lines='skip'
)
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)

# Convert to UTC
df['timestamp'] = df['timestamp'].dt.tz_convert('UTC')

#Drop everything from 2017-03
df = df[df['timestamp'] >= '2017-03-01']
# Remove duplicates based not on hash but on timestamp and prefer those that don't have release
df = df.sort_values('release').drop_duplicates('timestamp', keep='first')


In [130]:
# Define keywords for each change type with regex patterns to match at start of message
import re

change_types = {
    'build': r'^build\b',
    'ci': r'^ci\b',
    'docs': r'^docs\b',
    'feat': r'^feat\b',
    'fix': r'^fix\b',
    'perf': r'^perf\b',
    'refactor': r'^refactor\b',
    'test': r'^test\b',
    'release': r'^release\b',
    'revert': r'^revert\b',
    'chore': r'^(chore|chare)\b',
    'update': r'^update\b',
    'style': r'^style\b',
}

# Function to assign change type based on message content, case-insensitive
def extract_change_type(message):
    for change, pattern in change_types.items():
        if re.search(pattern, message, re.IGNORECASE):  # Case-insensitive match
            return change
    return 'Not Found'  # Return 'Not Found' if no keyword is found

# Apply function to create new column
df['change_type'] = df['message'].apply(extract_change_type)

# Display the DataFrame to check results
df.head()

Unnamed: 0,hash,release,message,timestamp,author,change_type
19981,7ca17a624988bdcf403f45827feffb908b4c3379,(origin/10.1.x),build(docs-infra): upgrade cli command docs so...,2020-10-21 09:20:37+00:00,George Kalpakas,build
37213,dfe2bad6633b9231f97d5a38e946c389c9f4cfbb,(origin/4.3.x),build: Add GitHub scripts for rebasing PRs (#1...,2017-07-26 19:37:14+00:00,Mis╠îko Hevery,build
35269,5c89d6bffab52176e4afdfff1b71645cbba9bd47,(origin/5.2x),feat(core): support metadata reflection for na...,2018-02-20 06:34:21+00:00,Trotyl,feat
11512,4a7e7242382e069b487ed89cdc42e5368af0c007,(origin/image-directive-13.3.x),"refactor(compiler-cli): fix ""for to"" typo in c...",2022-05-23 20:54:54+00:00,dario-piotrowicz,refactor
21757,345940bbc1b726c52dc9966801bd32dd051c82cb,(tag: 10.0.0),release: cut the v10.0.0 release,2020-06-24 18:46:57+00:00,Misko Hevery,release


In [131]:
change_types = {
    'build': r'build\b',
    'ci': r'ci\b',
    'docs': r'docs\b',
    'feat': r'feat\b',
    'fix': r'fix\b',
    'perf': r'perf\b',
    'refactor': r'refactor\b',
    'test': r'test\b',
    'release': r'release\b',
    'revert': r'revert\b',
    'chore': r'(chore|chare)\b',
    'update': r'update\b',
    'style': r'style\b',
}
df['change_type'] = df['message'].apply(extract_change_type)
df[df['change_type'] == 'Not Found']

Unnamed: 0,hash,release,message,timestamp,author,change_type
2012,6d95131d84f27dae56b5fbe47a5e85b3f1bad5be,,Supply chain attack demo for Google VRP,2024-05-20 10:56:56+00:00,ghost,Not Found
11602,a0be043f8164b10e1d6516c88674f8d4cd0a67f6,,reactor(compiler-cli): account for babel types...,2022-05-20 14:29:56+00:00,Paul Gschwendtner,Not Found
18526,d1c954e25ccc77d0ef88c84de32cd2daab847b45,,Remove myself from language-service reviews (#...,2021-01-21 16:03:47+00:00,hafiz,Not Found
20162,23fc2b43ac7b357386a52f5b3dcf51726fc1954e,,updated sajee's profile (#39019),2020-09-28 10:42:01+00:00,sajeetharan,Not Found
24697,98a96608a687d5fbb95497e2d39921f8d4baaee4,,Initial commit for Angular DevTools,2020-01-27 18:40:18+00:00,mgechev,Not Found
37543,a4fae8c405da407a6b8414d0a6d0a680a6b7b877,,aio: debounce search and delay index building ...,2017-07-20 16:51:40+00:00,Pete Bacon Darwin,Not Found
38347,966eb2fbd0e270d73d53101f2e61f484e190d4d6,,aio: add h1 title to floating table of content...,2017-05-24 20:09:55+00:00,Pete Bacon Darwin,Not Found
38864,cc1ed77dd85afb03a381932cfaa0baf5feb22eeb,,consolidated and moved api-list scss,2017-04-12 02:37:03+00:00,Stefanie Fluin,Not Found
38865,a6545ddd4d4670e34c1542f244497c116f92f227,,filetree and subsection edits,2017-04-12 02:02:29+00:00,Stefanie Fluin,Not Found
38867,ad9a3a2d3bbb1e99cc7becf2f79469c614a9884e,,features page and code/table fixes,2017-04-11 20:24:42+00:00,Stefanie Fluin,Not Found


In [132]:
# sort by timestamp descendin
df = df.sort_values(by='timestamp', ascending=False)

In [133]:
df['release'] = df['release'].ffill()

In [134]:
# drop 20 empty cells in Timestamp column
df.dropna(subset=['timestamp'], inplace=True)

In [135]:
#print where Not Found
#df[df['change_type'] == 'Not Found']
# Drop rows where change_type is 'Not Found'
#df = df[df['change_type'] != 'Not Found']


In [136]:
# Count the tags for each change type
change_type_counts = df['change_type'].value_counts()
change_type_counts

docs         6484
fix          4819
build        4375
refactor     3933
feat         1714
test         1122
ci           1032
release       873
perf          341
style          75
Not Found      11
update          8
revert          3
chore           2
Name: change_type, dtype: int64

In [137]:
# List of releases to remove
releases_to_remove = [
    '(tag: 8.0.0-beta.10)',
    '(tag: zone.js-0.10.1)',
    '(tag: ngcontainer_0.9.0)',
    'ngcontainer_0.5.0)',
    '(tag: ngcontainer_0.4.0)',
    '(tag: ngcontainer_0.3.0)',
    '(tag: ngcontainer_0.3.2)',
    '(tag: 6.0.0-beta.2)',
    '(tag: 6.0.0-beta.8)',
    '(tag: 5.1.0-beta.0)',
    '(tag: ngcontainer_0.3.3)',
    '(origin/4.3.x)',
    '(tag: patch_sync)',
    '(origin/5.2x)'
]

# Filter the DataFrame to remove rows with specified releases
#df = df[~df['release'].isin(releases_to_remove)]

In [138]:
#remove from "release" -next.10 and simmilar so anything after "-" but keep the ) at the end
df['release'] = df['release'].str.replace(r'-.*', ')')

  df['release'] = df['release'].str.replace(r'-.*', ')')


In [139]:
# CSV export
df.to_csv('git_log_processed.csv', index=False)