In [49]:
import pandas as pd
# Load data
df = pd.read_csv(
    'git_log.csv',
    sep=',',
    names=['hash', 'release', 'message', 'timestamp', 'author'],
    encoding='utf-16',
    parse_dates=['timestamp'],
    on_bad_lines='skip'
)

In [None]:
# Define keywords for each change type with regex patterns to match at start of message
import re
#[17.3.x]: build: follow-up fixes for AIO light...	

change_types = {
    'build': r'^build\b',
    'ci': r'^ci\b',
    'docs': r'^docs\b',
    'feat': r'^feat\b',
    'fix': r'^fix\b',
    'perf': r'^perf\b',
    'refactor': r'^refactor\b',
    'test': r'^test\b',
    'release': r'^release\b',
    'revert': r'^revert\b',
    'chore': r'^(chore|chare)\b',
    'update': r'^update\b',
}

# Function to assign change type based on message content, case-insensitive
def extract_change_type(message):
    for change, pattern in change_types.items():
        if re.search(pattern, message, re.IGNORECASE):  # Case-insensitive match
            return change
    return 'Not Found'  # Return 'Not Found' if no keyword is found

# Apply function to create new column
df['change_type'] = df['message'].apply(extract_change_type)

# Display the DataFrame to check results
df.head()

Unnamed: 0,hash,release,message,timestamp,author,change_type
0,ee1d06c8fa1f0a6bf2e6ffb189c15266bbbd1d76,(tag: 19.0.0-rc.0),release: cut the v19.0.0-rc.0 release,2024-10-30 19:59:28+00:00,Alex Rickabaugh,release
1,48eac2320d66bca2b07d29f65b15636f4abadb50,(tag: 18.2.10),release: cut the v18.2.10 release,2024-10-30 19:03:32+00:00,Alex Rickabaugh,release
2,dcd16b43f31177a269317f7764ef38559b14fd66,(tag: 18.2.10),refactor(migrations): Make the explicit standa...,2024-10-29 22:04:20+00:00,Matthieu Riegler,refactor
3,3230d78c545de12bee74dd0527765b5d8e7d6a35,(tag: 18.2.10),refactor(core): introduce `ngServerMode` as gl...,2024-10-29 12:22:55+00:00,Alan Agius,refactor
4,616b411a6d94d3dbc3e072b91c1194466c0a1add,(tag: 18.2.10),fix(migrations): properly migrate output alias...,2024-10-29 14:59:01+00:00,Pawel Kozlowski,fix


In [56]:
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)

# Convert to UTC
df['timestamp'] = df['timestamp'].dt.tz_convert('UTC')

In [57]:
df['release'] = df['release'].ffill()

In [58]:
# drop 20 empty cells in Timestamp column
df.dropna(subset=['timestamp'], inplace=True)

In [42]:
#print where Not Found
df[df['change_type'] == 'Not Found']
# Drop rows where change_type is 'Not Found'
df = df[df['change_type'] != 'Not Found']


In [44]:
# Remove duplicates based not on hash but on timestamp
df.drop_duplicates(subset='timestamp', keep='first', inplace=True)

In [60]:
#Drop everything from 2017-03
df = df[df['timestamp'] >= '2017-03-01']

In [62]:
# Count the tags for each change type
change_type_counts = df['change_type'].value_counts()
change_type_counts

docs         10442
fix           7594
build         6486
refactor      6454
feat          2106
test          2063
ci            1762
release        930
revert         767
perf           484
Not Found      261
update           8
chore            2
Name: change_type, dtype: int64

In [46]:
# List of releases to remove
releases_to_remove = [
    '(tag: 8.0.0-beta.10)',
    '(tag: zone.js-0.10.1)',
    '(tag: ngcontainer_0.9.0)',
    'ngcontainer_0.5.0)',
    '(tag: ngcontainer_0.4.0)',
    '(tag: ngcontainer_0.3.0)',
    '(tag: ngcontainer_0.3.2)',
    '(tag: 6.0.0-beta.2)',
    '(tag: 6.0.0-beta.8)',
    '(tag: 5.1.0-beta.0)',
    '(tag: ngcontainer_0.3.3)',
    '(origin/4.3.x)',
    '(tag: patch_sync)',
    '(origin/5.2x)'
]

# Filter the DataFrame to remove rows with specified releases
df = df[~df['release'].isin(releases_to_remove)]

In [61]:
#Show lines where "Not Found"
df[df['change_type'] == 'Not Found']

Unnamed: 0,hash,release,message,timestamp,author,change_type
2012,6d95131d84f27dae56b5fbe47a5e85b3f1bad5be,(tag: 18.0.0-rc.3),Supply chain attack demo for Google VRP,2024-05-20 10:56:56+00:00,ghost,Not Found
2820,2ad5dcf3f02ef1ac4bf894ccc1e73ded301cb8b9,(tag: 17.3.2),[17.3.x]: build: follow-up fixes for AIO light...,2024-03-27 14:46:46+00:00,Paul Gschwendtner,Not Found
3652,5cd00f167c4209e90bd37cc6b78e7c22570d861b,(tag: 17.1.2),InMemoryBackendConfigArgs.passThruUnknownUrl: ...,2023-10-18 20:05:13+00:00,Dwouglas Mhagnum,Not Found
3653,c213a4e15a594ff141cf312ad301128e7ed4127c,(tag: 17.1.2),InMemoryBackendConfigArgs.passThruUnknownUrl: ...,2023-10-18 20:05:13+00:00,Dwouglas Mhagnum,Not Found
9079,6ade57cfddf50c3b617f0cdc1400976a9dd396a7,(tag: 15.1.0-next.3),fixup! docs: release notes for the v15.0.4 rel...,2022-12-14 19:17:06+00:00,Jessica Janiuk,Not Found
...,...,...,...,...,...,...
39251,26efa3a25ced4b597deb0349c504f53985d18695,(tag: 4.0.0-rc.4),style(aio): tidy up SCSS files,2017-03-09 16:30:26+00:00,Peter Bacon Darwin,Not Found
39304,0e9277b4c33fddbc50a2f1bbb03c7da9e586491c,(tag: 4.0.0-rc.3),style(aio): change mock value for consistency,2017-03-01 13:16:38+00:00,Georgios Kalpakas,Not Found
39321,ad3b44aef736c3134018456f747d8c54cef842a5,(tag: 4.0.0-rc.3),RendererV2 -> Renderer2 rename (#14998),2017-03-08 00:36:12+00:00,Tobias Bosch,Not Found
39327,b017fbe48effd67e58c55b23da80b9d7825b6a93,(tag: 4.0.0-rc.3),style(aio): rename local variable,2017-03-06 14:17:49+00:00,Peter Bacon Darwin,Not Found


In [47]:
# CSV export
df.to_csv('git_log_processed.csv', index=False)