In [18]:
import pandas as pd
# Read the exported git log into a DataFrame. Column names are supplied
# explicitly via `names`, the file is decoded as UTF-16, and lines pandas
# cannot parse are skipped instead of raising.
log_columns = ['hash', 'release', 'message', 'timestamp', 'author']
df = pd.read_csv(
    'git_log.csv',
    sep=',',
    encoding='utf-16',
    names=log_columns,
    parse_dates=['timestamp'],
    on_bad_lines='skip',
)

In [19]:
# Define keywords for each change type with regex patterns to match at start of message
import re

# Commit-message prefixes mapped to the regex that recognises them. Every
# pattern is anchored at the start of the message (`^`) and ends on a word
# boundary (`\b`), so "fixes ..." does not count as "fix".
change_types = {
    'build': r'^build\b',
    'ci': r'^ci\b',
    'docs': r'^docs\b',
    'feat': r'^feat\b',
    'fix': r'^fix\b',
    'perf': r'^perf\b',
    'refactor': r'^refactor\b',
    'test': r'^test\b',
    'release': r'^release\b',
    'revert': r'^revert\b',
    'chore': r'^(chore|chare)\b',  # also accepts the misspelling "chare"
    'update': r'^update\b',
}


def extract_change_type(message):
    """Return the change type whose prefix pattern matches ``message``.

    Matching is case-insensitive and tries the entries of ``change_types``
    in insertion order; the first match wins.

    Parameters
    ----------
    message : str
        Commit message to classify. Non-string values (for example ``None``
        or the float NaN pandas uses for an empty field) are treated as
        having no recognisable prefix.

    Returns
    -------
    str
        The matching key of ``change_types``, or ``'Not Found'`` when no
        pattern matches.
    """
    # re.search raises TypeError on non-strings such as NaN; classify those
    # as unmatched instead of aborting the whole .apply() pass.
    if not isinstance(message, str):
        return 'Not Found'

    for change, pattern in change_types.items():
        if re.search(pattern, message, re.IGNORECASE):
            return change
    return 'Not Found'

# Classify every commit message and store the resulting label in a new column
df['change_type'] = df['message'].map(extract_change_type)

# Preview the first rows to sanity-check the assigned labels
df.head()

Unnamed: 0,hash,release,message,timestamp,author,change_type
0,ee1d06c8fa1f0a6bf2e6ffb189c15266bbbd1d76,(tag: 19.0.0-rc.0),release: cut the v19.0.0-rc.0 release,2024-10-30 12:59:28-07:00,Alex Rickabaugh,release
1,48eac2320d66bca2b07d29f65b15636f4abadb50,(tag: 18.2.10),release: cut the v18.2.10 release,2024-10-30 12:03:32-07:00,Alex Rickabaugh,release
2,dcd16b43f31177a269317f7764ef38559b14fd66,,refactor(migrations): Make the explicit standa...,2024-10-29 15:04:20-07:00,Matthieu Riegler,refactor
3,3230d78c545de12bee74dd0527765b5d8e7d6a35,,refactor(core): introduce `ngServerMode` as gl...,2024-10-29 12:22:55+00:00,Alan Agius,refactor
4,616b411a6d94d3dbc3e072b91c1194466c0a1add,,fix(migrations): properly migrate output alias...,2024-10-29 15:59:01+01:00,Pawel Kozlowski,fix


In [20]:
# Parse the timestamp column into timezone-aware datetimes in UTC.
# `utc=True` already converts every offset to UTC, so a separate
# tz_convert('UTC') step would be a no-op and is omitted. Values that cannot
# be parsed become NaT because of errors='coerce'.
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)

In [None]:
# Forward-fill the release column: every row without a release label takes
# the most recent non-null label from the rows above it.
release_labels = df['release']
df['release'] = release_labels.ffill()

In [22]:
# Drop rows whose timestamp is missing (the original note recorded 20 such
# rows). Reassigning the result instead of using inplace=True keeps the step
# chainable and avoids mutating the frame behind the reader's back.
df = df.dropna(subset=['timestamp'])

In [23]:
# Show the rows whose message matched none of the known prefixes. A bare
# expression is rendered only when it is the last statement of a cell, so the
# rows are passed to display() explicitly (display is provided by the
# IPython/Jupyter kernel without an import).
not_found_mask = df['change_type'] == 'Not Found'
display(df[not_found_mask])

# Keep only the rows that were assigned a change type
df = df[~not_found_mask]


In [24]:
# Tally how many commits carry each change type (most frequent first)
change_type_column = df['change_type']
change_type_counts = change_type_column.value_counts()
change_type_counts

docs        11155
fix          9521
refactor     7456
build        6790
feat         3303
test         2227
ci           1839
chore        1146
release       934
revert        858
perf          540
update         32
Name: change_type, dtype: int64

In [25]:
# Remove duplicate rows using the timestamp (not the hash) as the key,
# keeping the first row for each timestamp.
# NOTE(review): distinct commits that share an identical timestamp are
# collapsed by this as well — confirm that is intended.
# The result is reassigned rather than using inplace=True: `df` was produced
# by boolean filtering, and in-place edits on such a frame may emit
# SettingWithCopyWarning depending on the pandas version.
df = df.drop_duplicates(subset='timestamp', keep='first')

In [None]:
# Keep only rows with a timestamp on or after 2017-03-01; everything earlier
# is dropped.
on_or_after_cutoff = df['timestamp'] >= '2017-03-01'
df = df.loc[on_or_after_cutoff]

In [31]:
# Release labels whose rows are excluded from the analysis. Matching is done
# with an exact string comparison (Series.isin), so each entry must equal the
# label in the `release` column character for character.
releases_to_remove = [
    '(tag: 8.0.0-beta.10)',
    '(tag: zone.js-0.10.1)',
    '(tag: ngcontainer_0.9.0)',
    # The original entry lacked the "(tag: " prefix used by every other tag
    # in this list, so it could not match the tag label. The prefixed form is
    # added; the original literal is kept in case it occurs in the data.
    '(tag: ngcontainer_0.5.0)',
    'ngcontainer_0.5.0)',
    '(tag: ngcontainer_0.4.0)',
    '(tag: ngcontainer_0.3.0)',
    '(tag: ngcontainer_0.3.2)',
    '(tag: 6.0.0-beta.2)',
    '(tag: 6.0.0-beta.8)',
    '(tag: 5.1.0-beta.0)',
    '(tag: ngcontainer_0.3.3)',
    '(origin/4.3.x)',
    '(tag: patch_sync)',
    # NOTE(review): compare with '(origin/4.3.x)' above — possibly meant
    # '(origin/5.2.x)'; verify against the values present in the data.
    '(origin/5.2x)'
]

# Keep only the rows whose release label is not in the removal list
is_excluded_release = df['release'].isin(releases_to_remove)
df = df[~is_excluded_release]

In [34]:
# Sanity check: no rows labelled '(origin/5.2x)' should remain, so this is
# expected to render an empty frame.
is_origin_52x = df['release'] == '(origin/5.2x)'
df[is_origin_52x]

Unnamed: 0,hash,release,message,timestamp,author,change_type


In [33]:
# Write the processed log to disk; the DataFrame index is not written
df.to_csv(path_or_buf='git_log_processed.csv', index=False)