In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the per-release table for abandoned projects (one row per release; see the columns in the output below).
# NOTE(review): this is an absolute, machine-specific path — the notebook will not run on another
# machine without editing it; consider a configurable data directory.
abandoned_df = pd.read_csv('/Users/kaziamithasan/Desktop/Research/msr25/msr25-mining-challenge/code/abandoned_projects_with_labels.csv')

In [3]:
abandoned_df


Unnamed: 0,artifact_id,release_id,release_version,release_timestamp,interval,is_abandoned,label,time_gap
0,academy.compose.companion:multi-fab,academy.compose.companion:multi-fab:1.0,1.0,2021-01-05 21:01:38,,1,Start,
1,academy.compose.companion:multi-fab,academy.compose.companion:multi-fab:1.2,1.2,2021-01-07 20:44:22,1.988009,1,Active,1 days 23:42:44
2,academy.compose.companion:multi-fab,academy.compose.companion:multi-fab:1.2.3,1.2.3,2021-01-14 06:40:25,6.413924,1,Active,6 days 09:56:03
3,ae.teletronics.ejabberd:EjabberdXMLRPCClient,ae.teletronics.ejabberd:EjabberdXMLRPCClient:1...,1.0.2,2016-10-27 04:48:11,,1,Start,
4,ae.teletronics.ejabberd:EjabberdXMLRPCClient,ae.teletronics.ejabberd:EjabberdXMLRPCClient:1...,1.0.4,2017-03-12 08:29:30,136.153692,1,Active,136 days 03:41:19
...,...,...,...,...,...,...,...,...
2895137,zone.wmj:user-agent-util,zone.wmj:user-agent-util:1.2.10,1.2.10,2022-04-23 16:16:18,43.274039,1,Active,43 days 06:34:37
2895138,zone.wmj:user-agent-util,zone.wmj:user-agent-util:2.0.0,2.0.0,2022-06-16 07:45:05,53.644988,1,Active,53 days 15:28:47
2895139,zw.co.paynow:java-sdk,zw.co.paynow:java-sdk:1.0.0,1.0.0,2019-01-26 13:26:11,,1,Start,
2895140,zw.co.paynow:java-sdk,zw.co.paynow:java-sdk:1.1.0,1.1.0,2019-03-27 08:12:50,59.782396,1,Active,59 days 18:46:39


In [4]:
# Parse the release timestamps so that datetime arithmetic works below
abandoned_df['release_timestamp'] = pd.to_datetime(abandoned_df['release_timestamp'])

# Release rhythm: all release timestamps of an artifact collected into one list
rhythm_df = (
    abandoned_df
    .groupby('artifact_id')['release_timestamp']
    .apply(list)
    .reset_index(name='release_rhythm')
)

# Per-artifact release count plus first/last release timestamp
artifact_speed = (
    abandoned_df
    .groupby('artifact_id')
    .agg(
        release_count=('release_id', 'size'),
        first_release=('release_timestamp', 'min'),
        last_release=('release_timestamp', 'max'),
    )
    .reset_index()
)

# Whole days between first and last release. A value of 0 (first and last release
# less than 24 hours apart) is bumped to 1 so the division below never hits zero.
artifact_speed['days_interval'] = (
    (artifact_speed['last_release'] - artifact_speed['first_release'])
    .dt.days
    .replace(0, 1)
)

# Speed = releases per day over the artifact's whole lifetime
artifact_speed['speed'] = artifact_speed['release_count'] / artifact_speed['days_interval']

# Final feature table: rhythm list + release count + speed, one row per artifact
df_with_features = rhythm_df.merge(
    artifact_speed[['artifact_id', 'release_count', 'speed']],
    on='artifact_id',
)
df_with_features


Unnamed: 0,artifact_id,release_rhythm,release_count,speed
0,academy.compose.companion:multi-fab,"[2021-01-05 21:01:38, 2021-01-07 20:44:22, 202...",3,0.375000
1,ae.teletronics.ejabberd:EjabberdXMLRPCClient,"[2016-10-27 04:48:11, 2017-03-12 08:29:30, 201...",4,0.028986
2,ae.teletronics.nlp:categorisation,"[2016-04-11 20:23:38, 2016-04-19 06:22:20, 201...",6,0.022388
3,ae.teletronics.nlp:entityextraction,"[2016-06-15 06:49:12, 2016-06-17 09:45:41, 201...",5,0.046729
4,ae.teletronics.nlp:language-detector,"[2016-02-16 10:54:43, 2016-02-22 11:07:05, 201...",6,0.086957
...,...,...,...,...
206763,zone.refactor.spring:hateoas,"[2019-10-22 05:17:14, 2019-11-20 16:29:03, 201...",6,0.142857
206764,zone.refactor.spring:validation,"[2019-10-26 09:03:16, 2019-11-25 17:21:59, 201...",5,0.161290
206765,zone.stefan.dev:geocode,"[2021-01-08 15:30:37, 2021-01-08 17:44:45]",2,2.000000
206766,zone.wmj:user-agent-util,"[2021-09-10 17:26:29, 2021-09-12 13:00:21, 202...",13,0.046763


In [5]:
df_with_features.describe()

Unnamed: 0,release_count,speed
count,206768.0,206768.0
mean,14.001886,0.305021
std,39.767212,1.925188
min,2.0,0.000697
25%,3.0,0.012821
50%,5.0,0.032258
75%,12.0,0.111111
max,2173.0,277.0


In [19]:
# Step 1: Define a function to segment the project lifecycle into phases
# Column layout of the frame returned by calculate_phase_speeds; also used so that an
# empty result still exposes every column downstream code indexes into.
_PHASE_SPEED_COLUMNS = [
    'artifact_id',
    'early_speed',
    'middle_speed',
    'late_speed',
    'early_to_middle_change',
    'middle_to_late_change',
    'early_count',
    'middle_count',
    'late_count',
]


def _phase_speed(phase):
    """Return releases per day inside one phase; 0 when the phase has fewer than 2 releases."""
    if len(phase) < 2:
        return 0  # No meaningful speed for empty or single-release phases
    timestamps = phase['release_timestamp']
    span_days = (timestamps.max() - timestamps.min()).total_seconds() / (24 * 3600)
    # Spans shorter than one day are treated as one day (also avoids division by zero)
    return len(phase) / max(span_days, 1)


def calculate_phase_speeds(df, min_releases=6, early_cutoff=0.25, middle_cutoff=0.75):
    """
    Calculate speed metrics for each phase of a project's lifecycle.

    Each artifact's releases are ordered by 'release_timestamp' and split by
    release count into an early, a middle and a late phase. The speed of a
    phase is (number of releases in the phase) / (days between the first and
    last release of that phase); the time between two adjacent phases is not
    part of any phase's span.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'artifact_id' and 'release_timestamp'
        (datetime-like, so that subtraction yields a Timedelta).
    min_releases : int, default 6
        Artifacts with fewer releases than this are skipped.
    early_cutoff : float, default 0.25
        Fraction of the releases (rounded down) assigned to the early phase.
    middle_cutoff : float, default 0.75
        Fraction of the releases (rounded down) at which the middle phase ends;
        the remaining releases form the late phase.

    Returns
    -------
    pandas.DataFrame
        One row per retained artifact with the phase speeds, the two speed
        differences and the per-phase release counts. The columns are always
        present, even when no artifact qualifies.

    Raises
    ------
    ValueError
        If the cutoffs do not satisfy 0 <= early_cutoff <= middle_cutoff <= 1.
    """
    if not 0 <= early_cutoff <= middle_cutoff <= 1:
        raise ValueError("cutoffs must satisfy 0 <= early_cutoff <= middle_cutoff <= 1")

    phase_results = []
    # Sort once up front: groupby keeps the row order inside each group, so every
    # group arrives already ordered by release time.
    ordered = df.sort_values('release_timestamp')
    for artifact_id, group in ordered.groupby('artifact_id'):
        n_releases = len(group)
        if n_releases < min_releases:
            # Skip projects with too few releases to segment
            continue

        # Positional split points: early | middle | late
        split_early = int(n_releases * early_cutoff)
        split_middle = int(n_releases * middle_cutoff)

        early_phase = group.iloc[:split_early]
        middle_phase = group.iloc[split_early:split_middle]
        late_phase = group.iloc[split_middle:]

        early_speed = _phase_speed(early_phase)
        middle_speed = _phase_speed(middle_phase)
        late_speed = _phase_speed(late_phase)

        # Append results, including phase-to-phase changes
        phase_results.append({
            'artifact_id': artifact_id,
            'early_speed': early_speed,
            'middle_speed': middle_speed,
            'late_speed': late_speed,
            'early_to_middle_change': middle_speed - early_speed,
            'middle_to_late_change': late_speed - middle_speed,
            'early_count': len(early_phase),
            'middle_count': len(middle_phase),
            'late_count': len(late_phase),
        })

    return pd.DataFrame(phase_results, columns=_PHASE_SPEED_COLUMNS)

# Step 2: Apply the function to the abandoned-projects release history
# (only abandoned_df is passed here; no active-project frame is processed in this cell)
phase_speeds = calculate_phase_speeds(abandoned_df)

phase_speeds

Unnamed: 0,artifact_id,early_speed,middle_speed,late_speed,early_to_middle_change,middle_to_late_change,early_count,middle_count,late_count
0,ae.teletronics.nlp:categorisation,0.000000,0.017120,0.030399,0.017120,0.013279,1,3,2
1,ae.teletronics.nlp:language-detector,0.000000,0.188030,1.898609,0.188030,1.710579,1,3,2
2,aero.t2s:mode-s,2.000000,0.009629,1.071841,-1.990371,1.062212,2,4,2
3,ai.active:webhook-sdk,0.068435,0.020388,0.020817,-0.048048,0.000430,2,4,2
4,ai.api.libai.speech:libai-speech-gcp,0.000000,0.032569,0.091088,0.032569,0.058519,1,4,2
...,...,...,...,...,...,...,...,...,...
100530,zone.gryphon:base-bom,0.254211,0.318985,0.128054,0.064773,-0.190931,15,30,16
100531,zone.gryphon:base-pom,0.713423,0.332931,0.155041,-0.380492,-0.177890,20,40,20
100532,zone.gryphon:core-poms,0.570294,0.268954,1.325136,-0.301340,1.056182,11,23,12
100533,zone.refactor.spring:hateoas,0.000000,0.426638,2.000000,0.426638,1.573362,1,3,2


In [15]:
# # Step 1: Calculate speed change
# phase_speeds['speed_change'] = phase_speeds['middle_speed'] - phase_speeds['late_speed']

# # Step 2: Filter artifacts with a slowdown
# # Define a slowdown threshold (e.g., speed change > 0)
# slowed_down_artifacts = phase_speeds[phase_speeds['speed_change'] > 0]

# # Optional: Add threshold to detect significant slowdowns
# threshold = 0.01  # Example: speed decreased by at least 0.01
# significantly_slowed_down = phase_speeds[phase_speeds['speed_change'] > threshold]

# significantly_slowed_down

In [20]:
# def categorize_artifacts(phase_speeds):
#     """
#     Categorize artifacts into lifecycle categories based on phase speeds.
#     Categories:
#         1. Slowed Down in the End: late_speed < 0.5 * middle_speed
#         2. Long Gaps in Between: middle_speed < 0.05
#         3. Normal: Speeds vary by less than 20% across phases.
#     """
#     def assign_category(row):
#         if row['late_speed'] < 0.5 * row['middle_speed']:
#             return 'Slowed Down in the End'
#         elif row['middle_speed'] < 0.05:
#             return 'Long Gaps in Between'
#         elif (abs(row['early_speed'] - row['middle_speed']) / max(row['middle_speed'], 1e-10) < 0.2 and
#               abs(row['middle_speed'] - row['late_speed']) / max(row['middle_speed'], 1e-10) < 0.2):
#             return 'Normal'
#         else:
#             return 'Other'

#     # Apply categorization
#     phase_speeds['category'] = phase_speeds.apply(assign_category, axis=1)
#     return phase_speeds
def categorize_artifacts(phase_speeds):
    """
    Label every artifact with a lifecycle category derived from its phase speeds.

    Rules, evaluated in this order (the first matching rule wins):
        1. 'Slowed Down in the End': late_speed < 0.5 * middle_speed
        2. 'Long Gaps in Between': middle_speed < 0.5 * early_speed
           and middle_speed < 0.5 * late_speed
        3. 'Normal': every remaining artifact (there is no separate 'Other'
           category and no 20% tolerance check).

    Parameters
    ----------
    phase_speeds : pandas.DataFrame
        Must contain the columns 'early_speed', 'middle_speed' and 'late_speed'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: a 'category' column is added (or
        overwritten) in place and the frame is returned for convenience.
    """
    early = phase_speeds['early_speed']
    middle = phase_speeds['middle_speed']
    late = phase_speeds['late_speed']

    # Boolean masks for the two non-default categories
    slowed_down = (late < 0.5 * middle).to_numpy()
    long_gaps = ((middle < 0.5 * early) & (middle < 0.5 * late)).to_numpy()

    # np.select picks the first true condition per row, mirroring the if/elif order;
    # it also works on an empty frame, where a row-wise apply does not yield a column.
    phase_speeds['category'] = np.select(
        [slowed_down, long_gaps],
        ['Slowed Down in the End', 'Long Gaps in Between'],
        default='Normal',
    )
    return phase_speeds

# Apply the function
# (categorize_artifacts writes the 'category' column into phase_speeds itself and returns
# that same object, so categorized_artifacts and phase_speeds refer to one DataFrame)
categorized_artifacts = categorize_artifacts(phase_speeds)

# Display the categorized DataFrame
categorized_artifacts.head()


Unnamed: 0,artifact_id,early_speed,middle_speed,late_speed,early_to_middle_change,middle_to_late_change,early_count,middle_count,late_count,category
0,ae.teletronics.nlp:categorisation,0.0,0.01712,0.030399,0.01712,0.013279,1,3,2,Normal
1,ae.teletronics.nlp:language-detector,0.0,0.18803,1.898609,0.18803,1.710579,1,3,2,Normal
2,aero.t2s:mode-s,2.0,0.009629,1.071841,-1.990371,1.062212,2,4,2,Long Gaps in Between
3,ai.active:webhook-sdk,0.068435,0.020388,0.020817,-0.048048,0.00043,2,4,2,Normal
4,ai.api.libai.speech:libai-speech-gcp,0.0,0.032569,0.091088,0.032569,0.058519,1,4,2,Normal


In [21]:
categorized_artifacts.tail()


Unnamed: 0,artifact_id,early_speed,middle_speed,late_speed,early_to_middle_change,middle_to_late_change,early_count,middle_count,late_count,category
100530,zone.gryphon:base-bom,0.254211,0.318985,0.128054,0.064773,-0.190931,15,30,16,Slowed Down in the End
100531,zone.gryphon:base-pom,0.713423,0.332931,0.155041,-0.380492,-0.17789,20,40,20,Slowed Down in the End
100532,zone.gryphon:core-poms,0.570294,0.268954,1.325136,-0.30134,1.056182,11,23,12,Long Gaps in Between
100533,zone.refactor.spring:hateoas,0.0,0.426638,2.0,0.426638,1.573362,1,3,2,Normal
100534,zone.wmj:user-agent-util,0.145044,0.056644,0.034867,-0.088401,-0.021777,3,6,4,Normal


In [22]:
categorized_artifacts['category'].value_counts(normalize=True)
#abandoned

category
Normal                    0.559865
Slowed Down in the End    0.363585
Long Gaps in Between      0.076550
Name: proportion, dtype: float64

In [47]:
categorized_artifacts['artifact_id'].nunique()

134690

In [50]:
# Calculate value counts
category_counts = categorized_artifacts['category'].value_counts()

# Normalize to get proportions
category_proportions = category_counts / category_counts.sum()

# Display proportions with proper summation
print(category_proportions)


category
Normal                    0.477645
Slowed Down in the End    0.465216
Long Gaps in Between      0.057139
Name: count, dtype: float64


In [24]:
# NOTE(review): the saved output below still lists an 'Other' category, which the current
# categorize_artifacts never returns — the output predates the current definition and
# should be regenerated with Restart & Run All.
categorized_artifacts['category'].value_counts()

category
Long Gaps in Between      59085
Slowed Down in the End    48797
Other                     25328
Normal                     1480
Name: count, dtype: int64

In [54]:
# NOTE(review): the saved output below contains artifacts with fewer than 6 releases
# (e.g. phase counts 0/2/1), which the current calculate_phase_speeds skips — stale output
# from an earlier run of the notebook.
categorized_artifacts

Unnamed: 0,artifact_id,early_speed,middle_speed,late_speed,early_to_middle_change,middle_to_late_change,early_count,middle_count,late_count,category
0,academy.compose.companion:multi-fab,0.000000,1.006032,0.000000,1.006032,-1.006032,0,2,1,Slowed Down in the End
1,ae.teletronics.ejabberd:EjabberdXMLRPCClient,0.000000,1.974812,0.000000,1.974812,-1.974812,1,2,1,Slowed Down in the End
2,ae.teletronics.nlp:categorisation,0.000000,0.017120,0.030399,0.017120,0.013279,1,3,2,Normal
3,ae.teletronics.nlp:entityextraction,0.000000,0.685839,0.079681,0.685839,-0.606159,1,2,2,Slowed Down in the End
4,ae.teletronics.nlp:language-detector,0.000000,0.188030,1.898609,0.188030,1.710579,1,3,2,Normal
...,...,...,...,...,...,...,...,...,...,...
206763,zone.refactor.spring:hateoas,0.000000,0.426638,2.000000,0.426638,1.573362,1,3,2,Normal
206764,zone.refactor.spring:validation,0.000000,1.930381,2.000000,1.930381,0.069619,1,2,2,Normal
206765,zone.stefan.dev:geocode,0.000000,0.000000,0.000000,0.000000,0.000000,0,1,1,Normal
206766,zone.wmj:user-agent-util,0.145044,0.056644,0.034867,-0.088401,-0.021777,3,6,4,Normal


In [17]:
#test = 'com.gaborpihaj:mtg4s-inventory_2.13'
# Spot-check one artifact: pull its releases and list them in chronological order
artifact_id_to_inspect = "zone.gryphon:base-pom"
test = (
    abandoned_df
    .loc[abandoned_df['artifact_id'] == artifact_id_to_inspect]
    .sort_values(by='release_timestamp')
    .reset_index(drop=True)
)
test

Unnamed: 0,artifact_id,release_id,release_version,release_timestamp,interval,is_abandoned,label,time_gap
0,zone.gryphon:base-pom,zone.gryphon:base-pom:0.1,0.1,2019-06-08 23:27:37,,1,Start,
1,zone.gryphon:base-pom,zone.gryphon:base-pom:0.2,0.2,2019-06-11 05:27:57,2.250231,1,Active,2 days 06:00:20
2,zone.gryphon:base-pom,zone.gryphon:base-pom:0.3,0.3,2019-06-11 05:38:07,0.007060,1,Active,0 days 00:10:10
3,zone.gryphon:base-pom,zone.gryphon:base-pom:0.4,0.4,2019-06-11 05:43:26,0.003692,1,Active,0 days 00:05:19
4,zone.gryphon:base-pom,zone.gryphon:base-pom:0.5,0.5,2019-06-12 05:36:30,0.995185,1,Active,0 days 23:53:04
...,...,...,...,...,...,...,...,...
75,zone.gryphon:base-pom,zone.gryphon:base-pom:0.20.89-f25271d,0.20.89-f25271d,2019-11-14 07:26:04,0.034583,1,Active,0 days 00:49:48
76,zone.gryphon:base-pom,zone.gryphon:base-pom:0.20.92-0adb93a,0.20.92-0adb93a,2020-02-19 06:34:25,96.964132,1,Active,96 days 23:08:21
77,zone.gryphon:base-pom,zone.gryphon:base-pom:0.20.93-c220945,0.20.93-c220945,2020-02-19 06:38:05,0.002546,1,Active,0 days 00:03:40
78,zone.gryphon:base-pom,zone.gryphon:base-pom:0.20.94-f7d782a,0.20.94-f7d782a,2020-02-26 06:44:29,7.004444,1,Active,7 days 00:06:24


In [103]:

# Make sure the release timestamps are datetimes before aggregating
abandoned_df['release_timestamp'] = pd.to_datetime(abandoned_df['release_timestamp'])

# One row per artifact: earliest release, latest release and the number of
# (non-null) release ids
artifact_stats = (
    abandoned_df
    .groupby('artifact_id')
    .agg(
        first_release=pd.NamedAgg(column='release_timestamp', aggfunc='min'),
        last_release=pd.NamedAgg(column='release_timestamp', aggfunc='max'),
        total_releases=pd.NamedAgg(column='release_id', aggfunc='count'),
    )
    .reset_index()
)


In [104]:
artifact_stats

Unnamed: 0,artifact_id,first_release,last_release,total_releases
0,academy.compose.companion:multi-fab,2021-01-05 21:01:38,2021-01-14 06:40:25,3
1,ae.teletronics.ejabberd:EjabberdXMLRPCClient,2016-10-27 04:48:11,2017-03-14 08:38:12,4
2,ae.teletronics.nlp:categorisation,2016-04-11 20:23:38,2017-01-05 08:41:51,6
3,ae.teletronics.nlp:entityextraction,2016-06-15 06:49:12,2016-09-30 12:52:02,5
4,ae.teletronics.nlp:language-detector,2016-02-16 10:54:43,2016-04-26 06:41:12,6
...,...,...,...,...
206763,zone.refactor.spring:hateoas,2019-10-22 05:17:14,2019-12-03 07:59:12,6
206764,zone.refactor.spring:validation,2019-10-26 09:03:16,2019-11-26 20:53:27,5
206765,zone.stefan.dev:geocode,2021-01-08 15:30:37,2021-01-08 17:44:45,2
206766,zone.wmj:user-agent-util,2021-09-10 17:26:29,2022-06-16 07:45:05,13


In [105]:
# Lifetime of each artifact: the whole-day component of (last_release - first_release)
artifact_stats['duration (days)'] = (
    artifact_stats['last_release'].sub(artifact_stats['first_release']).dt.days
)


In [106]:
artifact_stats


Unnamed: 0,artifact_id,first_release,last_release,total_releases,duration (days)
0,academy.compose.companion:multi-fab,2021-01-05 21:01:38,2021-01-14 06:40:25,3,8
1,ae.teletronics.ejabberd:EjabberdXMLRPCClient,2016-10-27 04:48:11,2017-03-14 08:38:12,4,138
2,ae.teletronics.nlp:categorisation,2016-04-11 20:23:38,2017-01-05 08:41:51,6,268
3,ae.teletronics.nlp:entityextraction,2016-06-15 06:49:12,2016-09-30 12:52:02,5,107
4,ae.teletronics.nlp:language-detector,2016-02-16 10:54:43,2016-04-26 06:41:12,6,69
...,...,...,...,...,...
206763,zone.refactor.spring:hateoas,2019-10-22 05:17:14,2019-12-03 07:59:12,6,42
206764,zone.refactor.spring:validation,2019-10-26 09:03:16,2019-11-26 20:53:27,5,31
206765,zone.stefan.dev:geocode,2021-01-08 15:30:37,2021-01-08 17:44:45,2,0
206766,zone.wmj:user-agent-util,2021-09-10 17:26:29,2022-06-16 07:45:05,13,278


In [107]:
artifact_stats.describe()

Unnamed: 0,first_release,last_release,total_releases,duration (days)
count,206768,206768,206768.0,206768.0
mean,2018-03-17 04:00:32.318405120,2019-07-02 21:50:25.871648512,14.001886,472.249434
min,2014-09-04 00:03:06,2014-09-04 20:14:58,2.0,0.0
25%,2016-03-09 06:59:16.750000128,2017-10-29 10:53:52.500000,3.0,53.0
50%,2018-02-16 17:30:56,2019-10-07 10:48:41,5.0,268.0
75%,2020-02-29 02:02:51.500000,2021-05-14 20:29:46,12.0,693.0
max,2022-09-03 13:37:49,2022-09-03 23:09:10,2173.0,2908.0
std,,,39.767212,558.317379


In [108]:
# Bucket each artifact's lifetime in days. pd.cut builds right-closed intervals by
# default, so the buckets are (-1, 365], (365, 730] and (730, inf); the -1 lower edge
# keeps 0-day lifetimes inside the first bucket.
duration_bins = [-1, 365, 730, np.inf]  # 1 year = 365 days, 2 years = 730 days
duration_labels = ['<1 year', '1-2 years', '>2 years']
artifact_stats['duration_category'] = pd.cut(
    artifact_stats['duration (days)'],
    bins=duration_bins,
    labels=duration_labels,
)

# Number of artifacts that fall into each lifetime bucket
duration_summary = artifact_stats['duration_category'].value_counts()


In [109]:
duration_summary

duration_category
<1 year      120146
>2 years      48354
1-2 years     38268
Name: count, dtype: int64

In [110]:
# # Define total release categories
# release_bins = [0, 50, 100, 300, np.inf]  # Bins for total releases
# release_labels = ['<50', '50-100', '100-300', '>300']

# # Categorize total releases
# artifact_stats['release_category'] = pd.cut(
#     artifact_stats['total_releases'], 
#     bins=release_bins, 
#     labels=release_labels
# )

# # Combine duration and release categories
# artifact_stats['combined_category'] = artifact_stats['duration_category'].astype(str) + ", " + artifact_stats['release_category'].astype(str)
# artifact_stats

# Bins and labels for the total number of releases.
# NOTE(review): pd.cut builds right-closed intervals by default, so these bins are
# (0, 2], (2, 5], (5, 20] and (20, inf) — the '<2' label therefore also covers artifacts
# with exactly 2 releases, and '2-5' starts at 3.
release_bins = [0, 2, 5, 20, np.inf]
release_labels = ['<2', '2-5', '5-20', '>20']

# Assign every artifact to its release-count bucket
artifact_stats['release_category'] = pd.cut(
    artifact_stats['total_releases'],
    bins=release_bins,
    labels=release_labels,
)

# Stratum key "<duration bucket>, <release bucket>" as plain text
artifact_stats['combined_category'] = (
    artifact_stats['duration_category']
    .astype(str)
    .str.cat(artifact_stats['release_category'].astype(str), sep=", ")
)

# Show the frame with the two new columns
artifact_stats


Unnamed: 0,artifact_id,first_release,last_release,total_releases,duration (days),duration_category,release_category,combined_category
0,academy.compose.companion:multi-fab,2021-01-05 21:01:38,2021-01-14 06:40:25,3,8,<1 year,2-5,"<1 year, 2-5"
1,ae.teletronics.ejabberd:EjabberdXMLRPCClient,2016-10-27 04:48:11,2017-03-14 08:38:12,4,138,<1 year,2-5,"<1 year, 2-5"
2,ae.teletronics.nlp:categorisation,2016-04-11 20:23:38,2017-01-05 08:41:51,6,268,<1 year,5-20,"<1 year, 5-20"
3,ae.teletronics.nlp:entityextraction,2016-06-15 06:49:12,2016-09-30 12:52:02,5,107,<1 year,2-5,"<1 year, 2-5"
4,ae.teletronics.nlp:language-detector,2016-02-16 10:54:43,2016-04-26 06:41:12,6,69,<1 year,5-20,"<1 year, 5-20"
...,...,...,...,...,...,...,...,...
206763,zone.refactor.spring:hateoas,2019-10-22 05:17:14,2019-12-03 07:59:12,6,42,<1 year,5-20,"<1 year, 5-20"
206764,zone.refactor.spring:validation,2019-10-26 09:03:16,2019-11-26 20:53:27,5,31,<1 year,2-5,"<1 year, 2-5"
206765,zone.stefan.dev:geocode,2021-01-08 15:30:37,2021-01-08 17:44:45,2,0,<1 year,<2,"<1 year, <2"
206766,zone.wmj:user-agent-util,2021-09-10 17:26:29,2022-06-16 07:45:05,13,278,<1 year,5-20,"<1 year, 5-20"


In [111]:
# Contingency table: lifetime bucket (rows) x release-count bucket (columns)
cross_table = pd.crosstab(
    index=artifact_stats['duration_category'],
    columns=artifact_stats['release_category'],
)
cross_table

release_category,<2,2-5,5-20,>20
duration_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
<1 year,38817,44104,31995,5230
1-2 years,3298,9836,17918,7216
>2 years,2117,8061,20707,17469


In [112]:
np.random.seed(42)


In [113]:
# Stratified sample: 50 artifacts from every (duration bucket, release bucket) cell;
# a cell with fewer than 50 artifacts is kept whole. Each cell is sampled with the
# same fixed random_state, so the draw is reproducible regardless of np.random.seed.
#
# Both grouping keys are categorical. observed=False is spelled out so the grouping
# keeps its current meaning instead of depending on the library default, which pandas
# has announced will change (presumably one of the warnings in the saved output below).
sampled_data = artifact_stats.groupby(
    ['duration_category', 'release_category'],
    observed=False,
).apply(
    lambda x: x.sample(n=50, replace=False, random_state=42) if len(x) >= 50 else x
).reset_index(drop=True)

# Display sampled data
sampled_data

  sampled_data = artifact_stats.groupby(['duration_category', 'release_category']).apply(
  sampled_data = artifact_stats.groupby(['duration_category', 'release_category']).apply(


Unnamed: 0,artifact_id,first_release,last_release,total_releases,duration (days),duration_category,release_category,combined_category
0,com.gaborpihaj:mtg4s-inventory_2.13,2020-08-29 11:09:51,2020-09-02 20:13:01,2,4,<1 year,<2,"<1 year, <2"
1,com.github.frimtec:import-control-demo,2020-11-01 15:18:57,2020-12-19 09:12:34,2,47,<1 year,<2,"<1 year, <2"
2,com.acidmanic:installation,2020-04-11 08:46:29,2020-04-12 12:55:10,2,1,<1 year,<2,"<1 year, <2"
3,com.cloudimpl:error-lib,2020-07-16 23:06:43,2020-07-17 12:15:35,2,0,<1 year,<2,"<1 year, <2"
4,com.github.nhojpatrick.versions:nhojpatrick-ve...,2019-08-19 21:26:00,2019-09-08 20:28:57,2,19,<1 year,<2,"<1 year, <2"
...,...,...,...,...,...,...,...,...
595,com.cerner.beadledom:beadledom-parent,2017-01-27 23:05:32,2020-12-09 23:21:17,25,1412,>2 years,>20,">2 years, >20"
596,com.kumuluz.ee:kumuluzee-core,2015-05-27 05:25:02,2022-06-21 09:16:59,46,2582,>2 years,>20,">2 years, >20"
597,org.kuali.rice:rice-sampleapp,2014-09-23 22:36:16,2018-02-15 16:56:07,37,1240,>2 years,>20,">2 years, >20"
598,io.ktor:ktor-metrics-kotlinMultiplatform,2019-01-24 20:54:45,2022-03-14 14:01:27,36,1144,>2 years,>20,">2 years, >20"


In [114]:
sampled_data['combined_category'].value_counts()

combined_category
<1 year, <2        50
<1 year, 2-5       50
<1 year, 5-20      50
<1 year, >20       50
1-2 years, <2      50
1-2 years, 2-5     50
1-2 years, 5-20    50
1-2 years, >20     50
>2 years, <2       50
>2 years, 2-5      50
>2 years, 5-20     50
>2 years, >20      50
Name: count, dtype: int64

In [115]:
# Extract sampled artifact IDs
sampled_artifact_ids = sampled_data['artifact_id'].unique()

In [116]:
# Keep only the releases of the sampled artifacts (copy, so new columns can be added safely)
filtered_release_history = abandoned_df.loc[
    abandoned_df['artifact_id'].isin(sampled_artifact_ids)
].copy()

# Attach each artifact's earliest release timestamp to all of its rows
filtered_release_history['first_release'] = (
    filtered_release_history
    .groupby('artifact_id')['release_timestamp']
    .transform('min')
)

# Age of every release relative to that first release, in whole days
filtered_release_history['days_since_first_release'] = (
    filtered_release_history['release_timestamp']
    .sub(filtered_release_history['first_release'])
    .dt.days
)


In [117]:
# import matplotlib.pyplot as plt

# # Iterate through each artifact and create separate plots
# for artifact_id in sampled_artifact_ids:
#     # Create a separate DataFrame for this artifact
#     artifact_df = filtered_release_history[filtered_release_history['artifact_id'] == artifact_id]
    
#     # Sort by days since first release
#     artifact_df = artifact_df.sort_values('days_since_first_release')
    
#     # Plot release history for this artifact
#     plt.figure(figsize=(10, 6))
#     plt.step(
#         artifact_df['days_since_first_release'], 
#         range(1, len(artifact_df) + 1),  # Cumulative number of releases
#         label=f"Artifact: {artifact_id}",
#         alpha=0.7
#     )
    
#     # Customize the plot
#     plt.title(f'Release History for {artifact_id}', fontsize=14)
#     plt.xlabel('Days Since First Release', fontsize=12)
#     plt.ylabel('Number of Releases', fontsize=12)
#     plt.legend(loc='upper left', fontsize='small', frameon=False)
#     plt.tight_layout()
#     plt.show()

In [118]:
# import matplotlib.pyplot as plt

# # Group sampled_data by combined_category
# for category, group in sampled_data.groupby('combined_category'):
#     # Get artifact IDs for this category
#     artifact_ids_in_category = group['artifact_id'].unique()
    
#     # Filter release history for these artifact IDs
#     category_release_history = filtered_release_history[filtered_release_history['artifact_id'].isin(artifact_ids_in_category)]
    
#     # Plot release history for each artifact in this category
#     fig, ax = plt.subplots(figsize=(12, 8))
#     for artifact_id, data in category_release_history.groupby('artifact_id'):
#         data = data.sort_values('days_since_first_release')
#         ax.step(
#             data['days_since_first_release'], 
#             range(1, len(data) + 1),  # Cumulative number of releases
#             label=f"{artifact_id}",
#             alpha=0.7
#         )
    
#     # Customize plot
#     ax.set_title(f'Release History for {category}', fontsize=14)
#     ax.set_xlabel('Days Since First Release', fontsize=12)
#     ax.set_ylabel('Number of Releases', fontsize=12)
#     ax.legend(loc='upper left', fontsize='small', ncol=2, frameon=False, title='Artifacts')
#     plt.tight_layout()
#     plt.show()


In [119]:
# # Step 1: Group sampled_data by combined_category
# categories = sampled_data['combined_category'].unique()

# # Step 2: Iterate over each combined_category
# for category in categories:
#     # Filter artifacts belonging to the current category
#     category_artifacts = sampled_data[sampled_data['combined_category'] == category]['artifact_id'].unique()
    
#     # Filter release history for these artifacts
#     category_release_history = abandoned_df[abandoned_df['artifact_id'].isin(category_artifacts)].copy()
    
#     # Calculate days since first release for these artifacts
#     category_release_history['first_release'] = category_release_history.groupby('artifact_id')['release_timestamp'].transform('min')
#     category_release_history['days_since_first_release'] = (
#         category_release_history['release_timestamp'] - category_release_history['first_release']
#     ).dt.days

#     # Step 3: Create separate plots for each artifact within this category
#     print(f"Generating plots for combined category: {category}")
    
#     for artifact_id in category_artifacts:
#         # Create a DataFrame for this artifact
#         artifact_df = category_release_history[category_release_history['artifact_id'] == artifact_id]
        
#         # Sort by days since first release
#         artifact_df = artifact_df.sort_values('days_since_first_release')
        
#         # Plot release history for this artifact
#         plt.figure(figsize=(10, 6))
#         plt.step(
#             artifact_df['days_since_first_release'], 
#             range(1, len(artifact_df) + 1),  # Cumulative number of releases
#             label=f"Artifact: {artifact_id}",
#             alpha=0.7
#         )
        
#         # Customize the plot
#         plt.title(f'Release History for {artifact_id} in {category}', fontsize=14)
#         plt.xlabel('Days Since First Release', fontsize=12)
#         plt.ylabel('Number of Releases', fontsize=12)
#         plt.legend(loc='upper left', fontsize='small', frameon=False)
#         plt.tight_layout()
#         plt.show()


In [120]:
# import os

# np.random.seed(42)

# # Step 2: Create output directory for plots
# output_dir = "/Users/kaziamithasan/Desktop/Research/msr25/msr25-mining-challenge/code/release/release_history_plots_abandoned"
# os.makedirs(output_dir, exist_ok=True)

# # Step 3: Group sampled_data by combined_category
# categories = sampled_data['combined_category'].unique()

# # Metadata for manual analysis
# manual_check_metadata = []

# for category in categories:
#     # Create a subdirectory for the category
#     category_dir = os.path.join(output_dir, category.replace(",", "_").replace(" ", "_"))
#     os.makedirs(category_dir, exist_ok=True)
    
#     # Filter artifacts belonging to the current category
#     category_artifacts = sampled_data[sampled_data['combined_category'] == category]['artifact_id'].unique()
    
#     # Filter release history for these artifacts
#     category_release_history = abandoned_df[abandoned_df['artifact_id'].isin(category_artifacts)].copy()
    
#     # Calculate days since first release for these artifacts
#     category_release_history['first_release'] = category_release_history.groupby('artifact_id')['release_timestamp'].transform('min')
#     category_release_history['days_since_first_release'] = (
#         category_release_history['release_timestamp'] - category_release_history['first_release']
#     ).dt.days

#     # Step 4: Create separate plots for each artifact within this category
#     print(f"Generating plots for combined category: {category}")
    
#     for artifact_id in category_artifacts:
#         # Create a DataFrame for this artifact
#         artifact_df = category_release_history[category_release_history['artifact_id'] == artifact_id]
        
#         # Sort by days since first release
#         artifact_df = artifact_df.sort_values('days_since_first_release')
        
#         # Plot release history for this artifact
#         plt.figure(figsize=(10, 6))
#         plt.step(
#             artifact_df['days_since_first_release'], 
#             range(1, len(artifact_df) + 1),  # Cumulative number of releases
#             label=f"Artifact: {artifact_id}",
#             alpha=0.7
#         )
        
#         # Customize the plot
#         plt.title(f'Release History for {artifact_id} in {category}', fontsize=14)
#         plt.xlabel('Days Since First Release', fontsize=12)
#         plt.ylabel('Number of Releases', fontsize=12)
#         plt.legend(loc='upper left', fontsize='small', frameon=False)
#         plt.tight_layout()
        
#         # Save the plot to the category folder
#         plot_filename = f"{artifact_id.replace(':', '_')}_release_history.png"
#         plot_path = os.path.join(category_dir, plot_filename)
#         plt.savefig(plot_path)
#         plt.close()
        
#         # Append metadata for manual analysis
#         manual_check_metadata.append({
#             "artifact_id": artifact_id,
#             "combined_category": category,
#             "plot_filename": plot_path
#         })

# # Step 5: Save metadata to a CSV file
# import pandas as pd

# metadata_df = pd.DataFrame(manual_check_metadata)
# metadata_file = os.path.join(output_dir, "manual_check_metadata.csv")
# metadata_df.to_csv(metadata_file, index=False)

# print(f"Plots and metadata saved to '{output_dir}' for manual analysis.")


In [121]:
# NOTE(review): in the visible part of this notebook, metadata_df is only assigned inside the
# commented-out cell above, so on a fresh kernel this cell presumably raises NameError —
# confirm where metadata_df is defined and move this display after that cell.
metadata_df

Unnamed: 0,artifact_id,combined_category,plot_filename
0,com.gaborpihaj:mtg4s-inventory_2.13,"<1 year, <2",/Users/kaziamithasan/Desktop/Research/msr25/ms...
1,com.github.frimtec:import-control-demo,"<1 year, <2",/Users/kaziamithasan/Desktop/Research/msr25/ms...
2,com.acidmanic:installation,"<1 year, <2",/Users/kaziamithasan/Desktop/Research/msr25/ms...
3,com.cloudimpl:error-lib,"<1 year, <2",/Users/kaziamithasan/Desktop/Research/msr25/ms...
4,com.github.nhojpatrick.versions:nhojpatrick-ve...,"<1 year, <2",/Users/kaziamithasan/Desktop/Research/msr25/ms...
...,...,...,...
595,com.cerner.beadledom:beadledom-parent,">2 years, >20",/Users/kaziamithasan/Desktop/Research/msr25/ms...
596,com.kumuluz.ee:kumuluzee-core,">2 years, >20",/Users/kaziamithasan/Desktop/Research/msr25/ms...
597,org.kuali.rice:rice-sampleapp,">2 years, >20",/Users/kaziamithasan/Desktop/Research/msr25/ms...
598,io.ktor:ktor-metrics-kotlinMultiplatform,">2 years, >20",/Users/kaziamithasan/Desktop/Research/msr25/ms...


In [122]:
# NOTE(review): category_release_history is only assigned in commented-out cells above and in
# the plotting loop of the cell below; the saved output has a 'days_since_first_release' column,
# which matches the commented-out code rather than the cell below — stale/hidden state, re-run in order.
category_release_history

Unnamed: 0,artifact_id,release_id,release_version,release_timestamp,interval,is_abandoned,label,time_gap,first_release,days_since_first_release
3583,al.bluecryst:bluecrystal.deps.bc_g3,al.bluecryst:bluecrystal.deps.bc_g3:1.4.1,1.4.1,2016-06-02 11:35:48,,1,Start,,2016-06-02 11:35:48,0
3584,al.bluecryst:bluecrystal.deps.bc_g3,al.bluecryst:bluecrystal.deps.bc_g3:1.5.0,1.5.0,2016-06-18 19:50:21,16.343438,1,Active,16 days 08:14:33,2016-06-02 11:35:48,16
3585,al.bluecryst:bluecrystal.deps.bc_g3,al.bluecryst:bluecrystal.deps.bc_g3:1.5.1,1.5.1,2016-07-11 15:22:17,22.813843,1,Active,22 days 19:31:56,2016-06-02 11:35:48,39
3586,al.bluecryst:bluecrystal.deps.bc_g3,al.bluecryst:bluecrystal.deps.bc_g3:1.5.2,1.5.2,2016-07-11 16:53:42,0.063484,1,Active,0 days 01:31:25,2016-06-02 11:35:48,39
3587,al.bluecryst:bluecrystal.deps.bc_g3,al.bluecryst:bluecrystal.deps.bc_g3:1.5.3,1.5.3,2016-07-11 18:33:36,0.069375,1,Active,0 days 01:39:54,2016-06-02 11:35:48,39
...,...,...,...,...,...,...,...,...,...,...
2860045,software.amazon.awssdk:aws-ion-protocol,software.amazon.awssdk:aws-ion-protocol:2.16.100,2.16.100,2021-07-12 19:16:32,2.959294,1,Active,2 days 23:01:23,2018-11-13 23:02:15,971
2860046,software.amazon.awssdk:aws-ion-protocol,software.amazon.awssdk:aws-ion-protocol:2.16.101,2.16.101,2021-07-13 20:04:38,1.033403,1,Active,1 days 00:48:06,2018-11-13 23:02:15,972
2860047,software.amazon.awssdk:aws-ion-protocol,software.amazon.awssdk:aws-ion-protocol:2.16.102,2.16.102,2021-07-14 19:54:55,0.993252,1,Active,0 days 23:50:17,2018-11-13 23:02:15,973
2860048,software.amazon.awssdk:aws-ion-protocol,software.amazon.awssdk:aws-ion-protocol:2.16.103,2.16.103,2021-07-15 19:11:08,0.969595,1,Active,0 days 23:16:13,2018-11-13 23:02:15,974


In [123]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG. Nothing in this cell draws random numbers; the call
# is kept so the global RNG state seen by later cells is unchanged.
np.random.seed(42)

# Root directory that receives one sub-folder of plots per combined category.
output_dir = "/Users/kaziamithasan/Desktop/Research/msr25/msr25-mining-challenge/code/release/release_history_plots_abandoned_v2"
os.makedirs(output_dir, exist_ok=True)


def _save_release_history_plot(artifact_data, artifact_id, category, plot_path):
    """Plot cumulative release number against days since first release and save it.

    Parameters
    ----------
    artifact_data : pandas.DataFrame
        Rows of one artifact, already ordered by release time, carrying a
        'duration_since_first_release' column. May be empty, in which case an
        empty plot is still written.
    artifact_id : str
        Used for the legend entry and the title.
    category : str
        Combined category label shown in the title.
    plot_path : str
        Destination image path.
    """
    # Explicit Figure/Axes objects: no reliance on pyplot's "current figure".
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(
            artifact_data['duration_since_first_release'],
            range(1, len(artifact_data) + 1),  # Cumulative release count
            marker='o', linestyle='-', alpha=0.8, label=artifact_id
        )
        ax.set_xlabel('Duration Since First Release (Days)', fontsize=12)
        ax.set_ylabel('Release Number', fontsize=12)
        ax.set_title(f'Release History of {artifact_id} in {category}', fontsize=14)
        ax.grid(True)
        ax.legend(loc='upper left', fontsize='small', frameon=False)
        fig.tight_layout()
        fig.savefig(plot_path)
    finally:
        # Always release the figure, even if plotting or saving failed, so a
        # long loop cannot accumulate open figures.
        plt.close(fig)


# Distinct combined categories in the sample, in order of first appearance.
categories = sampled_data['combined_category'].unique()

# One record per saved plot; written out as a CSV index for manual analysis.
manual_check_metadata = []

for category in categories:
    # Sub-folder name: commas and spaces become underscores.
    # NOTE(review): labels such as '<1 year, <2' keep their '<' / '>' characters
    # in the folder name; those are not valid in Windows paths, so sanitize
    # further if this notebook has to run there.
    category_dir = os.path.join(output_dir, category.replace(",", "_").replace(" ", "_"))
    os.makedirs(category_dir, exist_ok=True)

    # Artifacts sampled for the current category
    category_artifacts = sampled_data[sampled_data['combined_category'] == category]['artifact_id'].unique()

    # Release history of those artifacts, ordered by time within each artifact
    category_release_history = abandoned_df[abandoned_df['artifact_id'].isin(category_artifacts)].copy()
    category_release_history = category_release_history.sort_values(by=['artifact_id', 'release_timestamp'])

    # Whole days elapsed since each artifact's earliest release
    category_release_history['first_release'] = category_release_history.groupby('artifact_id')['release_timestamp'].transform('min')
    category_release_history['duration_since_first_release'] = (
        category_release_history['release_timestamp'] - category_release_history['first_release']
    ).dt.days

    print(f"Generating plots for combined category: {category}")

    # Split the frame once instead of re-scanning it with a boolean mask for
    # every artifact. Row order inside each group is the sorted order above.
    history_by_artifact = {
        key: group
        for key, group in category_release_history.groupby('artifact_id', sort=False)
    }
    # Artifacts without any release rows still get an (empty) plot.
    no_history = category_release_history.iloc[0:0]

    for artifact_id in category_artifacts:
        artifact_data = history_by_artifact.get(artifact_id, no_history)

        plot_filename = f"{artifact_id.replace(':', '_')}_release_history.png"
        plot_path = os.path.join(category_dir, plot_filename)
        _save_release_history_plot(artifact_data, artifact_id, category, plot_path)

        # Record where the plot went so it can be reviewed manually.
        manual_check_metadata.append({
            "artifact_id": artifact_id,
            "combined_category": category,
            "plot_filename": plot_path
        })

# Save metadata to a CSV file. Explicit columns keep the header present even
# when no plot was produced.
metadata_df = pd.DataFrame(
    manual_check_metadata,
    columns=["artifact_id", "combined_category", "plot_filename"],
)
metadata_file = os.path.join(output_dir, "manual_check_metadata.csv")
metadata_df.to_csv(metadata_file, index=False)

print(f"Plots and metadata saved to '{output_dir}' for manual analysis.")


Generating plots for combined category: <1 year, <2
Generating plots for combined category: <1 year, 2-5
Generating plots for combined category: <1 year, 5-20
Generating plots for combined category: <1 year, >20
Generating plots for combined category: 1-2 years, <2
Generating plots for combined category: 1-2 years, 2-5
Generating plots for combined category: 1-2 years, 5-20
Generating plots for combined category: 1-2 years, >20
Generating plots for combined category: >2 years, <2
Generating plots for combined category: >2 years, 2-5
Generating plots for combined category: >2 years, 5-20
Generating plots for combined category: >2 years, >20
Plots and metadata saved to '/Users/kaziamithasan/Desktop/Research/msr25/msr25-mining-challenge/code/release/release_history_plots_abandoned_v2' for manual analysis.


In [124]:
# Show the per-plot metadata table (artifact, combined category, saved plot path).
metadata_df

Unnamed: 0,artifact_id,combined_category,plot_filename
0,com.gaborpihaj:mtg4s-inventory_2.13,"<1 year, <2",/Users/kaziamithasan/Desktop/Research/msr25/ms...
1,com.github.frimtec:import-control-demo,"<1 year, <2",/Users/kaziamithasan/Desktop/Research/msr25/ms...
2,com.acidmanic:installation,"<1 year, <2",/Users/kaziamithasan/Desktop/Research/msr25/ms...
3,com.cloudimpl:error-lib,"<1 year, <2",/Users/kaziamithasan/Desktop/Research/msr25/ms...
4,com.github.nhojpatrick.versions:nhojpatrick-ve...,"<1 year, <2",/Users/kaziamithasan/Desktop/Research/msr25/ms...
...,...,...,...
595,com.cerner.beadledom:beadledom-parent,">2 years, >20",/Users/kaziamithasan/Desktop/Research/msr25/ms...
596,com.kumuluz.ee:kumuluzee-core,">2 years, >20",/Users/kaziamithasan/Desktop/Research/msr25/ms...
597,org.kuali.rice:rice-sampleapp,">2 years, >20",/Users/kaziamithasan/Desktop/Research/msr25/ms...
598,io.ktor:ktor-metrics-kotlinMultiplatform,">2 years, >20",/Users/kaziamithasan/Desktop/Research/msr25/ms...
