In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)  
pd.set_option('display.max_columns', None)   
pd.set_option('display.width', None)         

artifacts = pd.read_csv("/Users/kaziamithasan/Desktop/Research/msr25/msr_fulldata/artifacts.csv")
artifact_release = pd.read_csv("/Users/kaziamithasan/Desktop/Research/msr25/msr_fulldata/artifact_release.csv")
release = pd.read_csv("/Users/kaziamithasan/Desktop/Research/msr25/msr_fulldata/releases.csv")
added_values = pd.read_csv("/Users/kaziamithasan/Desktop/Research/msr25/msr_fulldata/added_values.csv")
dependency = pd.read_csv("/Users/kaziamithasan/Desktop/Research/msr25/msr_fulldata/dependencies.csv")

In [2]:
release
release['release_timestamp'] = pd.to_datetime(release['release_timestamp'], unit='ms')
release

Unnamed: 0,release_id,release_version,release_timestamp
0,org.wso2.carbon.identity.framework:org.wso2.carbon.identity.cors.mgt.core:5.20.111,5.20.111,2021-07-13 03:50:42
1,org.apache.camel.quarkus:camel-quarkus-kotlin-parent:1.0.0-M4,1.0.0-M4,2020-03-03 12:52:23
2,org.apache.camel.quarkus:camel-quarkus-kotlin-parent:1.0.0-M3,1.0.0-M3,2020-01-24 10:17:09
3,org.wso2.carbon.identity.framework:org.wso2.carbon.identity.cors.mgt.core:5.20.113,5.20.113,2021-07-13 12:33:00
4,org.wso2.carbon.identity.framework:org.wso2.carbon.identity.cors.mgt.core:5.20.112,5.20.112,2021-07-13 10:09:05
...,...,...,...
14459134,aws.sdk.kotlin:apigateway-jvm:1.2.10,1.2.10,2024-05-10 18:58:50
14459135,io.quarkiverse.pact:quarkus-pact-parent:1.4.2,1.4.2,2024-06-19 09:05:05
14459136,org.http4k:http4k-connect-google-analytics-ga4-fake:5.21.0.0,5.21.0.0,2024-08-20 14:42:53
14459137,org.apache.servicecomb:servicecomb-governance:3.1.1,3.1.1,2024-05-11 08:28:38


In [3]:
# Merge artifacts with their releases
artifact_releases = pd.merge(artifact_release, release, on='release_id', how='inner')
artifact_releases = pd.merge(artifact_releases, artifacts[['artifact_id']], on='artifact_id', how='inner')


In [4]:
# Convert release_timestamp to datetime
artifact_releases['release_timestamp'] = pd.to_datetime(artifact_releases['release_timestamp'])


In [5]:
from datetime import datetime, timedelta

# 1. Identify "Unmaintained" Artifacts
# Calculate the current date and set a threshold (e.g., 2 years) for unmaintained classification
two_years_ago = datetime.now() - timedelta(days=2*365)  # Adjust threshold as needed


In [6]:
# Find the last release date for each artifact
unmaintained_artifacts = artifact_releases.groupby('artifact_id').agg(
    last_release=('release_timestamp', 'max')
).reset_index()
unmaintained_artifacts

Unnamed: 0,artifact_id,last_release
0,HTTPClient:HTTPClient,2005-08-01 09:17:55
1,abbot:abbot,2015-09-22 15:54:01
2,abbot:costello,2015-09-24 09:21:46
3,academy.alex:custommatcher,2018-05-31 18:57:25
4,academy.compose.companion:multi-fab,2021-01-14 06:40:25
...,...,...
634998,zone.src.sheaf:sheaf-parent,2023-08-21 09:29:38
634999,zone.src.sheaf:web-sheaf,2023-08-21 12:15:54
635000,zone.stefan.dev:geocode,2021-01-08 17:44:45
635001,zone.wmj:user-agent-util,2022-06-16 07:45:05


In [7]:
# Mark artifacts as unmaintained if the last release is older than the 2-year threshold
unmaintained_artifacts['unmaintained'] = unmaintained_artifacts['last_release'] < two_years_ago
unmaintained_artifacts

Unnamed: 0,artifact_id,last_release,unmaintained
0,HTTPClient:HTTPClient,2005-08-01 09:17:55,True
1,abbot:abbot,2015-09-22 15:54:01,True
2,abbot:costello,2015-09-24 09:21:46,True
3,academy.alex:custommatcher,2018-05-31 18:57:25,True
4,academy.compose.companion:multi-fab,2021-01-14 06:40:25,True
...,...,...,...
634998,zone.src.sheaf:sheaf-parent,2023-08-21 09:29:38,False
634999,zone.src.sheaf:web-sheaf,2023-08-21 12:15:54,False
635000,zone.stefan.dev:geocode,2021-01-08 17:44:45,True
635001,zone.wmj:user-agent-util,2022-06-16 07:45:05,True


In [8]:
# 2. Identify "Long-Time Inactive" Artifacts
# Sort data to calculate gaps between consecutive releases
artifact_releases = artifact_releases.sort_values(by=['artifact_id', 'release_timestamp'])
artifact_releases['prev_release'] = artifact_releases.groupby('artifact_id')['release_timestamp'].shift(1)


In [9]:
# Calculate gaps between consecutive releases
artifact_releases['gap_in_days'] = (artifact_releases['release_timestamp'] - artifact_releases['prev_release']).dt.days


In [10]:
# Set threshold for long-time inactivity (e.g., 1 year)
inactive_threshold = 365  # 1 year
artifact_releases['long_time_inactive'] = artifact_releases['gap_in_days'] > inactive_threshold


In [11]:
artifact_releases

Unnamed: 0,artifact_id,release_id,release_version,release_timestamp,prev_release,gap_in_days,long_time_inactive
13766825,HTTPClient:HTTPClient,HTTPClient:HTTPClient:0.3-3,0.3-3,2005-08-01 09:17:55,NaT,,False
13237448,abbot:abbot,abbot:abbot:0.12.3,0.12.3,2005-08-01 09:17:56,NaT,,False
13237447,abbot:abbot,abbot:abbot:0.13.0,0.13.0,2005-08-01 09:17:57,2005-08-01 09:17:56,0.0,False
13237446,abbot:abbot,abbot:abbot:1.4.0,1.4.0,2015-09-22 15:54:01,2005-08-01 09:17:57,3704.0,True
10918348,abbot:costello,abbot:costello:1.4.0,1.4.0,2015-09-24 09:21:46,NaT,,False
...,...,...,...,...,...,...,...
10399545,zone.wmj:user-agent-util,zone.wmj:user-agent-util:1.2.10,1.2.10,2022-04-23 16:16:18,2022-03-11 09:41:41,43.0,False
10399546,zone.wmj:user-agent-util,zone.wmj:user-agent-util:2.0.0,2.0.0,2022-06-16 07:45:05,2022-04-23 16:16:18,53.0,False
13413156,zw.co.paynow:java-sdk,zw.co.paynow:java-sdk:1.0.0,1.0.0,2019-01-26 13:26:11,NaT,,False
13413154,zw.co.paynow:java-sdk,zw.co.paynow:java-sdk:1.1.0,1.1.0,2019-03-27 08:12:50,2019-01-26 13:26:11,59.0,False


In [12]:
# Track if activity resumed after inactivity
artifact_releases['activity_resumed'] = artifact_releases.groupby('artifact_id')['long_time_inactive'].transform('any')


In [13]:
# Mark artifacts that had a long-time inactivity period but resumed
long_time_inactive_resumed = artifact_releases.groupby('artifact_id').apply(
    lambda x: x['long_time_inactive'].any() and not x['long_time_inactive'].iloc[-1]
).reset_index(name='resumed_after_inactivity')
long_time_inactive_resumed

  long_time_inactive_resumed = artifact_releases.groupby('artifact_id').apply(


Unnamed: 0,artifact_id,resumed_after_inactivity
0,HTTPClient:HTTPClient,False
1,abbot:abbot,False
2,abbot:costello,False
3,academy.alex:custommatcher,False
4,academy.compose.companion:multi-fab,False
...,...,...
634998,zone.src.sheaf:sheaf-parent,True
634999,zone.src.sheaf:web-sheaf,True
635000,zone.stefan.dev:geocode,False
635001,zone.wmj:user-agent-util,False


In [14]:
analytics_summary = pd.merge(
    unmaintained_artifacts[['artifact_id', 'unmaintained', 'last_release']],
    long_time_inactive_resumed[['artifact_id', 'resumed_after_inactivity']],
    on='artifact_id',
    how='left'
).fillna(False)
analytics_summary

Unnamed: 0,artifact_id,unmaintained,last_release,resumed_after_inactivity
0,HTTPClient:HTTPClient,True,2005-08-01 09:17:55,False
1,abbot:abbot,True,2015-09-22 15:54:01,False
2,abbot:costello,True,2015-09-24 09:21:46,False
3,academy.alex:custommatcher,True,2018-05-31 18:57:25,False
4,academy.compose.companion:multi-fab,True,2021-01-14 06:40:25,False
...,...,...,...,...
634998,zone.src.sheaf:sheaf-parent,False,2023-08-21 09:29:38,True
634999,zone.src.sheaf:web-sheaf,False,2023-08-21 12:15:54,True
635000,zone.stefan.dev:geocode,True,2021-01-08 17:44:45,False
635001,zone.wmj:user-agent-util,True,2022-06-16 07:45:05,False


In [15]:
print(analytics_summary[analytics_summary['unmaintained']])


                                artifact_id  unmaintained        last_release  \
0                     HTTPClient:HTTPClient          True 2005-08-01 09:17:55   
1                               abbot:abbot          True 2015-09-22 15:54:01   
2                            abbot:costello          True 2015-09-24 09:21:46   
3                academy.alex:custommatcher          True 2018-05-31 18:57:25   
4       academy.compose.companion:multi-fab          True 2021-01-14 06:40:25   
...                                     ...           ...                 ...   
634993            zone.refactor.text:writer          True 2020-01-04 19:26:01   
634994                   zone.refactor:text          True 2020-01-04 19:26:01   
635000              zone.stefan.dev:geocode          True 2021-01-08 17:44:45   
635001             zone.wmj:user-agent-util          True 2022-06-16 07:45:05   
635002                zw.co.paynow:java-sdk          True 2019-06-14 09:50:47   

        resumed_after_inact

In [16]:
print(analytics_summary[analytics_summary['resumed_after_inactivity']])


                             artifact_id  unmaintained        last_release  \
92      ae.teletronics.solr:solr-plugins          True 2018-05-23 06:05:39   
98                    aero.m-click:mcpdf         False 2023-01-28 01:42:16   
99                       aero.t2s:mode-s          True 2022-01-09 20:17:11   
192       ai.cardscan:insurance-cardscan         False 2024-05-17 22:53:51   
268                  ai.djl.android:core         False 2024-08-04 01:10:57   
...                                  ...           ...                 ...   
634995         zone.src.sheaf:java-sheaf         False 2023-08-21 12:09:42   
634996      zone.src.sheaf:logback-sheaf         False 2023-08-21 10:56:45   
634997     zone.src.sheaf:sheaf-deps-bom         False 2023-08-21 11:56:48   
634998       zone.src.sheaf:sheaf-parent         False 2023-08-21 09:29:38   
634999          zone.src.sheaf:web-sheaf         False 2023-08-21 12:15:54   

        resumed_after_inactivity  
92                          