In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the release records from the CSV file; the path is relative to the
# notebook's working directory.
abandoned_csv_path = 'abandoned_projects_with_labels.csv'
abandoned_df = pd.read_csv(abandoned_csv_path)

In [None]:
# Parse 'release_timestamp' into datetimes (the column is overwritten in place)
# so the .dt accessors and min/max aggregations below operate on real dates.
abandoned_df['release_timestamp'] = pd.to_datetime(abandoned_df['release_timestamp'])

# One grouping, reused for both the rhythm lists and the speed aggregates.
releases_by_artifact = abandoned_df.groupby('artifact_id')

# Release rhythm: for each artifact, the list of its release timestamps
# (in the order the rows appear in abandoned_df).
rhythm_df = (
    releases_by_artifact['release_timestamp']
    .apply(list)
    .reset_index(name='release_rhythm')
)

# Per-artifact release count plus earliest and latest release.
artifact_speed = releases_by_artifact.agg(
    release_count=('release_id', 'size'),
    first_release=('release_timestamp', 'min'),
    last_release=('release_timestamp', 'max'),
).reset_index()

# Express each date as a running calendar-month number (year * 12 + month);
# the difference is the number of calendar months between first and last release.
first_month_number = (
    artifact_speed['first_release'].dt.year * 12 + artifact_speed['first_release'].dt.month
)
last_month_number = (
    artifact_speed['last_release'].dt.year * 12 + artifact_speed['last_release'].dt.month
)

# A difference of 0 means every release of the artifact fell in the same
# calendar month (which includes the single-release case); count that as
# 1 month so the division below never divides by zero.
artifact_speed['months_interval'] = (last_month_number - first_month_number).replace(0, 1)

# Speed = releases per month over the artifact's release window.
artifact_speed['speed'] = artifact_speed['release_count'] / artifact_speed['months_interval']

# Attach release_count and speed to the rhythm table, one row per artifact.
speed_columns = ['artifact_id', 'release_count', 'speed']
df_with_features = rhythm_df.merge(artifact_speed[speed_columns], on='artifact_id')

# Show the resulting feature table
df_with_features


In [None]:
# Join the per-artifact release rhythm with release_count, speed and
# months_interval from artifact_speed (inner join on artifact_id).
feature_columns = ['artifact_id', 'release_count', 'speed', 'months_interval']
df_with_features = rhythm_df.merge(artifact_speed[feature_columns], on='artifact_id')

# Show the merged feature table
df_with_features


In [None]:
# Summary statistics of the feature table (with default arguments pandas
# describes the numeric columns); useful for sanity-checking speed and
# months_interval before binning them.
df_with_features.describe()

In [None]:
# Calculate lifespan of projects in years
df_with_features['lifespan_years'] = df_with_features['months_interval'] / 12

# Both binnings split at 1 and 2 and are labelled "<1", "1-2", ">2".
# pd.cut's default right-closed intervals would produce (.., 1], (1, 2], (2, ..),
# which files a value of exactly 1 under the "<1" label -- e.g. an artifact
# with one release and a months_interval of 1 has a speed of exactly 1.0, and a
# 12-month lifespan is exactly 1.0 years. Using left-closed intervals
# (right=False) and moving the upper edge to the next representable float above
# 2 gives [.., 1), [1, 2], (2, ..), which is what the labels actually say.
just_above_two = np.nextafter(2, np.inf)

# Bin speed into categories
speed_bins = [-np.inf, 1, just_above_two, np.inf]
speed_labels = ['<1', '1-2', '>2']
df_with_features['speed_category'] = pd.cut(
    df_with_features['speed'],
    bins=speed_bins,
    labels=speed_labels,
    right=False,
)

# Bin lifespan into categories
lifespan_bins = [0, 1, just_above_two, np.inf]
lifespan_labels = ['<1 year', '1-2 years', '>2 years']
df_with_features['lifespan_category'] = pd.cut(
    df_with_features['lifespan_years'],
    bins=lifespan_bins,
    labels=lifespan_labels,
    right=False,
)

# Create crosstab (speed category x lifespan category) with a 'Total' row and column
crosstab = pd.crosstab(
    df_with_features['speed_category'],
    df_with_features['lifespan_category'],
    margins=True,
    margins_name='Total',
)
crosstab

In [None]:
# Cross-tabulate speed category (rows) against lifespan category (columns);
# margins=True appends a "Total" row and a "Total" column.
crosstab_with_totals = pd.crosstab(
    index=df_with_features['speed_category'],
    columns=df_with_features['lifespan_category'],
    margins=True,
    margins_name="Total",
)

# The cell where the "Total" row meets the "Total" column is the grand total;
# dividing by it turns every count (margins included) into a percentage of all rows.
grand_total = crosstab_with_totals.loc["Total", "Total"]
crosstab_percent = 100 * (crosstab_with_totals / grand_total)
crosstab_percent

# abandoned

In [None]:
# Display the feature table; if the cells above ran in order it now also holds
# the lifespan_years, speed_category and lifespan_category columns.
df_with_features