In [None]:
import pandas as pd
from tqdm import tqdm

import warnings
# NOTE(review): this silences every warning category for the whole session,
# which also hides pandas' SettingWithCopyWarning and FutureWarning raised by
# later cells. Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Load the raw release table. Later cells rely on its `artifact_id` and
# `release_timestamp` columns; the path is relative to the notebook's working
# directory.
data_ab = pd.read_csv('active_projects_with_labels.csv')

In [None]:
# Keep only artifacts with more than 4 releases (i.e. at least 5 rows).
# `transform('size')` broadcasts each artifact's row count back onto its own
# rows, so the filter is a single vectorized comparison instead of a Python
# lambda called once per artifact.
releases_per_artifact = data_ab.groupby('artifact_id')['artifact_id'].transform('size')

# `.copy()` gives `data` its own storage, so the column assignment below never
# writes into a slice of `data_ab`.
data = data_ab[releases_per_artifact > 4].copy()

# Parse the timestamps once so later cells can sort and subtract them.
data['release_timestamp'] = pd.to_datetime(data['release_timestamp'])


In [None]:
# Inspect the filtered release table (artifacts with more than 4 releases).
data

In [None]:
# Number of distinct artifacts present in `data`.
data['artifact_id'].nunique()

In [None]:
# Order each artifact's releases chronologically, then record the gap in days
# between a release and the one before it. The first release of every artifact
# has no predecessor, so its gap is NaN.
data = (
    data
    .sort_values(by=['artifact_id', 'release_timestamp'])
    .assign(
        time_gap=lambda frame: (
            frame.groupby('artifact_id')['release_timestamp'].diff().dt.total_seconds()
            / (60 * 60 * 24)
        )
    )
)


In [None]:
# Inspect the table again, now sorted and carrying the `time_gap` column.
data

In [None]:

def analyze_artifact(group, fast_factor=0.8, slow_factor=1.2):
    """Label the release pace of one artifact within each quartile of its lifetime.

    The artifact's lifetime (first release to last release) is split into four
    equal time spans, Q1-Q4. Every release is placed in the span it falls into,
    the average gap to the following release is computed per span, and each
    span is labelled by comparing that average with the artifact-wide mean gap.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single artifact. Must contain a datetime column
        ``release_timestamp``. The frame passed in is not modified and does
        not need to be pre-sorted.
    fast_factor : float, default 0.8
        A quartile is 'Fast' when its average gap is below
        ``fast_factor * mean_gap``.
    slow_factor : float, default 1.2
        A quartile is 'Slow' when its average gap is above
        ``slow_factor * mean_gap``. Anything in between is 'Normal'.

    Returns
    -------
    pandas.DataFrame
        The artifact's rows in chronological order, without the final release
        (it has no following release), plus the columns
        ``cumulative_progress``, ``Quartile``, ``time_to_next_release`` (days)
        and ``Quartile_Label``.
    """
    seconds_per_day = 60 * 60 * 24

    # `sort_values` returns a new frame, so the caller's data is never mutated,
    # and the shift-based gap below no longer depends on the caller having
    # sorted the rows. A stable sort keeps tied timestamps in input order.
    releases = group.sort_values('release_timestamp', kind='stable')

    project_start = releases['release_timestamp'].min()
    project_end = releases['release_timestamp'].max()
    total_duration = (project_end - project_start).total_seconds() / seconds_per_day

    # Position of each release within the lifetime, from 0.0 to 1.0.
    # NOTE: if every release shares one timestamp the duration is 0, the
    # division yields NaN and those rows end up without a quartile.
    releases['cumulative_progress'] = (
        (releases['release_timestamp'] - project_start).dt.total_seconds() / seconds_per_day
    ) / total_duration

    # Bins are right-inclusive; `include_lowest` puts progress 0.0 into Q1.
    releases['Quartile'] = pd.cut(
        releases['cumulative_progress'],
        bins=[0, 0.25, 0.5, 0.75, 1],
        labels=['Q1', 'Q2', 'Q3', 'Q4'],
        include_lowest=True
    )

    # Days from each release to the following one.
    releases['time_to_next_release'] = (
        releases['release_timestamp'].shift(-1) - releases['release_timestamp']
    ).dt.total_seconds() / seconds_per_day

    # The last release has no successor (NaN gap), so it is dropped rather than
    # filled. `.copy()` makes the filtered frame independent, which keeps the
    # label assignment below from writing into a slice.
    releases = releases[releases['time_to_next_release'].notna()].copy()

    # `observed=False` pins the behaviour for the categorical key: quartiles
    # without releases stay in the result with a NaN average.
    average_intervals = releases.groupby('Quartile', observed=False)['time_to_next_release'].mean()

    mean_interval = releases['time_to_next_release'].mean()
    fast_threshold = fast_factor * mean_interval
    slow_threshold = slow_factor * mean_interval

    # A NaN average fails both comparisons and therefore maps to 'Normal'; that
    # only affects quartiles with no rows, so it never reaches the output.
    quartile_labels = average_intervals.apply(
        lambda x: 'Fast' if x < fast_threshold else
                  ('Slow' if x > slow_threshold else 'Normal')
    )

    # Broadcast each quartile's label back onto its releases.
    releases['Quartile_Label'] = releases['Quartile'].map(quartile_labels.to_dict())

    return releases


In [None]:
# NOTE(review): these imports sit mid-notebook, and `tqdm` is already imported
# in the first cell; consider moving them into the top import cell.
from pandarallel import pandarallel
from tqdm import tqdm


# Initialize pandarallel
pandarallel.initialize(progress_bar=True, verbose=1)  # Enable progress bar and verbosity

# Apply the function in parallel
# `analyze_artifact` is invoked once per `artifact_id` group and the per-group
# frames it returns are combined into `result`.
result = data.groupby('artifact_id').parallel_apply(analyze_artifact)


In [None]:
# Replace whatever index the grouped apply produced with a plain positional one.
result = result.reset_index(drop=True)

# Narrow the frame to the columns describing each release's quartile and pace.
final_df = result.loc[:, [
    'artifact_id',
    'release_timestamp',
    'cumulative_progress',
    'Quartile',
    'time_to_next_release',
    'Quartile_Label',
]]
final_df

In [None]:
# Group by artifact_id and Quartile to calculate average time intervals and labels

# One row per (artifact, quartile): the mean gap to the next release and the
# quartile's pace label. `Quartile` is a categorical key, and the default for
# `observed` differs between pandas versions; pinning `observed=False` keeps
# quartiles without releases in the result as NaN rows on every version.
quartile_summary = (
    result
    .groupby(['artifact_id', 'Quartile'], observed=False)
    .agg(
        avg_time_interval=('time_to_next_release', 'mean'),
        # Every row of a quartile carries the same label, so the first one is enough.
        quartile_label=('Quartile_Label', 'first'),
    )
    .reset_index()
)


In [None]:
# Reshape to one row per artifact, with a column for every
# (statistic, quartile) combination.
pivoted_df = quartile_summary.pivot(
    index='artifact_id',
    columns='Quartile',
    values=['avg_time_interval', 'quartile_label'],
)

# Collapse the two-level column index into flat "<statistic>_<quartile>" names
# and turn `artifact_id` back into an ordinary column.
pivoted_df = pivoted_df.set_axis(
    [f"{stat}_{quartile}" for stat, quartile in pivoted_df.columns],
    axis=1,
).reset_index()


In [None]:
# Inspect the wide table: one row per artifact, flat "<statistic>_<quartile>" columns.
pivoted_df

In [None]:
# Build a single "Q1: <label> > Q2: <label> > ..." string per artifact.
# NOTE(review): the following cell overwrites this column with a prefix-free
# version, so this variant only affects the table displayed here.
pivoted_df['quartile_label_summary'] = pivoted_df.apply(
    lambda row: " > ".join(
        f"{quartile}: {row['quartile_label_' + quartile]}"
        for quartile in ('Q1', 'Q2', 'Q3', 'Q4')
    ),
    axis=1,
)
pivoted_df


In [None]:
# Build a single "<Q1 label> > <Q2 label> > <Q3 label> > <Q4 label>" string per
# artifact. A quartile without a label is rendered as the text "nan".
pivoted_df['quartile_label_summary'] = pivoted_df.apply(
    lambda row: " > ".join(
        str(row['quartile_label_' + quartile])
        for quartile in ('Q1', 'Q2', 'Q3', 'Q4')
    ),
    axis=1,
)
pivoted_df

In [None]:
# Fraction of artifacts following each quartile-label pattern, most common first.
pivoted_df['quartile_label_summary'].value_counts(normalize=True)

In [None]:
# Ten most frequent quartile-label patterns, as fractions of all artifacts.
top_10 = (
    pivoted_df['quartile_label_summary']
    .value_counts(normalize=True)
    .iloc[:10]
)
print(top_10)

stopped here

In [None]:
# The ten most frequent patterns, once as raw artifact counts and once as
# percentages of all artifacts. Note that `top_10` now holds counts, not shares.
top_10 = (
    pivoted_df['quartile_label_summary']
    .value_counts(normalize=False)
    .iloc[:10]
)
top_10_normalized = (
    pivoted_df['quartile_label_summary']
    .value_counts(normalize=True)
    .iloc[:10]
    .mul(100)
)

# Side-by-side table of both measures, indexed by pattern.
top_10_combined = pd.DataFrame({'Count': top_10, 'Proportion': top_10_normalized})

print(top_10_combined)

In [None]:
# Artifacts whose pattern is one of the ten most common ones.
top_10_patterns = top_10_combined.index
top_10_df = pivoted_df.loc[pivoted_df['quartile_label_summary'].isin(top_10_patterns)]

# Number of rows (releases) per artifact in the release table.
release_count_stats = (
    data
    .groupby('artifact_id')
    .size()
    .reset_index(name='release_count')
)

# Attach each artifact's release count to its pattern row.
top_10_with_counts = pd.merge(
    top_10_df,
    release_count_stats,
    on='artifact_id',
    how='left',
)

# Smallest and largest release count observed for each pattern.
top_10_release_count_range = (
    top_10_with_counts
    .groupby('quartile_label_summary')['release_count']
    .agg(['min', 'max'])
    .reset_index()
)

# Join the range onto the count/percentage table and give the new columns
# descriptive names.
top_10_combined_with_range = pd.merge(
    top_10_combined,
    top_10_release_count_range,
    left_index=True,
    right_on='quartile_label_summary',
).rename(columns={'min': 'Min Release Count', 'max': 'Max Release Count'})

top_10_combined_with_range


In [None]:
# Release-count statistics (min, mean, median, max) for each top-10 pattern.
top_10_release_count_stats = (
    top_10_with_counts
    .groupby('quartile_label_summary')['release_count']
    .agg(['min', 'mean', 'median', 'max'])
    .reset_index()
)

# Join the statistics onto the count/percentage table and give the new columns
# descriptive names.
top_10_combined_with_stats = pd.merge(
    top_10_combined,
    top_10_release_count_stats,
    left_index=True,
    right_on='quartile_label_summary',
).rename(
    columns={
        'min': 'Min Release Count',
        'mean': 'Mean Release Count',
        'median': 'Median Release Count',
        'max': 'Max Release Count',
    }
)

top_10_combined_with_stats


In [None]:
# pivoted_df['quartile_label_summary'].value_counts(normalize=True)

In [None]:
# How many artifacts follow each pattern ...
counts = pivoted_df['quartile_label_summary'].value_counts()

# ... and what percentage of all artifacts that is.
percentages = pivoted_df['quartile_label_summary'].value_counts(normalize=True).mul(100)

# One table with both measures, indexed by pattern.
summary = pd.DataFrame({'Count': counts, 'Percentage (%)': percentages})

summary

# Active projects

In [None]:
# Pattern to look up. "nan" is how a quartile without a label is rendered in
# `quartile_label_summary`.
desired_pattern = "Normal > Normal > nan > Normal"

# Keep the artifacts whose summary string equals the pattern exactly, then
# show their id next to the matched pattern.
matching_artifacts = pivoted_df.loc[
    pivoted_df['quartile_label_summary'].eq(desired_pattern)
]
print(matching_artifacts.loc[:, ['artifact_id', 'quartile_label_summary']])
