In [None]:
import pandas as pd

def _format_count_and_share(counts, shares):
    """Return a frame of strings shaped like '<count> (<share>%)'.

    `counts` is cast to int and `shares` is rounded to two decimals before
    both are converted to text and joined element-wise.
    """
    count_text = counts.astype(int).astype(str)
    share_text = shares.round(2).astype(str)
    return count_text + " (" + share_text + "%)"


# Column labels that are referenced repeatedly below.
TYPE_COL = 'Types of miscommunication'
TIMING_COL = 'Timing'

# Read the spreadsheet into a DataFrame.
file_path = 'all_issues_sampled_v2.xlsx'
df = pd.read_excel(file_path)

# Step 1: keep only rows whose type label is neither 'No miscommunication'
# nor 'Others' (applied as two successive filters).
is_miscommunication = df[TYPE_COL] != 'No miscommunication'
miscommunication_df = df[is_miscommunication]
is_not_other_type = miscommunication_df[TYPE_COL] != 'Others'
miscommunication_df = miscommunication_df[is_not_other_type]

# Step 2: frequency of every 'Timing' value in that subset, plus each value's
# share of the subset total expressed as a percentage.
timing_counts = miscommunication_df[TIMING_COL].value_counts()
timing_percentage = timing_counts.div(timing_counts.sum()).mul(100)

# One tidy table holding the timing label, its count and its percentage.
timing_df = pd.DataFrame(
    {
        'Timing': timing_counts.index,
        'Count': timing_counts.to_numpy(),
        'Percentage (%)': timing_percentage.to_numpy(),
    }
)

# Drop rows whose Timing is 'Others', then count rows per (type, timing) pair.
# NOTE(review): the variable names suggest duplicate issues are being removed,
# but the value filtered out here is 'Others' -- confirm which is intended.
non_duplicate_df = miscommunication_df.loc[miscommunication_df[TIMING_COL] != 'Others']
non_duplicate_grouped = (
    non_duplicate_df
    .groupby([TYPE_COL, TIMING_COL])
    .size()
    .reset_index(name='Count')
)

# Share of each (type, timing) pair relative to ALL remaining rows.
overall_total = non_duplicate_grouped['Count'].sum()
non_duplicate_grouped['Percentage (%)'] = non_duplicate_grouped['Count'].div(overall_total).mul(100)

# Wide layout (types as rows, timings as columns) for counts and percentages;
# pairs that never occur become 0. The two are merged into "count (pct%)" text.
pivot_axes = {'index': TYPE_COL, 'columns': TIMING_COL}
non_duplicate_pivot = non_duplicate_grouped.pivot(values='Count', **pivot_axes).fillna(0)
non_duplicate_pivot_percentage = non_duplicate_grouped.pivot(values='Percentage (%)', **pivot_axes).fillna(0)
combined_non_duplicate_df = _format_count_and_share(non_duplicate_pivot, non_duplicate_pivot_percentage)

# Same counts again, but this time percentages are taken WITHIN each type:
# every row of the table is divided by that row's own total.
grouped_by_type = (
    non_duplicate_df
    .groupby([TYPE_COL, TIMING_COL])
    .size()
    .unstack(fill_value=0)
)
totals_by_type = grouped_by_type.sum(axis=1)
percentages_within_type = grouped_by_type.div(totals_by_type, axis=0).mul(100)

# Final "count (pct%)" table; left as the last expression so the cell shows it.
combined_within_type_df = _format_count_and_share(grouped_by_type, percentages_within_type)
combined_within_type_df

In [None]:
# Count of each 'Timing' value over the full, unfiltered dataset `df`.
df['Timing'].value_counts()

In [None]:
# Show the Timing / Count / Percentage (%) table built from miscommunication_df.
timing_df

In [None]:
# Relative frequency (proportions, not counts) of each 'Timing' value in
# non_duplicate_df, i.e. miscommunication rows whose Timing is not 'Others'.
non_duplicate_df['Timing'].value_counts(normalize=True)

In [None]:
# Absolute count of each 'Timing' value in non_duplicate_df
# (miscommunication rows whose Timing is not 'Others').
non_duplicate_df['Timing'].value_counts()

In [None]:
# Show the filtered rows: type is neither 'No miscommunication' nor 'Others'.
miscommunication_df

In [None]:
# Count of each value in the 'state' column of the miscommunication subset.
# NOTE(review): presumably the issue's open/closed state -- confirm against the data.
miscommunication_df['state'].value_counts()

In [None]:
# Absolute count of each 'Root cause' value in the miscommunication subset.
miscommunication_df['Root cause'].value_counts()

In [None]:
# Relative frequency (proportions, not counts) of each 'Root cause' value
# in the miscommunication subset.
miscommunication_df['Root cause'].value_counts(normalize=True)