In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from foundry.transforms import Dataset
from scipy import stats

In [None]:
# Load the base training facts table as a pandas DataFrame.
base_df = Dataset.get("training_facts_cap_3").read_table(format="pandas")

print(base_df.shape)
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
base_df.info()

In [None]:
# Add target to base_df for analysis & plotting.
# Within each maskedMatchId group, shift(-1) gives every row the noShow value
# of the row that follows it; the last row of each group gets NaN.
# NOTE(review): this reads as "next day" only if rows are already ordered
# chronologically within each maskedMatchId - confirm upstream ordering.
next_row_noshow = base_df.groupby('maskedMatchId')['noShow'].shift(-1)
base_df['noShowPredDay1'] = next_row_noshow

In [None]:
# Take the final row (in frame order) of every maskedMatchId group and
# summarise the noShow values found there.
last_rows = base_df.groupby('maskedMatchId').tail(1)

employee_count = len(last_rows)
last_row_noshow_counts = last_rows['noShow'].value_counts()

print(f"Number of employees: {employee_count}")
print(f"No-shows in last rows:\n{last_row_noshow_counts}")

In [None]:
# For every object-dtype column, show the mean and non-null count of the
# target within each distinct value of that column.
object_columns = base_df.select_dtypes(include=['object']).columns
for col in object_columns:
    print(f"{col} vs noShowPredDay1:")
    target_by_level = base_df.groupby(col)['noShowPredDay1'].agg(['mean', 'count'])
    print(target_by_level)

In [None]:
# Correlation of every numeric column with the target (noShowPredDay1).
numeric_features = base_df.select_dtypes(include=["number"])
target_corr = numeric_features.corr()['noShowPredDay1']

# The target's correlation with itself is not informative, so remove it.
correlations = target_corr.drop('noShowPredDay1')

# Split by sign, strongest first in each direction.
# Zero and NaN correlations fall into neither group.
positive_corr = correlations.loc[correlations > 0].sort_values(ascending=False)
negative_corr = correlations.loc[correlations < 0].sort_values(ascending=True)

positive_df = positive_corr.to_frame(name='correlation')
negative_df = negative_corr.to_frame(name='correlation')

print(f"Positive correlations: {positive_df}")
print(f"Negative correlations: {negative_df}")

In [None]:
# Side-by-side horizontal bar charts of the positive and negative
# correlations with the target in the base DF.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (axes, data, bar colour, title) for each panel, left to right.
panels = [
    (axes[0], positive_corr, 'cornflowerblue', 'Positive Correlations with noShowPredDay1'),
    (axes[1], negative_corr, 'lightsteelblue', 'Negative Correlations with noShowPredDay1'),
]
for panel_ax, series, bar_color, panel_title in panels:
    series.plot.barh(ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel('Correlation', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Check for multicollinearity between the two seniority columns
seniority_corr = base_df[['yearsOfSeniority', 'daysOfSeniority']].corr()
print(seniority_corr)

In [None]:
# Find pairs of numeric columns whose absolute correlation exceeds 0.95.
numeric_features = base_df.select_dtypes(include=['number'])
corr_matrix = numeric_features.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair appears once and self-correlations are ignored; the rest becomes NaN.
strict_upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(strict_upper_mask)

# Walk column by column and collect the rows above the threshold.
# NaN entries compare False and are skipped.
high_corr_pairs = []
for column in upper_triangle.columns:
    for row in upper_triangle.index:
        if upper_triangle.loc[row, column] > 0.95:
            high_corr_pairs.append((column, row, corr_matrix.loc[row, column]))

for first_col, second_col, strength in high_corr_pairs:
    print(f"{first_col} <-> {second_col}: {strength:.4f}")

In [None]:
# No-show rate per locationId, with the row count behind each bar.
cat_col = 'locationId'

# Mean of the target (the rate) and non-null row count per category.
noshow_rates = base_df.groupby(cat_col)['noShowPredDay1'].agg(['mean', 'count'])
noshow_rates['ratePct'] = noshow_rates['mean'] * 100
print(noshow_rates)

fig, ax = plt.subplots(figsize=(10, 6))
noshow_rates['ratePct'].plot.bar(ax=ax, color='cornflowerblue')
ax.set_title(f'No-Show Rate by {cat_col}')
ax.set_ylabel('No-Show Rate (%)')
ax.set_xlabel(cat_col)
plt.xticks(rotation=45)

# Start the y-axis at 0 and leave 15% headroom above the tallest bar for labels.
ax.set_ylim(0, noshow_rates['ratePct'].max() * 1.15)

# Write the row count just above each bar; bars sit at x = 0, 1, 2, ...
bar_values = zip(noshow_rates['ratePct'], noshow_rates['count'])
for bar_pos, (rate_pct, n_rows) in enumerate(bar_values):
    ax.text(bar_pos, rate_pct + 0.05, f"rows={n_rows:,.0f}",
            ha='center', fontsize=9)
plt.tight_layout()
plt.show()

In [None]:
# No-show rate per department, restricted to the 10 highest-rate departments.
cat_col = 'department'

# Mean of the target (the rate) and non-null row count per category.
noshow_rates = base_df.groupby(cat_col)['noShowPredDay1'].agg(['mean', 'count'])
noshow_rates['ratePct'] = noshow_rates['mean'] * 100

# Ranking is by rate only; the row count is not taken into account here.
top_10 = noshow_rates.nlargest(10, 'ratePct')

print(top_10)

fig, ax = plt.subplots(figsize=(12, 6))
top_10['ratePct'].plot.bar(ax=ax, color='cornflowerblue')
ax.set_title('Top 10 Departments by No-Show Rate')
ax.set_ylabel('No-Show Rate (%)')
ax.set_xlabel(cat_col)
plt.xticks(rotation=45, ha='right')

# Start the y-axis at 0 and leave 15% headroom above the tallest bar for labels.
ax.set_ylim(0, top_10['ratePct'].max() * 1.15)

# Write the row count just above each bar; bars sit at x = 0, 1, 2, ...
bar_values = zip(top_10['ratePct'], top_10['count'])
for bar_pos, (rate_pct, n_rows) in enumerate(bar_values):
    ax.text(bar_pos, rate_pct + 0.05, f"rows={n_rows:,.0f}",
            ha='center', fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# Weekly no-show counts, limited to rows dated today or earlier.
current_date = pd.to_datetime('today').normalize()

# Note: this converts base_df['date'] to datetime in place.
base_df['date'] = pd.to_datetime(base_df['date'])
on_or_before_today = base_df['date'] <= current_date
df_current = base_df[on_or_before_today]

# Index by date and sum the target per week ('W' resampling).
df_indexed = df_current.set_index('date')
weekly_noshows = df_indexed['noShowPredDay1'].resample('W').sum()

# The weeks with the highest counts get an annotation on the chart.
top_n = 5  # Number of peaks to annotate
top_weeks = weekly_noshows.nlargest(top_n)

fig, ax = plt.subplots(figsize=(14, 6))
weekly_noshows.plot(kind='line', color='cornflowerblue', linewidth=2, ax=ax)

# Label each peak with its count and week date, 10 points above the marker.
label_box = dict(boxstyle='round,pad=0.3', facecolor='lightblue', alpha=0.7)
label_arrow = dict(arrowstyle='->', connectionstyle='arc3,rad=0')
for week, total in top_weeks.items():
    ax.annotate(f'{int(total)}\n{week.strftime("%Y-%m-%d")}',
                xy=(week, total),
                xytext=(0, 10),
                textcoords='offset points',
                ha='center',
                fontsize=9,
                bbox=label_box,
                arrowprops=label_arrow)

ax.set_title('Weekly No-Show Counts')
ax.set_xlabel('Week')
ax.set_ylabel('Number of No-Shows')
plt.xticks(rotation=45)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# No-show rate per shift, with the row count behind each bar.
cat_col = 'shift'

# Mean of the target (the rate) and non-null row count per category.
noshow_rates = base_df.groupby(cat_col)['noShowPredDay1'].agg(['mean', 'count'])
noshow_rates['ratePct'] = noshow_rates['mean'] * 100
print(noshow_rates)

fig, ax = plt.subplots(figsize=(10, 6))
noshow_rates['ratePct'].plot.bar(ax=ax, color='cornflowerblue')
ax.set_title(f'No-Show Rate by {cat_col}')
ax.set_ylabel('No-Show Rate (%)')
ax.set_xlabel(cat_col)
plt.xticks(rotation=45)

# Start the y-axis at 0 and leave 15% headroom above the tallest bar for labels.
ax.set_ylim(0, noshow_rates['ratePct'].max() * 1.15)

# Write the row count just above each bar; bars sit at x = 0, 1, 2, ...
bar_values = zip(noshow_rates['ratePct'], noshow_rates['count'])
for bar_pos, (rate_pct, n_rows) in enumerate(bar_values):
    ax.text(bar_pos, rate_pct + 0.05, f"rows={n_rows:,.0f}",
            ha='center', fontsize=9)
plt.tight_layout()
plt.show()

In [None]:
# Load the final training table as a pandas DataFrame.
final_df = Dataset.get("training_df").read_table(format="pandas")

print(final_df.shape)
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
final_df.info()

# Object-dtype columns are computed once and reused for the loop below.
object_cols = final_df.select_dtypes(include=['object']).columns.tolist()
print(object_cols)

# Mean and non-null count of the target within each value of every object column.
for col in object_cols:
    print(f"{col} vs noShow_day1_target:")
    print(final_df.groupby(col)['noShow_day1_target'].agg(['mean', 'count']))

In [None]:
# Correlation of every numeric column with the target (noShow_day1_target).
numeric_features = final_df.select_dtypes(include=["number"])
target_corr = numeric_features.corr()['noShow_day1_target']

# The target's correlation with itself is not informative, so remove it.
correlations = target_corr.drop('noShow_day1_target')

# Split by sign, strongest first in each direction.
# Zero and NaN correlations fall into neither group.
positive_corr = correlations.loc[correlations > 0].sort_values(ascending=False)
negative_corr = correlations.loc[correlations < 0].sort_values(ascending=True)

positive_df = positive_corr.to_frame(name='correlation')
negative_df = negative_corr.to_frame(name='correlation')

print(f"Positive correlations: {positive_df}")
print(f"Negative correlations: {negative_df}")

In [None]:
# Keep the 25 strongest positive and 25 strongest negative correlations
top_25_positive = positive_corr.nlargest(25)
top_25_negative = negative_corr.nsmallest(25)

# Side-by-side horizontal bar charts, positive on the left, negative on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 8))

# (axes, data, bar colour, title) for each panel, left to right.
panels = [
    (axes[0], top_25_positive, 'cornflowerblue', 'Top 25 Positive Correlations with noShow_day1_target'),
    (axes[1], top_25_negative, 'lightsteelblue', 'Top 25 Negative Correlations with noShow_day1_target'),
]
for panel_ax, series, bar_color, panel_title in panels:
    series.plot.barh(ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel('Correlation', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Find pairs of numeric columns whose absolute correlation exceeds 0.95.
numeric_features = final_df.select_dtypes(include=['number'])
corr_matrix = numeric_features.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair appears once and self-correlations are ignored; the rest becomes NaN.
strict_upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(strict_upper_mask)

# Walk column by column and collect the rows above the threshold.
# NaN entries compare False and are skipped.
high_corr_pairs = []
for column in upper_triangle.columns:
    for row in upper_triangle.index:
        if upper_triangle.loc[row, column] > 0.95:
            high_corr_pairs.append((column, row, corr_matrix.loc[row, column]))

# Strongest pairs first (the third tuple element is the correlation).
high_corr_pairs_sorted = sorted(high_corr_pairs, key=lambda pair: pair[2], reverse=True)

for first_col, second_col, strength in high_corr_pairs_sorted:
    print(f"{first_col} <-> {second_col}: {strength:.4f}")

In [None]:
# Compare feature distribution for show vs no-show
feature = 'noShowCount_7d'

# Split the feature by target class; missing values cannot be binned, so drop them.
show_values = final_df.loc[final_df['noShow_day1_target'] == 0, feature].dropna()
noshow_values = final_df.loc[final_df['noShow_day1_target'] == 1, feature].dropna()

# Use ONE set of bin edges for both groups. With bins=50 passed to each hist
# call separately, every group gets 50 bins over its own min..max range, so the
# overlaid bars have different widths/positions and are not directly comparable.
pooled_values = pd.concat([show_values, noshow_values]).astype(float)
bin_edges = np.histogram_bin_edges(pooled_values, bins=50)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(show_values, bins=bin_edges, alpha=0.5, label='Show (0)',
        density=True, color='lightblue')
ax.hist(noshow_values, bins=bin_edges, alpha=0.5, label='No-Show (1)',
        density=True, color='red')
ax.set_xlabel(feature)
ax.set_ylabel('Density')
ax.set_title(f'{feature} Distribution by No-Show Status')
ax.legend()
plt.show()

In [None]:
# Compare feature distribution for show vs no-show
feature = 'daysSinceLastNoShow'

# Split the feature by target class; missing values cannot be binned, so drop them.
show_values = final_df.loc[final_df['noShow_day1_target'] == 0, feature].dropna()
noshow_values = final_df.loc[final_df['noShow_day1_target'] == 1, feature].dropna()

# Use ONE set of bin edges for both groups. With bins=50 passed to each hist
# call separately, every group gets 50 bins over its own min..max range, so the
# overlaid bars have different widths/positions and are not directly comparable.
pooled_values = pd.concat([show_values, noshow_values]).astype(float)
bin_edges = np.histogram_bin_edges(pooled_values, bins=50)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(show_values, bins=bin_edges, alpha=0.5, label='Show (0)',
        density=True, color='lightblue')
ax.hist(noshow_values, bins=bin_edges, alpha=0.5, label='No-Show (1)',
        density=True, color='red')
ax.set_xlabel(feature)
ax.set_ylabel('Density')
ax.set_title(f'{feature} Distribution by No-Show Status')
ax.legend()
plt.show()

In [None]:
# Test for significance of distribution difference
feature = 'daysSinceLastNoShow'

# Day thresholds that split the no-show group into the two peaks examined below.
RECENT_MAX_DAYS = 100
DORMANT_MIN_DAYS = 500

# Separate the groups
show_group = final_df.loc[final_df['noShow_day1_target'] == 0, feature]
noshow_group = final_df.loc[final_df['noShow_day1_target'] == 1, feature]

# Missing values are excluded up front: scipy's mannwhitneyu defaults to
# nan_policy='propagate', so a single NaN would turn the test result into NaN,
# and the group sizes printed below should match what the tests actually see.
n_missing = int(show_group.isna().sum()) + int(noshow_group.isna().sum())
if n_missing:
    print(f"Excluding {n_missing} rows with missing {feature}")
show_group = show_group.dropna()
noshow_group = noshow_group.dropna()

# Descriptive statistics
print("=== Descriptive Statistics ===")
print(f"\nShow Group (n={len(show_group)}):")
print(f"  Mean: {show_group.mean():.2f}")
print(f"  Median: {show_group.median():.2f}")
print(f"  Std Dev: {show_group.std():.2f}")

print(f"\nNo-Show Group (n={len(noshow_group)}):")
print(f"  Mean: {noshow_group.mean():.2f}")
print(f"  Median: {noshow_group.median():.2f}")
print(f"  Std Dev: {noshow_group.std():.2f}")

# Analyze bimodal peaks for no-shows
print("\n=== Bimodal Analysis for No-Shows ===")
recent_repeaters = noshow_group[noshow_group <= RECENT_MAX_DAYS]  # Recent peak
dormant_reactivators = noshow_group[noshow_group >= DORMANT_MIN_DAYS]  # Dormant peak

# Guard the share computation so an empty no-show group prints nan
# instead of raising ZeroDivisionError.
n_noshow = len(noshow_group)
recent_pct = len(recent_repeaters) / n_noshow * 100 if n_noshow else float('nan')
dormant_pct = len(dormant_reactivators) / n_noshow * 100 if n_noshow else float('nan')

print(f"Recent repeaters (≤{RECENT_MAX_DAYS} days): {len(recent_repeaters)} ({recent_pct:.1f}%)")
print(f"  Mean: {recent_repeaters.mean():.2f}")
print(f"Dormant reactivators (≥{DORMANT_MIN_DAYS} days): {len(dormant_reactivators)} ({dormant_pct:.1f}%)")
print(f"  Mean: {dormant_reactivators.mean():.2f}")

# Statistical tests
print("\n=== Statistical Tests ===")

if show_group.empty or noshow_group.empty:
    # Both tests need at least one observation per sample.
    print("\nSkipping tests: at least one group has no observations.")
else:
    # Mann-Whitney U test - non-parametric, good for non-normal distributions
    statistic, p_value = stats.mannwhitneyu(show_group, noshow_group, alternative='two-sided')
    print("\nMann-Whitney U Test:")
    print(f"  U-statistic: {statistic:.2f}")
    print(f"  p-value: {p_value:.2e}")

    # Kolmogorov-Smirnov test - tests if distributions are different
    ks_stat, ks_pvalue = stats.ks_2samp(show_group, noshow_group)
    print("\nKolmogorov-Smirnov Test:")
    print(f"  KS-statistic: {ks_stat:.4f}")
    print(f"  p-value: {ks_pvalue:.2e}")

In [None]:
# Compare feature distribution for show vs no-show
feature = 'noShowAfterHolidayRate'

# Split the feature by target class; missing values cannot be binned, so drop them.
show_values = final_df.loc[final_df['noShow_day1_target'] == 0, feature].dropna()
noshow_values = final_df.loc[final_df['noShow_day1_target'] == 1, feature].dropna()

# Use ONE set of bin edges for both groups. With bins=50 passed to each hist
# call separately, every group gets 50 bins over its own min..max range, so the
# overlaid bars have different widths/positions and are not directly comparable.
pooled_values = pd.concat([show_values, noshow_values]).astype(float)
bin_edges = np.histogram_bin_edges(pooled_values, bins=50)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(show_values, bins=bin_edges, alpha=0.5, label='Show (0)',
        density=True, color='lightblue')
ax.hist(noshow_values, bins=bin_edges, alpha=0.5, label='No-Show (1)',
        density=True, color='red')
ax.set_xlabel(feature)
ax.set_ylabel('Density')
ax.set_title(f'{feature} Distribution by No-Show Status')
ax.legend()
plt.show()

In [None]:
# Compare feature distribution for show vs no-show
feature = 'manager_team_noshow_rate_7d'

# Split the feature by target class; missing values cannot be binned, so drop them.
show_values = final_df.loc[final_df['noShow_day1_target'] == 0, feature].dropna()
noshow_values = final_df.loc[final_df['noShow_day1_target'] == 1, feature].dropna()

# Use ONE set of bin edges for both groups. With bins=50 passed to each hist
# call separately, every group gets 50 bins over its own min..max range, so the
# overlaid bars have different widths/positions and are not directly comparable.
pooled_values = pd.concat([show_values, noshow_values]).astype(float)
bin_edges = np.histogram_bin_edges(pooled_values, bins=50)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(show_values, bins=bin_edges, alpha=0.5, label='Show (0)',
        density=True, color='lightblue')
ax.hist(noshow_values, bins=bin_edges, alpha=0.5, label='No-Show (1)',
        density=True, color='red')
ax.set_xlabel(feature)
ax.set_ylabel('Density')
ax.set_title(f'{feature} Distribution by No-Show Status')
ax.legend()
plt.show()