# Descriptive Analysis: CUDA Error Resolution on PyTorch Forum

This notebook performs descriptive statistical analysis on ~80,000 PyTorch forum topics.

## Goals
1. Understand data distributions
2. Compare CUDA vs non-CUDA questions
3. Identify correlations
4. Test category effects (ANOVA)

## Setup

In [1]:
# Import libraries
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.patches import Patch
from scipy import stats

sys.path.append('..')

# Configure plotting
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

print("Setup complete!")

Setup complete!


## Load Data

In [2]:
# Load processed data
# The path is relative, so it is resolved against the kernel's working
# directory rather than against the notebook file itself.
DATA_PATH = Path('../data/processed/forum_data.csv')

if not DATA_PATH.is_file():
    # Fail early with the fully resolved location and the working directory so
    # a wrong start directory or a missing file is immediately diagnosable.
    raise FileNotFoundError(
        f"Processed dataset not found at '{DATA_PATH.resolve()}' "
        f"(working directory: '{Path.cwd()}'). "
        "Check that the processed CSV exists and that the kernel was started "
        "from the notebook's directory."
    )

df = pd.read_csv(DATA_PATH)

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst few rows:")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/forum_data.csv'

## 1. Summary Statistics

In [None]:
# Headline counts: each flag column is summed for the absolute count and
# averaged for the share of all topics.
cuda_flag = df['is_cuda_related']
resolved_flag = df['is_resolved']
code_flag = df['has_code_block']

print("=== OVERALL STATISTICS ===\n")
print(f"Total topics: {len(df):,}")
print(f"CUDA-related: {cuda_flag.sum():,} ({cuda_flag.mean()*100:.1f}%)")
print(f"Resolved topics: {resolved_flag.sum():,} ({resolved_flag.mean()*100:.1f}%)")
print(f"Topics with code: {code_flag.sum():,} ({code_flag.mean()*100:.1f}%)")

# Distribution summary (count, mean, std, quartiles) of the numeric features;
# the table is the cell's displayed output.
print("\n=== NUMERICAL FEATURES ===")
summary_columns = [
    'views',
    'reply_count',
    'like_count',
    'question_length',
    'time_to_resolution_hours',
]
df[summary_columns].describe()

### Comparison: CUDA vs Non-CUDA

In [None]:
# Compare CUDA vs non-CUDA questions
# Timing statistics are only defined for topics that have a resolution time,
# so the aggregation below runs on that subset.
resolved_df = df[df['time_to_resolution_hours'].notna()].copy()

comparison = resolved_df.groupby('is_cuda_related').agg({
    'time_to_resolution_hours': ['mean', 'median', 'std', 'count'],
    'time_to_first_response_hours': ['mean', 'median'],
    'is_resolved': 'mean',
    'views': 'mean',
    'reply_count': 'mean',
    'has_code_block': 'mean',
    'question_length': 'mean'
}).round(2)

# The resolution rate has to be measured on ALL topics: `resolved_df` keeps
# only rows with a resolution time, so the rate computed on it is degenerate
# (presumably ~1.0 for both groups -- verify against the data). Overwrite that
# column with the rate from the full dataset; assignment aligns on the group key.
comparison[('is_resolved', 'mean')] = (
    df.groupby('is_cuda_related')['is_resolved'].mean().round(2)
)

# Relabel by group key instead of by position, so labels cannot be swapped and
# a dataset containing only one of the two groups does not raise.
comparison = comparison.rename(index={False: 'Non-CUDA', True: 'CUDA'})
print(comparison)

## 2. Correlation Analysis

In [None]:
# Columns entering the correlation matrix (flags, counts, lengths, timings)
numerical_cols = [
    'is_cuda_related',
    'views',
    'reply_count',
    'like_count',
    'has_code_block',
    'code_block_count',
    'question_length',
    'has_error_trace',
    'time_to_resolution_hours',
    'time_to_first_response_hours',
]

# Pairwise correlations (pandas default method)
corr_matrix = df[numerical_cols].corr()

# Annotated heatmap, diverging palette centred on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Correlation Matrix', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

# Resolution-time column of the matrix, strongest positive correlation first
resolution_corr = corr_matrix['time_to_resolution_hours']
print("\nTop correlations with resolution time:")
print(resolution_corr.sort_values(ascending=False))

## 3. ANOVA: Category Effect

In [None]:
# Test if category significantly affects resolution time (one-way ANOVA)
MIN_SAMPLES_PER_CATEGORY = 5   # categories with fewer resolved topics are excluded
ALPHA = 0.05                   # significance level for the test

# Only topics with a recorded resolution time can enter the test.
# `resolved` is also used by the visualization cells further down.
resolved = df[df['time_to_resolution_hours'].notna()].copy()

# Group by category and keep only categories with enough samples
categories = {
    category_id: times.tolist()
    for category_id, times in resolved.groupby('category_id')['time_to_resolution_hours']
    if len(times) >= MIN_SAMPLES_PER_CATEGORY
}

print(f"Testing {len(categories)} categories with sufficient data")

if len(categories) < 2:
    # One-way ANOVA compares at least two groups; with fewer there is nothing
    # to test, so report that instead of calling f_oneway.
    f_stat, p_value = np.nan, np.nan
    print("\nANOVA skipped: at least two categories with "
          f"{MIN_SAMPLES_PER_CATEGORY}+ resolved topics are required")
else:
    # Perform ANOVA
    f_stat, p_value = stats.f_oneway(*categories.values())

    print("\n=== ANOVA RESULTS ===")
    print(f"F-statistic: {f_stat:.4f}")
    # Significant-digit formatting: fixed 6 decimals would print very small
    # p-values as 0.000000.
    print(f"P-value: {p_value:.3g}")

    if np.isnan(p_value):
        # A NaN comparison is always False, which would otherwise be reported
        # as "no significant effect".
        print("\n? ANOVA is inconclusive (test statistic is undefined for this data)")
    elif p_value < ALPHA:
        print(f"\n✓ Category significantly affects resolution time (p < {ALPHA})")
        print("Conclusion: We need to control for category in causal analysis")
    else:
        print(f"\n✗ No significant category effect (p >= {ALPHA})")

## 4. Visualizations

In [None]:
# Create comprehensive 2x2 overview figure.
# Uses `resolved` (topics with a resolution time) from the ANOVA cell and the
# full `df` for the views histogram and the resolution rates.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Resolution time distribution, with the median marked
resolution_hours = resolved['time_to_resolution_hours']
median_hours = resolution_hours.median()  # computed once, used for line and label
resolution_hours.hist(bins=50, edgecolor='black', alpha=0.7, ax=axes[0, 0])
axes[0, 0].axvline(median_hours, color='red',
                    linestyle='--', linewidth=2, label=f"Median: {median_hours:.1f}h")
axes[0, 0].set_xlabel('Time to Resolution (hours)', fontsize=12)
axes[0, 0].set_ylabel('Frequency', fontsize=12)
axes[0, 0].set_title('Distribution of Resolution Times', fontsize=14, fontweight='bold')
axes[0, 0].legend()

# 2. CUDA vs Non-CUDA boxplot
cuda_resolved = resolved[resolved['is_cuda_related']]
non_cuda_resolved = resolved[~resolved['is_cuda_related']]

axes[0, 1].boxplot([non_cuda_resolved['time_to_resolution_hours'],
                     cuda_resolved['time_to_resolution_hours']])
# Tick labels are set separately: the `labels=` keyword of boxplot is
# deprecated in recent Matplotlib, while set_xticklabels works on old and new.
axes[0, 1].set_xticklabels(['Non-CUDA', 'CUDA'])
axes[0, 1].set_ylabel('Time to Resolution (hours)', fontsize=12)
axes[0, 1].set_title('Resolution Time: CUDA vs Non-CUDA', fontsize=14, fontweight='bold')
axes[0, 1].grid(axis='y', alpha=0.3)

# 3. Views distribution (counts on a log-scaled y-axis; views axis stays linear)
axes[1, 0].hist(df['views'], bins=50, edgecolor='black', alpha=0.7)
axes[1, 0].set_xlabel('Views', fontsize=12)
axes[1, 0].set_ylabel('Frequency', fontsize=12)
axes[1, 0].set_title('Distribution of Views', fontsize=14, fontweight='bold')
axes[1, 0].set_yscale('log')

# 4. Resolution rate comparison
resolution_rates = df.groupby('is_cuda_related')['is_resolved'].mean()
# Derive labels and colors from the group keys so each bar is labelled by what
# it actually is, and a dataset with only one of the groups still plots.
group_labels = ['CUDA' if is_cuda else 'Non-CUDA' for is_cuda in resolution_rates.index]
group_colors = ['coral' if is_cuda else 'skyblue' for is_cuda in resolution_rates.index]
bars = axes[1, 1].bar(group_labels, resolution_rates.values,
                       color=group_colors, edgecolor='black', linewidth=1.5)
axes[1, 1].set_ylabel('Resolution Rate', fontsize=12)
axes[1, 1].set_title('Resolution Rate: CUDA vs Non-CUDA', fontsize=14, fontweight='bold')
axes[1, 1].set_ylim(0, 1)

# Add percentages on bars, slightly above the top of each bar
for bar, val in zip(bars, resolution_rates.values):
    height = bar.get_height()
    axes[1, 1].text(bar.get_x() + bar.get_width()/2., height + 0.02,
                     f'{val*100:.1f}%', ha='center', fontweight='bold', fontsize=11)

plt.tight_layout()
plt.show()

In [None]:
# Scatter plot: Views vs Resolution Time
# Uses `resolved` (topics with a resolution time) from the ANOVA cell; each
# point is one topic, colored by whether it is CUDA-related.
fig, ax = plt.subplots(figsize=(12, 7))
colors = resolved['is_cuda_related'].map({True: 'coral', False: 'skyblue'})
ax.scatter(resolved['views'], resolved['time_to_resolution_hours'],
           c=colors, alpha=0.4, edgecolors='black', linewidth=0.3, s=30)
ax.set_xlabel('Views (log scale)', fontsize=12)
ax.set_ylabel('Time to Resolution (hours)', fontsize=12)
ax.set_title('Views vs Resolution Time', fontsize=14, fontweight='bold')
ax.set_xscale('log')

# Legend: the points are drawn in a single scatter call with per-point colors,
# so the legend entries are built by hand from colored patches.
# `Patch` comes from the notebook's import cell.
legend_elements = [
    Patch(facecolor='skyblue', edgecolor='black', label='Non-CUDA'),
    Patch(facecolor='coral', edgecolor='black', label='CUDA')
]
ax.legend(handles=legend_elements, fontsize=11)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## Key Findings

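> **Status:** the figures below are placeholders and working hypotheses. Replace them with the actual values once the notebook has been run end-to-end on the processed dataset.

1. **CUDA Prevalence**: X% of questions are CUDA-related *(TODO: fill in from the summary statistics)*
2. **Resolution Time**: CUDA questions take Y hours longer on average (naive comparison, resolved topics only) *(TODO: fill in from the CUDA vs Non-CUDA comparison)*
3. **Confounders Identified** (to be confirmed against the results):
   - CUDA posts get more views (a possible indicator of complexity)
   - Category matters (expected ANOVA result: p < 0.05; report the actual F-statistic and p-value)
   - Code presence correlates with faster resolution (check the sign and size in the correlation matrix)
4. **Next Steps**: The comparisons in this notebook are purely correlational; a causal analysis that controls for these confounders is needed to disentangle correlation from causation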