# Survey Exploratory Data Analysis Report

This notebook performs an automated EDA on the `umfrage.xlsx` file containing survey responses from dental practice staff. All outputs are generated automatically via GitHub Actions.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

# Plot defaults: apply the seaborn "whitegrid" theme first, then set the
# default figure size (in that order, as the theme call comes first here).
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Load and Inspect Survey Data

In [None]:
# Load the survey export; the path is resolved relative to the working directory.
df = pd.read_excel('umfrage.xlsx')

print(f"Survey Dataset Shape: {df.shape[0]} responses and {df.shape[1]} columns")

# Preview only the basic-info columns that actually exist in this export, so a
# renamed or missing column does not abort the report with a KeyError.
# NOTE(review): 'Name' looks like respondent-identifying data and this report is
# generated automatically — confirm it is acceptable to display it.
preview_columns = ['ID', 'Name', 'Startzeit', 'Fertigstellungszeit']
available_preview = [name for name in preview_columns if name in df.columns]
missing_preview = [name for name in preview_columns if name not in df.columns]

if missing_preview:
    print(f"\nBasic info columns not found: {', '.join(missing_preview)}")
if available_preview:
    print("\nFirst 2 responses (basic info only):")
    display(df[available_preview].head(2))

## 2. Data Quality Overview

In [None]:
# Count the columns in each dtype family up front, then print the summary.
n_text_columns = len(df.select_dtypes(include=['object']).columns)
n_numeric_columns = len(df.select_dtypes(include=['int64', 'float64']).columns)
n_datetime_columns = len(df.select_dtypes(include=['datetime64']).columns)

print("Data Types Overview:")
print(f"Total columns: {len(df.columns)}")
print(f"Text columns: {n_text_columns}")
print(f"Numeric columns: {n_numeric_columns}")
print(f"DateTime columns: {n_datetime_columns}")

In [None]:
# Check for missing values in the key columns.
# Only columns present in the sheet are inspected; absent ones are reported
# explicitly instead of raising a KeyError on the selection.
key_columns = ['ID', 'Name', 'Startzeit', 'Fertigstellungszeit']
present_key_columns = [name for name in key_columns if name in df.columns]
missing_key = df[present_key_columns].isnull().sum()
n_responses = len(df)

print("Missing values in key columns:")
for col, missing in missing_key.items():
    # With zero responses there is nothing missing; avoid printing "nan%".
    missing_pct = missing / n_responses * 100 if n_responses else 0.0
    print(f"{col}: {missing} ({missing_pct:.1f}%)")
for col in key_columns:
    if col not in df.columns:
        print(f"{col}: column not found")

In [None]:
# Overall completion rate: share of non-null cells over every cell in the sheet
# (rows x columns).
total_possible_answers = df.size
total_missing = df.isna().sum().sum()
answered_cells = total_possible_answers - total_missing
completion_rate = answered_cells / total_possible_answers * 100

print(f"Overall Survey Completion Rate: {completion_rate:.1f}%")
print(f"Total missing values: {total_missing} out of {total_possible_answers} possible answers")

## 3. Response Analysis

### Survey Timeline

In [None]:
# Parse whichever timestamp columns are present; values that cannot be parsed
# become NaT because of errors='coerce'.
for time_col in ('Startzeit', 'Fertigstellungszeit'):
    if time_col in df.columns:
        df[time_col] = pd.to_datetime(df[time_col], errors='coerce')

# Per-response completion time in minutes (needs both timestamps).
has_both_timestamps = 'Startzeit' in df.columns and 'Fertigstellungszeit' in df.columns
if has_both_timestamps:
    elapsed = df['Fertigstellungszeit'] - df['Startzeit']
    df['completion_duration'] = elapsed.dt.total_seconds() / 60

    durations = df['completion_duration']
    print("Survey Completion Statistics:")
    print(f"Average completion time: {durations.mean():.1f} minutes")
    print(f"Median completion time: {durations.median():.1f} minutes")
    print(f"Fastest completion: {durations.min():.1f} minutes")
    print(f"Slowest completion: {durations.max():.1f} minutes")

### Professional Background Analysis

In [None]:
# Find the role/position column: the first header containing the German
# question fragment. Headers are stringified so non-text headers (numbers,
# dates) in the sheet cannot raise a TypeError during the search.
role_col = next(
    (col for col in df.columns if 'Rolle in der Praxis' in str(col)),
    None,
)

if role_col is not None:
    print("Roles mentioned in responses:")
    roles = df[role_col].dropna()
    for i, role in enumerate(roles, 1):
        # Cells may hold non-string values; convert before slicing/measuring.
        role_text = str(role)
        # Long answers are truncated to 100 characters with a trailing ellipsis.
        print(f"Response {i}: {role_text[:100]}{'...' if len(role_text) > 100 else ''}")
else:
    print("Role column not found")

### Experience Analysis

In [None]:
# Locate the experience column: first header containing the question fragment,
# or None when no header matches.
experience_col = next(
    (header for header in df.columns if 'Wie lange sind Sie' in header),
    None,
)

if not experience_col:
    print("Experience column not found")
else:
    print("Experience levels:")
    experiences = df[experience_col].dropna()
    # Number the non-empty answers starting from 1.
    for position, answer in enumerate(experiences, start=1):
        print(f"Response {position}: {answer}")

### Communication Channels Analysis

In [None]:
# Locate the communication-channels column by its question fragment; stays
# None when no header contains it.
comm_col = next(
    (header for header in df.columns if 'Über welche Kanäle kontaktieren' in header),
    None,
)

if not comm_col:
    print("Communication channels column not found")
else:
    print("Communication channels used:")
    channels = df[comm_col].dropna()
    # List every non-empty answer with a 1-based running number.
    for position, answer in enumerate(channels, start=1):
        print(f"Response {position}: {answer}")

### Software Usage Analysis

In [None]:
# Find the practice-management-software column: first header containing the
# keyword. Headers are stringified so non-text headers cannot raise.
software_col = next(
    (col for col in df.columns if 'Praxisverwaltungssoftware' in str(col)),
    None,
)

if software_col is not None:
    print("Practice Management Software used:")
    software = df[software_col].dropna()
    for i, sw in enumerate(software, 1):
        print(f"Response {i}: {sw}")

    # Simple analysis of mentioned software.
    # Each product is checked independently, so a response naming several
    # products counts once for each of them (an if/elif chain would only
    # credit the first match).
    known_software = {'zawin': 'ZaWin', 'charly': 'Charly', 'evident': 'Evident'}
    software_mentions = {}
    for sw in software:
        # Cells may hold non-string values; convert before lower-casing.
        sw_lower = str(sw).lower()
        for keyword, label in known_software.items():
            if keyword in sw_lower:
                software_mentions[label] = software_mentions.get(label, 0) + 1

    if software_mentions:
        print("\nSoftware usage summary:")
        for sw, count in software_mentions.items():
            print(f"{sw}: {count} mention(s)")
else:
    print("Software column not found")

## 4. Key Challenges and Pain Points

In [None]:
# Find the challenge / pain-point columns: any header containing one of the
# keywords (case-insensitive). Headers are stringified so non-text headers in
# the sheet cannot raise during the search.
challenge_keywords = ['störfaktoren', 'herausforderungen', 'mühsam', 'zeitaufwendig']
challenge_cols = [
    col for col in df.columns
    if any(keyword in str(col).lower() for keyword in challenge_keywords)
]

print(f"Found {len(challenge_cols)} columns related to challenges:")
for i, col in enumerate(challenge_cols, 1):
    col_label = str(col)
    # Headers are full survey questions; truncate to 80 characters.
    print(f"{i}. {col_label[:80]}{'...' if len(col_label) > 80 else ''}")

    # Show responses for this challenge
    responses = df[col].dropna()
    if len(responses) > 0:
        print("   Responses:")
        for j, response in enumerate(responses, 1):
            # Answers may be numeric or dates rather than text; convert before
            # slicing so the preview never raises a TypeError.
            response_text = str(response)
            print(f"   Response {j}: {response_text[:150]}{'...' if len(response_text) > 150 else ''}")
    print()

## 5. Digital Transformation Insights

In [None]:
# Find digital-transformation related columns by keyword in the header
# (case-insensitive). The short keyword 'ki' (presumably the German
# abbreviation for AI) is matched only as a whole word: as a plain substring
# it also hits unrelated headers such as "Kieferorthopädie".
digital_pattern = r'digital|online|automatisch|assistent|\bki\b'
# Stringify headers so non-text headers cannot break the .str accessor.
header_text = pd.Index(df.columns).astype(str)
is_digital = header_text.str.contains(digital_pattern, case=False, regex=True)
digital_cols = [col for col, matched in zip(df.columns, is_digital) if matched]

print(f"Found {len(digital_cols)} columns related to digital transformation:")
for i, col in enumerate(digital_cols, 1):
    col_label = str(col)
    # Headers are full survey questions; truncate to 80 characters.
    print(f"{i}. {col_label[:80]}{'...' if len(col_label) > 80 else ''}")

    # Show responses
    responses = df[col].dropna()
    if len(responses) > 0:
        print("   Responses:")
        for j, response in enumerate(responses, 1):
            # Answers may be numeric (e.g. ratings) rather than text; convert
            # before slicing so the preview never raises a TypeError.
            response_text = str(response)
            print(f"   Response {j}: {response_text[:150]}{'...' if len(response_text) > 150 else ''}")
    print()