In [2]:
# Simplified Pandas and NumPy Analysis
# ====================================

## 1. Setup and Imports

import numpy as np  # NumPy for numerical operations and array manipulation
import pandas as pd  # Pandas for data manipulation and analysis

# Visible confirmation that the setup cell ran and both imports succeeded.
print("Libraries imported successfully!")



Libraries imported successfully!


In [3]:

## 2. Creating Sample Survey Data

# Seed the legacy global NumPy generator so the synthetic draws below are
# identical on every run.
np.random.seed(42)

# Free-text survey feedback, one string per respondent.
responses = [
    "The service at this clinic was excellent. Staff was very responsive and caring.",
    "Wait times were too long. I had to sit for over an hour before seeing a doctor.",
    "Doctors were knowledgeable but the facility needs updating.",
    "I appreciate how the staff explained everything clearly to me.",
    "The parking situation is terrible. Had to walk far with my injured leg.",
    "Very clean facility and professional staff. Would recommend to others.",
    "Communication could be improved. I wasn't notified about my appointment change.",
    "The children's play area was a nice touch while waiting for our appointment.",
    "Billing department made multiple errors and was difficult to reach.",
    "The new patient portal is confusing and hard to navigate."
]

# Categories the random demographic attributes are sampled from.
AGE_GROUP_OPTIONS = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
GENDER_OPTIONS = ['Female', 'Male', 'Other', 'Prefer not to say']
VISIT_TYPE_OPTIONS = ['Primary Care', 'Specialist', 'Emergency', 'Routine Checkup']

# One metadata record per response. The draws inside the dict literal are
# evaluated top to bottom (age group, gender, visit type, score), and that
# order must not change: it determines which random numbers each field gets.
demographics = []
for response_number, _ in enumerate(responses, start=1):
    demographics.append({
        'response_id': response_number,
        'age_group': np.random.choice(AGE_GROUP_OPTIONS),
        'gender': np.random.choice(GENDER_OPTIONS),
        'visit_type': np.random.choice(VISIT_TYPE_OPTIONS),
        'satisfaction_score': np.random.randint(1, 11),  # upper bound exclusive -> scores 1..10
    })

# Two frames sharing the 1-based 'response_id' key: feedback text and metadata.
df_responses = pd.DataFrame(
    list(enumerate(responses, start=1)),
    columns=['response_id', 'feedback_text'],
)
df_demographics = pd.DataFrame(demographics)

print("Sample data created!")
print(f"Number of responses: {len(df_responses)}")
print("\nSample responses:")
print(df_responses.head(3))

Sample data created!
Number of responses: 10

Sample responses:
   response_id                                      feedback_text
0            1  The service at this clinic was excellent. Staf...
1            2  Wait times were too long. I had to sit for ove...
2            3  Doctors were knowledgeable but the facility ne...


In [4]:
## 3. NumPy Fundamentals with Data Preparation

# Copy the score column out of the DataFrame into a standalone NumPy array.
satisfaction_scores = df_demographics['satisfaction_score'].to_numpy(copy=True)

print("\nSatisfaction Scores (first 10):", satisfaction_scores[:10])
print("Shape:", satisfaction_scores.shape)
print("Data Type:", satisfaction_scores.dtype)

# Descriptive statistics, printed as label/value pairs.
# Note: NumPy's std() defaults to ddof=0, i.e. the population standard deviation.
print("\nBasic Statistics with NumPy:")
summary_stats = [
    ("Mean satisfaction:", satisfaction_scores.mean()),
    ("Median satisfaction:", np.median(satisfaction_scores)),
    ("Standard deviation:", satisfaction_scores.std()),
    ("Min score:", satisfaction_scores.min()),
    ("Max score:", satisfaction_scores.max()),
]
for stat_label, stat_value in summary_stats:
    print(stat_label, stat_value)

# Frequency distribution: np.unique returns the sorted distinct scores and,
# with return_counts=True, how many times each one occurs.
unique_scores, counts = np.unique(satisfaction_scores, return_counts=True)
print("\nFrequency Distribution:")
for position in range(unique_scores.size):
    print(f"Score {unique_scores[position]}: {counts[position]} responses")

# Quartiles plus the 90th percentile, computed in a single call.
percentiles = np.percentile(satisfaction_scores, [25, 50, 75, 90])
percentile_labels = [
    "25th percentile",
    "50th percentile (median)",
    "75th percentile",
    "90th percentile",
]
print("\nPercentiles:")
for percentile_label, percentile_value in zip(percentile_labels, percentiles):
    print(f"{percentile_label}: {percentile_value}")

# Boolean-mask filtering: keep only the scores at either end of the scale.
is_high = satisfaction_scores >= 8
is_low = satisfaction_scores <= 3
high_scores = satisfaction_scores[is_high]
low_scores = satisfaction_scores[is_low]

high_total = len(high_scores)
print(f"\nNumber of high scores (8-10): {high_total}")
print(f"Number of low scores (1-3): {len(low_scores)}")
print(f"Percentage of high scores: {high_total/len(satisfaction_scores)*100:.1f}%")


Satisfaction Scores (first 10): [ 8 10  8  8  2  2 10  9 10  3]
Shape: (10,)
Data Type: int64

Basic Statistics with NumPy:
Mean satisfaction: 7.0
Median satisfaction: 8.0
Standard deviation: 3.1622776601683795
Min score: 2
Max score: 10

Frequency Distribution:
Score 2: 2 responses
Score 3: 1 responses
Score 8: 3 responses
Score 9: 1 responses
Score 10: 3 responses

Percentiles:
25th percentile: 4.25
50th percentile (median): 8.0
75th percentile: 9.75
90th percentile: 10.0

Number of high scores (8-10): 7
Number of low scores (1-3): 3
Percentage of high scores: 70.0%


In [5]:
## 4. Pandas Data Manipulation and Analysis

# Merging our response and demographic DataFrames on their shared key.
# validate='one_to_one' makes pandas raise a MergeError if 'response_id' is
# duplicated on either side, instead of silently multiplying rows.
df_merged = pd.merge(
    df_responses,
    df_demographics,
    on='response_id',
    validate='one_to_one',
)
print("\nMerged DataFrame (first 3 rows):")
print(df_merged.head(3))

# Basic exploratory data analysis with Pandas
print("\nDataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df_merged.info()

print("\nSummary Statistics:")
print(df_merged.describe(include='all'))

# Group analysis with Pandas.
# The same four aggregates are reported for every grouping. 'std' is the
# sample standard deviation (ddof=1), so a group with one response shows NaN.
satisfaction_stats = ['mean', 'median', 'count', 'std']

print("\nSatisfaction by Visit Type:")
visit_satisfaction = df_merged.groupby('visit_type')['satisfaction_score'].agg(satisfaction_stats)
print(visit_satisfaction.sort_values('mean', ascending=False))

print("\nSatisfaction by Age Group:")
age_satisfaction = df_merged.groupby('age_group')['satisfaction_score'].agg(satisfaction_stats)
print(age_satisfaction.sort_values('mean', ascending=False))

# Gender results are shown in group-key order rather than sorted by mean.
print("\nSatisfaction by Gender:")
gender_satisfaction = df_merged.groupby('gender')['satisfaction_score'].agg(satisfaction_stats)
print(gender_satisfaction)

# Create satisfaction categories for analysis
def categorize_satisfaction(score):
    """Map a numeric satisfaction score to its category label.

    Args:
        score: A value comparable with integers (e.g. a 1-10 rating).

    Returns:
        'High (8-10)' when score >= 8, 'Medium (5-7)' when score >= 5,
        otherwise 'Low (1-4)'. Any value that meets neither threshold,
        including ones below 1, falls through to the low label.
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for minimum, label in ((8, 'High (8-10)'), (5, 'Medium (5-7)')):
        if score >= minimum:
            return label
    return 'Low (1-4)'

# Label every response with its satisfaction band (adds a column to df_merged).
score_column = df_merged['satisfaction_score']
df_merged['satisfaction_category'] = score_column.apply(categorize_satisfaction)

# Cross-tabulation with Pandas
print("\nCross-tabulation of Visit Type and Satisfaction Category:")

# normalize='index' turns each row into fractions that sum to 1 per visit type.
row_fractions = pd.crosstab(
    index=df_merged['visit_type'],
    columns=df_merged['satisfaction_category'],
    normalize='index',
)
# Scale the row fractions to percentages.
visit_sat_crosstab = row_fractions * 100

# Display with one decimal place; the stored table keeps full precision.
print(visit_sat_crosstab.round(1))



Merged DataFrame (first 3 rows):
   response_id                                      feedback_text age_group  \
0            1  The service at this clinic was excellent. Staf...     45-54   
1            2  Wait times were too long. I had to sit for ove...     55-64   
2            3  Doctors were knowledgeable but the facility ne...     35-44   

   gender visit_type  satisfaction_score  
0  Female  Emergency                   8  
1  Female  Emergency                  10  
2   Other  Emergency                   8  

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   response_id         10 non-null     int64 
 1   feedback_text       10 non-null     object
 2   age_group           10 non-null     object
 3   gender              10 non-null     object
 4   visit_type          10 non-null     object
 5   satisfaction_sco

In [6]:
## 5. Summary of Analysis

total_responses = len(df_merged)
average_score = df_merged['satisfaction_score'].mean()

print("\n=== INSIGHTS SUMMARY ===")
print(f"Total Responses Analyzed: {total_responses}")
print(f"Average Satisfaction Score: {average_score:.2f}/10")

# value_counts() orders each tally from most to least frequent.
satisfaction_counts = df_merged['satisfaction_category'].value_counts()
visit_counts = df_merged['visit_type'].value_counts()

# Both distributions use the same "label: count (share%)" layout, so a single
# loop prints them one after the other.
distributions = (
    ("\nSatisfaction Distribution:", satisfaction_counts),
    ("\nVisit Type Distribution:", visit_counts),
)
for heading, tally in distributions:
    print(heading)
    for label, count in tally.items():
        percentage = count / total_responses * 100
        print(f"  {label}: {count} responses ({percentage:.1f}%)")

print("\nAnalysis Complete!")


=== INSIGHTS SUMMARY ===
Total Responses Analyzed: 10
Average Satisfaction Score: 7.00/10

Satisfaction Distribution:
  High (8-10): 7 responses (70.0%)
  Low (1-4): 3 responses (30.0%)

Visit Type Distribution:
  Emergency: 4 responses (40.0%)
  Routine Checkup: 3 responses (30.0%)
  Primary Care: 2 responses (20.0%)
  Specialist: 1 responses (10.0%)

Analysis Complete!
