In [17]:
import re
import pandas as pd
import nltk
from transformers import pipeline
from collections import defaultdict , Counter
import csv
import matplotlib.pyplot as plt
import seaborn as sns
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
class FastHealthcareAnalyzer:

    def __init__(self):

        self.entity_keywords = {
            'doctor': ['dr.', 'dr ', 'doctor'],
            'nurse': ['nurse', 'nurses', 'nursing'],
            'facility': ['medical center', 'hospital', 'clinic', 'care facility', 
                        'surgical institute', 'health clinic', 'campus'],
            'surgery': ['removal', 'repair', 'fusion', 'ablation', 'biopsy', 'correction', 
                       'surgery', 'procedure'],
            'appointment': ['appointment', 'check in', 'check-in'],
            'parking': ['parking', 'park', 'parked']
        }

        self.positive_patterns = [
            'smooth', 'clear', 'comfortable', 'helpful', 'professional', 
            'well managed', 'easy', 'good', 'great', 'excellent',
            'acceptable but unremarkable', 'met basic expectations'
        ]

        self.negative_patterns = [
            'confused', 'frustrated', 'disorganized', 'annoyed', 'poorly',
            'unclear', 'confusing', 'late', 'rescheduled', 'inconsistent',
            'contradicts', 'several things went wrong', 'without explanation'
        ]

        self.neutral_patterns = [
            'nothing stood out', 'hard to judge', 'not sure', 'maybe',
            'could just be bad luck', 'acceptable but unremarkable',
            'met basic expectations', 'neither good nor bad'
        ]

    def extract_entities_fast(self, text):
        entities = []
        text_lower = text.lower()

        doctor_pattern = r'dr\.?\s+([a-z]+(?:\s+[a-z]+)?)'
        doctor_matches = re.finditer(doctor_pattern, text_lower)
        for match in doctor_matches:
            name = match.group(1).strip().title()
            entities.append(('doctor', f'Dr. {name}'))

        for keyword in self.entity_keywords['facility']:
            pattern = rf'(?:at|to|from)\s+([a-z\s]*{keyword})'
            matches = re.findall(pattern, text_lower)
            for facility in matches:
                if facility.strip():
                    entities.append(('facility', facility.strip().title()))

        for keyword in self.entity_keywords['surgery']:
            pattern = rf'(?:for|underwent|had)\s+([a-z\s]*{keyword})'
            matches = re.findall(pattern, text_lower)
            for surgery in matches:
                if surgery.strip():
                    entities.append(('surgery', surgery.strip()))

        if 'nurse' in text_lower or 'nursing' in text_lower:
            if 'one nurse' in text_lower or 'a nurse' in text_lower:
                entities.append(('nurse', 'nurse (individual)'))
            else:
                entities.append(('nurse', 'nursing staff'))

        if 'appointment' in text_lower:
            entities.append(('appointment', 'appointment'))

        if any(word in text_lower for word in ['parking', 'park', 'parked']):
            entities.append(('parking', 'parking'))

        unique_entities = []
        seen = set()
        for entity_type, entity_name in entities:
            key = f"{entity_type}:{entity_name}"
            if key not in seen:
                seen.add(key)
                unique_entities.append((entity_type, entity_name))

        return unique_entities

    def get_sentiment_fast(self, text, entity_context=None):

        if entity_context:
            text = entity_context

        text_lower = text.lower()

        for pattern in self.neutral_patterns:
            if pattern in text_lower:
                return 'neutral'

        pos_score = sum(1 for p in self.positive_patterns if p in text_lower)
        neg_score = sum(2 for p in self.negative_patterns if p in text_lower)  

        if any(neg in text_lower for neg in ['not good', 'not great', 'not helpful']):
            neg_score += 2

        if pos_score > neg_score:
            return 'positive'
        elif neg_score > pos_score:
            return 'negative'
        else:
            return 'neutral'

    def analyze_feedback_fast(self, feedback_id, feedback_text):
        if pd.isna(feedback_text) or str(feedback_text).strip() == '':
            return []

        overall_sentiment = self.get_sentiment_fast(feedback_text)

        entities = self.extract_entities_fast(feedback_text)

        results = []
        if entities:
            for entity_type, entity_name in entities:

                entity_sentiment = self.get_sentiment_fast(feedback_text)

                if entity_type == 'doctor':
                    if 'dr' in feedback_text.lower() and any(neg in feedback_text.lower() for neg in ['confused', 'frustrated', 'disorganized']):
                        entity_sentiment = 'negative'
                elif entity_type == 'parking':
                    if 'easy' in feedback_text.lower() and 'park' in feedback_text.lower():
                        entity_sentiment = 'positive'
                    elif any(neg in feedback_text.lower() for neg in ['far', 'longer', 'unclear', 'confusing']):
                        entity_sentiment = 'negative'
                elif entity_type == 'appointment':
                    if 'smooth' in feedback_text.lower():
                        entity_sentiment = 'positive'
                    elif any(neg in feedback_text.lower() for neg in ['rescheduled', 'late', 'confusing']):
                        entity_sentiment = 'negative'
                elif entity_type == 'nurse':
                    if 'helpful' in feedback_text.lower():
                        entity_sentiment = 'positive'
                    elif 'annoyed' in feedback_text.lower():
                        entity_sentiment = 'negative'

                results.append({
                    'feedback_id': feedback_id,
                    'overall_sentiment': overall_sentiment,
                    'entity': entity_name,
                    'entity_type': entity_type,
                    'entity_sentiment': entity_sentiment
                })
        else:

            results.append({
                'feedback_id': feedback_id,
                'overall_sentiment': overall_sentiment,
                'entity': 'no_entity',
                'entity_type': 'none',
                'entity_sentiment': 'none'
            })

        return results

In [19]:
def process_fast(input_file, output_file):

    print(f"Reading {input_file}...")
    df = pd.read_csv(input_file)

    analyzer = FastHealthcareAnalyzer()
    all_results = []

    print(f"Processing {len(df)} records...")
    for idx, row in df.iterrows():
        results = analyzer.analyze_feedback_fast(row['feedback_id'], row['feedback_text'])
        all_results.extend(results)

        if (idx + 1) % 100 == 0:
            print(f"Processed {idx + 1} records...")

    output_df = pd.DataFrame(all_results)

    output_df.to_csv(output_file, index=False)
    print(f"\n✓ Analysis complete! Saved to {output_file}")

    print("\n" + "="*50)
    print("QUICK SUMMARY")
    print("="*50)
    # print("\nOverall Sentiment:")
    print(output_df['overall_sentiment'].value_counts())
    print("\nEntity Types:")
    print(output_df['entity_type'].value_counts())
    print("\nEntity Sentiment:")
    print(output_df['entity_sentiment'].value_counts())

    return output_df

In [20]:
if __name__ == "__main__":

    print("="*60)
    print("HEALTHCARE FEEDBACK ANALYSIS - FAST VERSION")
    print("="*60)

    results = process_fast("C:/Users/HP/Downloads/healthcare_feedback.csv", "healthcare_feedback_results_fast.csv")

    print("\n" + "="*50)
    print("ENTITY SENTIMENT BY TYPE")
    print("="*50)

    pivot = pd.crosstab(
        results[results['entity_type'] != 'none']['entity_type'],
        results[results['entity_type'] != 'none']['entity_sentiment']
    )
    print(pivot)

    doctor_results = results[results['entity_type'] == 'doctor']
    if not doctor_results.empty:
        print("\n" + "="*50)
        print("DOCTOR SENTIMENT SUMMARY")
        print("="*50)
        doctor_summary = doctor_results.groupby('entity')['entity_sentiment'].value_counts().unstack().fillna(0)
        doctor_summary['total'] = doctor_summary.sum(axis=1)
        doctor_summary = doctor_summary.sort_values('total', ascending=False)
        print(doctor_summary.head(10))

    facility_results = results[results['entity_type'] == 'facility']
    if not facility_results.empty:
        print("\n" + "="*50)
        print("FACILITY SENTIMENT SUMMARY")
        print("="*50)
        facility_summary = facility_results.groupby('entity')['entity_sentiment'].value_counts().unstack().fillna(0)
        facility_summary['total'] = facility_summary.sum(axis=1)
        facility_summary = facility_summary.sort_values('total', ascending=False)
        print(facility_summary.head(10))

HEALTHCARE FEEDBACK ANALYSIS - FAST VERSION
Reading C:/Users/HP/Downloads/healthcare_feedback.csv...


Processing 999 records...
Processed 100 records...
Processed 200 records...
Processed 300 records...
Processed 400 records...
Processed 500 records...
Processed 600 records...
Processed 700 records...
Processed 800 records...
Processed 900 records...

✓ Analysis complete! Saved to healthcare_feedback_results_fast.csv

QUICK SUMMARY
overall_sentiment
neutral     1997
negative    1019
positive     310
Name: count, dtype: int64

Entity Types:
entity_type
doctor         720
surgery        682
facility       621
appointment    600
nurse          392
parking        302
none             9
Name: count, dtype: int64

Entity Sentiment:
entity_sentiment
negative    1437
neutral     1338
positive     542
none           9
Name: count, dtype: int64

ENTITY SENTIMENT BY TYPE
entity_sentiment  negative  neutral  positive
entity_type                                  
appointment            460        0       140
doctor                 262      391        67
facility               188      371        62

In [22]:
dfr = pd.read_csv('healthcare_feedback_results_fast.csv')

In [23]:
df = pd.read_csv("C:/Users/HP/Downloads/healthcare_feedback.csv")
results = pd.merge(df,dfr , on='feedback_id',how='left')

In [24]:
results.head()

Unnamed: 0,feedback_id,feedback_text,overall_sentiment,entity,entity_type,entity_sentiment
0,500,"I met Dr. Alan Moore, for laparoscopic gallbla...",neutral,Dr. Alan Moore,doctor,negative
1,500,"I met Dr. Alan Moore, for laparoscopic gallbla...",neutral,laparoscopic gallbladder removal,surgery,neutral
2,500,"I met Dr. Alan Moore, for laparoscopic gallbla...",neutral,nursing staff,nurse,neutral
3,500,"I met Dr. Alan Moore, for laparoscopic gallbla...",neutral,appointment,appointment,negative
4,145,"I met Dr. Riya Patel, at Riverside Health Clin...",negative,Dr. Riya Patel,doctor,negative


In [None]:
# def generate_insights_summary(results_file, output_file="healthcare_insights_summary.txt"):
    
#     df = pd.read_csv(results_file)
    
#     with open(output_file, 'w') as f:
#         f.write("="*80 + "\n")
#         f.write("HEALTHCARE FEEDBACK ANALYSIS - KEY INSIGHTS\n")
#         f.write("="*80 + "\n\n")
        
        
#         f.write("OVERALL STATISTICS:\n")
#         f.write("-"*40 + "\n")
#         total_feedback = df['feedback_id'].nunique()
#         f.write(f"Total Feedback Records: {total_feedback}\n")
#         f.write(f"Total Entity Mentions: {len(df[df['entity_type'] != 'none'])}\n\n")
        

#         f.write("SENTIMENT DISTRIBUTION:\n")
#         f.write("-"*40 + "\n")
#         sentiment_dist = df[df['overall_sentiment'] != 'none']['overall_sentiment'].value_counts()
#         for sentiment, count in sentiment_dist.items():
#             pct = (count / total_feedback) * 100
#             f.write(f"{sentiment.upper()}: {count} ({pct:.1f}%)\n")
#         f.write("\n")
        

#         f.write("ENTITY BREAKDOWN:\n")
#         f.write("-"*40 + "\n")
#         entity_counts = df[df['entity_type'] != 'none']['entity_type'].value_counts()
#         for entity, count in entity_counts.items():
#             f.write(f"{entity.upper()}: {count} mentions\n")
#         f.write("\n")
        

#         doctors_df = df[df['entity_type'] == 'doctor']
#         if not doctors_df.empty:
#             f.write("DOCTOR PERFORMANCE:\n")
#             f.write("-"*40 + "\n")
#             positive_doctors = doctors_df[doctors_df['entity_sentiment'] == 'positive']['entity'].value_counts()
#             f.write("Top 5 Doctors with Positive Feedback:\n")
#             for doctor, count in positive_doctors.head(5).items():
#                 f.write(f"  • {doctor}: {count} positive mentions\n")
            

#             negative_doctors = doctors_df[doctors_df['entity_sentiment'] == 'negative']['entity'].value_counts()
#             f.write("\nTop 5 Doctors with Negative Feedback:\n")
#             for doctor, count in negative_doctors.head(5).items():
#                 f.write(f"  • {doctor}: {count} negative mentions\n")
#             f.write("\n")
        

#         facilities_df = df[df['entity_type'] == 'facility']
#         if not facilities_df.empty:
#             f.write("FACILITY PERFORMANCE:\n")
#             f.write("-"*40 + "\n")
            
#             facility_sentiment = facilities_df.groupby('entity')['entity_sentiment'].value_counts().unstack(fill_value=0)
#             facility_sentiment['positive_ratio'] = facility_sentiment.get('positive', 0) / facility_sentiment.sum(axis=1)
#             top_facilities = facility_sentiment.sort_values('positive_ratio', ascending=False).head(5)
            
#             f.write("Top 5 Facilities by Positive Feedback Ratio:\n")
#             for facility, row in top_facilities.iterrows():
#                 f.write(f"  • {facility}: {row.get('positive', 0)} positive, {row.get('negative', 0)} negative\n")
#             f.write("\n")
        

#         f.write("KEY IMPROVEMENT AREAS:\n")
#         f.write("-"*40 + "\n")
        

#         parking_df = df[df['entity_type'] == 'parking']
#         if not parking_df.empty:
#             parking_negative = len(parking_df[parking_df['entity_sentiment'] == 'negative'])
#             parking_total = len(parking_df)
#             if parking_total > 0:
#                 f.write(f"• Parking: {parking_negative}/{parking_total} mentions negative ({parking_negative/parking_total*100:.1f}%)\n")
        

#         appt_df = df[df['entity_type'] == 'appointment']
#         if not appt_df.empty:
#             appt_negative = len(appt_df[appt_df['entity_sentiment'] == 'negative'])
#             appt_total = len(appt_df)
#             if appt_total > 0:
#                 f.write(f"• Appointments: {appt_negative}/{appt_total} mentions negative ({appt_negative/appt_total*100:.1f}%)\n")
        

#         nurse_df = df[df['entity_type'] == 'nurse']
#         if not nurse_df.empty:
#             nurse_negative = len(nurse_df[nurse_df['entity_sentiment'] == 'negative'])
#             nurse_total = len(nurse_df)
#             if nurse_total > 0:
#                 f.write(f"• Nursing Staff: {nurse_negative}/{nurse_total} mentions negative ({nurse_negative/nurse_total*100:.1f}%)\n")
        
#         f.write("\n" + "="*80 + "\n")
#         f.write("END OF REPORT\n")
#         f.write("="*80 + "\n")
    
#     print(f"\n✓ Insights summary saved to {output_file}")


# # Run the complete pipeline
# if __name__ == "__main__":
#     # 1. to rtun the fast analysis
#     results = process_fast("C:/Users/HP/Downloads/healthcare_feedback.csv", "healthcare_feedback_analysis_results.csv")

#     generate_insights_summary("healthcare_feedback_analysis_results.csv")
    
#     print("\n" + "="*80)
#     print("Completed")
#     print("="*80)
#     print("\nOutput files generated:")
#     print("  1. healthcare_feedback_analysis_results.csv - Detailed entity-level results")
#     print("  2. healthcare_analysis_dashboard.png - Main visualization dashboard")
#     print("  3. healthcare_analysis_detailed_analysis.png - Detailed entity analysis")
#     print("  4. healthcare_insights_summary.txt - Text summary of key insights")

Reading C:/Users/HP/Downloads/healthcare_feedback.csv...
Processing 999 records...
Processed 100 records...
Processed 200 records...
Processed 300 records...
Processed 400 records...
Processed 500 records...
Processed 600 records...
Processed 700 records...
Processed 800 records...
Processed 900 records...

✓ Analysis complete! Saved to healthcare_feedback_analysis_results.csv

QUICK SUMMARY
overall_sentiment
neutral     1997
negative    1019
positive     310
Name: count, dtype: int64

Entity Types:
entity_type
doctor         720
surgery        682
facility       621
appointment    600
nurse          392
parking        302
none             9
Name: count, dtype: int64

Entity Sentiment:
entity_sentiment
negative    1437
neutral     1338
positive     542
none           9
Name: count, dtype: int64

✓ Insights summary saved to healthcare_insights_summary.txt

Completed

Output files generated:
  1. healthcare_feedback_analysis_results.csv - Detailed entity-level results
  2. healthcare_ana

In [None]:
# def create_visualizations(results_file, output_prefix="healthcare_analysis"):
    

#     plt.style.use('seaborn-v0_8-darkgrid')
#     # sns.set_palette("husl")
    

#     fig, axes = plt.subplots(2, 3, figsize=(18, 12))
#     fig.suptitle('Healthcare Feedback Analysis Dashboard', fontsize=16, fontweight='bold')
    

#     sentiment_counts = df[df['overall_sentiment'] != 'none']['overall_sentiment'].value_counts()
#     axes[0, 0].pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%', 
#                    startangle=90, colors=['#2ecc71', '#e74c3c', '#95a5a6'])
#     axes[0, 0].set_title('Overall Sentiment Distribution', fontweight='bold')
    

#     entity_counts = df[df['entity_type'] != 'none']['entity_type'].value_counts()
#     axes[0, 1].bar(entity_counts.index, entity_counts.values, color=['#3498db', '#9b59b6', '#f1c40f', '#e67e22', '#1abc9c', '#e74c3c'])
#     axes[0, 1].set_title('Entity Types Mentioned', fontweight='bold')
#     axes[0, 1].set_xlabel('Entity Type')
#     axes[0, 1].set_ylabel('Count')
#     axes[0, 1].tick_params(axis='x', rotation=45)
    

#     entity_sentiment = df[df['entity_sentiment'] != 'none']['entity_sentiment'].value_counts()
#     axes[0, 2].bar(entity_sentiment.index, entity_sentiment.values, 
#                    color=['#2ecc71', '#e74c3c', '#95a5a6'])
#     axes[0, 2].set_title('Entity-Level Sentiment Distribution', fontweight='bold')
#     axes[0, 2].set_xlabel('Sentiment')
#     axes[0, 2].set_ylabel('Count')
    
#     # 2. Entity Type by Sentiment Heatmap
#     entity_type_sentiment = pd.crosstab(
#         df[df['entity_type'] != 'none']['entity_type'],
#         df[df['entity_sentiment'] != 'none']['entity_sentiment']
#     )
#     sns.heatmap(entity_type_sentiment, annot=True, fmt='d', cmap='YlOrRd', ax=axes[1, 0])
#     axes[1, 0].set_title('Entity Type × Sentiment Heatmap', fontweight='bold')
#     axes[1, 0].set_xlabel('Sentiment')
#     axes[1, 0].set_ylabel('Entity Type')
    

#     doctors_df = df[df['entity_type'] == 'doctor']
#     if not doctors_df.empty:
#         top_doctors = doctors_df['entity'].value_counts().head(8)
#         colors = ['#3498db' if s == 'positive' else '#e74c3c' for s in 
#                  doctors_df.groupby('entity')['entity_sentiment'].agg(lambda x: x.mode()[0] if not x.mode().empty else 'neutral').head(8)]
#         axes[1, 1].barh(range(len(top_doctors)), top_doctors.values, color=colors)
#         axes[1, 1].set_yticks(range(len(top_doctors)))
#         axes[1, 1].set_yticklabels(top_doctors.index)
#         axes[1, 1].set_title('Top 8 Most Mentioned Doctors', fontweight='bold')
#         axes[1, 1].set_xlabel('Number of Mentions')
    

#     facilities_df = df[df['entity_type'] == 'facility']
#     if not facilities_df.empty:
#         top_facilities = facilities_df['entity'].value_counts().head(8)
#         axes[1, 2].barh(range(len(top_facilities)), top_facilities.values, 
#                         color=['#f39c12', '#16a085', '#27ae60', '#2980b9', '#8e44ad', '#d35400', '#c0392b', '#7f8c8d'][:len(top_facilities)])
#         axes[1, 2].set_yticks(range(len(top_facilities)))
#         axes[1, 2].set_yticklabels([f.split()[0] if len(f.split()) > 1 else f for f in top_facilities.index])
#         axes[1, 2].set_title('Top 8 Most Mentioned Facilities', fontweight='bold')
#         axes[1, 2].set_xlabel('Number of Mentions')
    
#     plt.tight_layout()
#     plt.savefig(f'{output_prefix}_dashboard.png', dpi=300, bbox_inches='tight')
#     plt.show()
    

#     fig, axes = plt.subplots(2, 2, figsize=(14, 10))
#     fig.suptitle('Detailed Entity Analysis', fontsize=16, fontweight='bold')
    

#     if not doctors_df.empty:
#         doctor_sentiment = doctors_df.groupby(['entity', 'entity_sentiment']).size().unstack(fill_value=0)
#         doctor_sentiment['total'] = doctor_sentiment.sum(axis=1)
#         doctor_sentiment = doctor_sentiment.sort_values('total', ascending=False).head(10)
#         doctor_sentiment[['positive', 'negative', 'neutral']].plot(kind='barh', stacked=True, 
#                                                                    color=['#2ecc71', '#e74c3c', '#95a5a6'], ax=axes[0, 0])
#         axes[0, 0].set_title('Top 10 Doctors by Sentiment Breakdown', fontweight='bold')
#         axes[0, 0].set_xlabel('Number of Mentions')
    

#     if not facilities_df.empty:
#         facility_sentiment = facilities_df.groupby(['entity', 'entity_sentiment']).size().unstack(fill_value=0)
#         facility_sentiment['total'] = facility_sentiment.sum(axis=1)
#         facility_sentiment = facility_sentiment.sort_values('total', ascending=False).head(10)
#         facility_sentiment[['positive', 'negative', 'neutral']].plot(kind='barh', stacked=True,
#                                                                      color=['#2ecc71', '#e74c3c', '#95a5a6'], ax=axes[0, 1])
#         axes[0, 1].set_title('Top 10 Facilities by Sentiment Breakdown', fontweight='bold')
#         axes[0, 1].set_xlabel('Number of Mentions')
    

#     surgery_df = df[df['entity_type'] == 'surgery']
#     if not surgery_df.empty:
#         surgery_counts = surgery_df['entity'].value_counts().head(8)
#         surgery_sentiment = surgery_df.groupby('entity')['entity_sentiment'].apply(lambda x: x.mode()[0] if not x.mode().empty else 'neutral')
#         colors = ['#2ecc71' if s == 'positive' else '#e74c3c' if s == 'negative' else '#95a5a6' for s in surgery_sentiment]
#         axes[1, 0].barh(range(len(surgery_counts)), surgery_counts.values, color=colors[:len(surgery_counts)])
#         axes[1, 0].set_yticks(range(len(surgery_counts)))
#         axes[1, 0].set_yticklabels([p[:20] + '...' if len(p) > 20 else p for p in surgery_counts.index])
#         axes[1, 0].set_title('Most Mentioned Procedures/Surgeries', fontweight='bold')
#         axes[1, 0].set_xlabel('Number of Mentions')
    
#     parking_appointment_df = df[df['entity_type'].isin(['parking', 'appointment', 'nurse'])]
#     if not parking_appointment_df.empty:
#         pa_sentiment = parking_appointment_df.groupby(['entity_type', 'entity_sentiment']).size().unstack(fill_value=0)
#         pa_sentiment.plot(kind='bar', color=['#2ecc71', '#e74c3c', '#95a5a6'], ax=axes[1, 1])
#         axes[1, 1].set_title('Sentiment for Parking, Appointments & Nurses', fontweight='bold')
#         axes[1, 1].set_xlabel('Entity Type')
#         axes[1, 1].set_ylabel('Count')
#         axes[1, 1].legend(title='Sentiment')
#         axes[1, 1].tick_params(axis='x', rotation=0)
    
    