# Bangladesh Freedom Fighter Database - Sample Analysis

This notebook demonstrates basic analysis techniques for the Bangladesh Freedom Fighter Database.

## Dataset Overview
- **Records**: 205,280+ freedom fighters
- **Source**: Ministry of Liberation War Affairs, Bangladesh
- **Format**: JSON files with structured data
- **Coverage**: All 64 districts of Bangladesh

In [None]:
# Import required libraries
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
import numpy as np

# Set style for better plots.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*";
# older releases only know the legacy "seaborn" name, so fall back to it
# instead of failing on the newer name.
preferred_style = 'seaborn-v0_8'
plt.style.use(preferred_style if preferred_style in plt.style.available else 'seaborn')
sns.set_palette("husl")

## 1. Loading the Dataset

Let's start by loading a sample of the fighter records.

In [None]:
def load_fighter_data(fighters_dir="fighters", sample_size=1000):
    """Load fighter records from the ``*.json`` files in a directory.

    Args:
        fighters_dir: Directory searched (non-recursively) for ``*.json``
            files.
        sample_size: Maximum number of files to load. A falsy value
            (``None`` or ``0``) loads every file found.

    Returns:
        A list holding the parsed JSON content of each file that could be
        loaded, in file-name order. Files that cannot be read or parsed are
        reported on stdout and skipped.
    """
    # glob() yields entries in an OS-dependent order; sorting makes the
    # sample identical on every run and every machine.
    json_files = sorted(Path(fighters_dir).glob("*.json"))
    print(f"Found {len(json_files)} fighter records")

    # Load sample or all files
    files_to_load = json_files[:sample_size] if sample_size else json_files

    fighters_data = []
    for json_file in files_to_load:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                fighters_data.append(json.load(f))
        except (OSError, ValueError) as e:
            # OSError: the file could not be opened or read.
            # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
            # One bad file must not abort the whole load.
            print(f"Error loading {json_file}: {e}")

    print(f"Successfully loaded {len(fighters_data)} records")
    return fighters_data

# Read up to 5,000 records to use as the working sample for the analysis
sample_limit = 5000
fighters = load_fighter_data(sample_size=sample_limit)

## 2. Data Structure Exploration

In [None]:
# Examine the structure of a single record
if fighters:
    sample_fighter = fighters[0]

    # Show at most the first 1000 characters of the pretty-printed record,
    # and only add the ellipsis when something was actually cut off.
    preview_limit = 1000
    record_json = json.dumps(sample_fighter, ensure_ascii=False, indent=2)
    if len(record_json) > preview_limit:
        record_json = record_json[:preview_limit] + "..."

    print("Sample Fighter Record Structure:")
    print(record_json)

    print("\nTop-level keys:")
    for key, value in sample_fighter.items():
        print(f"- {key}: {type(value)}")

## 3. Basic Statistics

In [None]:
# Convert to DataFrame for easier analysis.
# json_normalize flattens nested dicts into dotted column names
# (e.g. "basic_info.district"), which is how later cells address them.
df = pd.json_normalize(fighters)

print(f"Dataset Shape: {df.shape}")
print(f"\nColumns: {len(df.columns)}")
print("\nColumn Names:")
for col in df.columns:
    print(f"- {col}")

print("\nBasic Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()

## 4. Geographic Distribution Analysis

In [None]:
# Analyze district distribution
district_column = 'basic_info.district'
if district_column in df.columns:
    district_counts = df[district_column].value_counts().head(15)
else:
    # No loaded record carried a district (e.g. nothing was loaded at all).
    # Keep an empty Series so cells that read district_counts later still
    # find it defined.
    district_counts = pd.Series(dtype='int64')

if district_counts.empty:
    print("No district information available - skipping district chart.")
else:
    plt.figure(figsize=(12, 8))
    district_counts.plot(kind='bar')
    plt.title('Top 15 Districts by Number of Freedom Fighters', fontsize=16, pad=20)
    plt.xlabel('District (জেলা)', fontsize=12)
    plt.ylabel('Number of Fighters', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

print("Top 10 Districts:")
for i, (district, count) in enumerate(district_counts.head(10).items(), 1):
    print(f"{i:2d}. {district}: {count:,} fighters")

## 5. Document Type Analysis

In [None]:
# Analyze supporting documents.
# `or []` also covers records where 'prove_documents' is present but null:
# .get(key, []) would return None in that case and the loop would fail.
all_documents = [
    doc.get('document_type', 'Unknown')
    for fighter in fighters
    for doc in (fighter.get('prove_documents') or [])
]

doc_counts = Counter(all_documents)

plt.figure(figsize=(12, 6))
doc_types = list(doc_counts.keys())
doc_values = list(doc_counts.values())

plt.pie(doc_values, labels=doc_types, autopct='%1.1f%%', startangle=90)
plt.title('Distribution of Supporting Document Types', fontsize=16, pad=20)
plt.axis('equal')
plt.show()

print("Document Type Distribution:")
total_documents = len(all_documents)  # constant across the loop below
for doc_type, count in doc_counts.most_common():
    percentage = (count / total_documents) * 100
    print(f"- {doc_type}: {count:,} ({percentage:.1f}%)")

## 6. Family Relationship Analysis

In [None]:
# Analyze heir relationships and photo availability
all_relationships = []
fighters_with_photos = 0
heirs_with_photos = 0

for fighter in fighters:
    # Count fighter photos
    if fighter.get('fighter_photo_url'):
        fighters_with_photos += 1

    # Analyze heir info. `or []` treats a null 'waris_info' like a missing
    # one instead of failing when the loop tries to iterate over None.
    for heir in fighter.get('waris_info') or []:
        relationship = heir.get('relationship', 'Unknown')
        all_relationships.append(relationship)

        if heir.get('photo_url'):
            heirs_with_photos += 1

relationship_counts = Counter(all_relationships)

plt.figure(figsize=(10, 6))
rel_types = list(relationship_counts.keys())
rel_values = list(relationship_counts.values())

plt.bar(rel_types, rel_values)
plt.title('Distribution of Heir Relationships', fontsize=16, pad=20)
plt.xlabel('Relationship Type', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print("Relationship Distribution:")
for relationship, count in relationship_counts.most_common():
    print(f"- {relationship}: {count:,}")

# Guard the percentage so an empty sample reports 0.0% instead of raising
# ZeroDivisionError.
fighter_photo_pct = fighters_with_photos / len(fighters) * 100 if fighters else 0.0

print("\nPhoto Availability:")
print(f"- Fighters with photos: {fighters_with_photos:,} ({fighter_photo_pct:.1f}%)")
print(f"- Heirs with photos: {heirs_with_photos:,}")

## 7. Data Quality Assessment

In [None]:
# Assess data completeness
print("Data Completeness Analysis:")
print("=" * 40)

# Basic info completeness: share of non-null values per flattened column.
# Fields whose column is absent from the DataFrame are silently skipped.
basic_info_fields = ['name', 'father_name', 'mother_name', 'district', 'upazila', 'village']
for field in basic_info_fields:
    col_name = f'basic_info.{field}'
    if col_name in df.columns:
        non_null_count = df[col_name].notna().sum()
        percentage = (non_null_count / len(df)) * 100
        print(f"- {field}: {non_null_count:,}/{len(df):,} ({percentage:.1f}%)")

# Document and heir statistics
total_fighters = len(fighters)
fighters_with_docs = sum(1 for f in fighters if f.get('prove_documents'))
fighters_with_heirs = sum(1 for f in fighters if f.get('waris_info'))

# Same empty-sample guard as the averages below, so none of the ratios can
# raise ZeroDivisionError.
docs_pct = fighters_with_docs / total_fighters * 100 if total_fighters else 0.0
heirs_pct = fighters_with_heirs / total_fighters * 100 if total_fighters else 0.0

print("\nAdditional Data:")
print(f"- Fighters with supporting documents: {fighters_with_docs:,} ({docs_pct:.1f}%)")
print(f"- Fighters with heir information: {fighters_with_heirs:,} ({heirs_pct:.1f}%)")

# Average documents per fighter (`or []` counts a null list as empty)
total_docs = sum(len(f.get('prove_documents') or []) for f in fighters)
avg_docs = total_docs / total_fighters if total_fighters else 0
print(f"- Average documents per fighter: {avg_docs:.2f}")

# Average heirs per fighter (`or []` counts a null list as empty)
total_heirs = sum(len(f.get('waris_info') or []) for f in fighters)
avg_heirs = total_heirs / total_fighters if total_fighters else 0
print(f"- Average heirs per fighter: {avg_heirs:.2f}")

## 8. Research Insights

Based on this sample analysis, we can derive several insights about the Bangladesh Liberation War freedom fighters:

In [None]:
# Summarize the results computed by the earlier cells (district_counts,
# doc_counts, relationship_counts and the photo/document/heir counters).
print("Key Research Insights:")
print("=" * 50)

total_fighters = len(fighters)

# Geographic insights. Fall back to placeholders when no district data was
# found, mirroring the ("None", 0) fallback used for relationships below.
if len(district_counts):
    top_district = district_counts.index[0]
    top_count = district_counts.iloc[0]
else:
    top_district, top_count = "None", 0
districts_represented = (
    df['basic_info.district'].nunique() if 'basic_info.district' in df.columns else 0
)
print("1. Geographic Distribution:")
print(f"   - Highest concentration: {top_district} ({top_count:,} fighters)")
print(f"   - Total districts represented: {districts_represented}")

# Document insights (most_common(1) is an empty list when nothing was counted)
most_common_doc = doc_counts.most_common(1)[0] if doc_counts else ("None", 0)
print("\n2. Documentation:")
print(f"   - Most common document type: {most_common_doc[0]} ({most_common_doc[1]:,} instances)")
print(f"   - Total document types: {len(doc_counts)}")

# Family insights
most_common_rel = relationship_counts.most_common(1)[0] if relationship_counts else ("None", 0)
print("\n3. Family Structure:")
print(f"   - Most common heir relationship: {most_common_rel[0]} ({most_common_rel[1]:,} instances)")
print(f"   - Fighters with family records: {fighters_with_heirs:,}")

# Data quality insights. Percentages report 0.0 for an empty sample instead
# of raising ZeroDivisionError.
photo_pct = fighters_with_photos / total_fighters * 100 if total_fighters else 0.0
docs_pct = fighters_with_docs / total_fighters * 100 if total_fighters else 0.0
print("\n4. Data Quality:")
print(f"   - Records with photos: {fighters_with_photos:,} ({photo_pct:.1f}%)")
# NOTE(review): the next line is a fixed statement, not computed from the
# loaded sample - confirm it against the completeness figures before citing.
print("   - Complete basic info coverage: High (>90% for most fields)")
print(f"   - Documentation coverage: {docs_pct:.1f}%")

print("\n5. Dataset Scale:")
print(f"   - This analysis covers {total_fighters:,} records")
print("   - Full dataset contains 205,280+ records")
print("   - Estimated total size: 2.5GB+ (JSON files only)")

## 9. Next Steps for Analysis

This notebook demonstrates basic analysis techniques. For deeper insights, consider:

### Advanced Analysis Ideas:
1. **Temporal Analysis**: If birth/death dates are available, analyze age distributions
2. **Network Analysis**: Map family connections and regional clusters
3. **Text Mining**: Extract patterns from Bengali names and locations
4. **Geographic Mapping**: Visualize fighter distributions on a map of Bangladesh
5. **Historical Correlation**: Cross-reference with historical battle locations

### Research Applications:
- Academic studies on Liberation War demographics
- Digital heritage preservation projects
- Educational resources and interactive exhibits
- Genealogical research for descendant families
- Statistical analysis of regional participation patterns

### Technical Extensions:
- Machine learning for name standardization
- Natural language processing for Bengali text
- Database optimization for large-scale queries
- Web API development for public access
- Mobile app development for field research

In [None]:
# Collect the headline numbers from the earlier cells into one nested dict
# and persist it as JSON for further use.
photo_coverage = {
    'fighters_with_photos': fighters_with_photos,
    'heirs_with_photos': heirs_with_photos,
    'total_fighters': len(fighters),
}
data_quality = {
    'fighters_with_documents': fighters_with_docs,
    'fighters_with_heirs': fighters_with_heirs,
    'avg_documents_per_fighter': avg_docs,
    'avg_heirs_per_fighter': avg_heirs,
}
summary_stats = dict(
    total_records_analyzed=len(fighters),
    top_districts=district_counts.head(10).to_dict(),
    document_types=dict(doc_counts),
    relationship_types=dict(relationship_counts),
    photo_coverage=photo_coverage,
    data_quality=data_quality,
)

# Write the summary; ensure_ascii=False keeps non-ASCII text readable in the file
with open('analysis_summary.json', 'w', encoding='utf-8') as summary_file:
    json.dump(summary_stats, summary_file, ensure_ascii=False, indent=2)

print("Analysis complete! Summary saved to 'analysis_summary.json'")

# Citation block, one line per print call
citation_lines = (
    "\nDataset Citation:",
    "Bangladesh Freedom Fighter Database (2025)",
    "Source: Ministry of Liberation War Affairs, Bangladesh",
    "URL: https://mis.molwa.gov.bd",
)
for citation_line in citation_lines:
    print(citation_line)