In [None]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
import ast

def calculate_recency_scores(df):
    df_copy = df.copy()
    current_date = datetime.now()
    
    for idx, row in df_copy.iterrows():
        scores = calculate_row_recency_scores(row, current_date)
        
        for score_col, score_val in scores.items():
            df_copy.at[idx, score_col] = score_val
    
    return df_copy

def calculate_row_recency_scores(row, current_date):
    scores = {}
    
    address_indices = []
    n = 0
    max_checks = 10  
    
    while n < max_checks:
        address_col = f'Address.{n}.completeAddress'
        if address_col in row and pd.notna(row[address_col]) and str(row[address_col]).strip() != '':
            address_indices.append(n)
        n += 1
    
    if not address_indices:
        return scores
    
    all_address_types = []
    for i in address_indices:
        types = get_address_types(row, i)
        all_address_types.extend(types)
    
    unique_types_count = len(set(all_address_types)) if all_address_types else 1
    
    for i in address_indices:
        score = calculate_single_address_score(row, i, current_date, unique_types_count)
        address_text = str(row[f'Address.{i}.completeAddress']).strip()
        
        scores[f'Address.{i}_recency_score'] = round(score, 2)
        scores[f'Address.{i}_address_text'] = address_text
    
    return scores

def get_address_types(row, address_index):
  
    prefix = f'Address.{address_index}'
    types_field = f'{prefix}.addressType'
    
    if types_field not in row or pd.isna(row[types_field]):
        return []
    
    types_value = row[types_field]
    
    if isinstance(types_value, list):
        return types_value
    elif isinstance(types_value, str):
        types_value = types_value.strip()
        if types_value.startswith('[') and types_value.endswith(']'):
            try:
                return ast.literal_eval(types_value)
            except:
                return [types_value.strip("[]'\"")]
        else:
            return [types_value]
    else:
        return []

def calculate_single_address_score(row, address_index, current_date, unique_types_count):
    """
    Calculate recency score for a single address
    """
    prefix = f'Address.{address_index}'
    
    # 1. Temporal Score (40%)
    temporal_score = calculate_temporal_score(row, prefix, current_date)
    
    # 2. Frequency Score (25%)
    frequency_score = calculate_frequency_score(row, prefix)
    
    # 3. Usage Pattern Score (20%)
    pattern_score = calculate_pattern_score(row, address_index)
    
    # 4. Consistency Score (10%)
    consistency_score = calculate_consistency_score(unique_types_count)
    
    # 5. Quality Impact (5%)
    quality_score = calculate_quality_score(row, prefix)
    
    # Final weighted calculation
    final_score = (
        temporal_score * 0.40 +
        frequency_score * 0.25 +
        pattern_score * 0.20 +
        consistency_score * 0.10 +
        quality_score * 0.05
    )
    
    return final_score

def calculate_temporal_score(row, prefix, current_date):
    delivery_date_field = f'{prefix}.lastDeliveryDate'
    
    if delivery_date_field not in row or pd.isna(row[delivery_date_field]):
        return 0
    
    try:
        delivery_date = pd.to_datetime(row[delivery_date_field])
        
        if delivery_date.tz is not None:
            delivery_date = delivery_date.tz_convert('UTC').tz_localize(None)
        
        if hasattr(current_date, 'tz') and current_date.tz is not None:
            current_date = current_date.tz_localize(None)
        
        days_diff = (current_date - delivery_date).days
        
        if days_diff <= 7:
            return 100
        elif days_diff <= 90:
            return 50 + 35 * math.exp(-0.05 * (days_diff - 7))
        elif days_diff <= 365:
            return max(5, 20 - 15 * ((days_diff - 90) / 275))
        else:
            return 5
    except Exception as e:
        return 0

def calculate_frequency_score(row, prefix):
    times_seen_field = f'{prefix}.timesSeen'
    
    if times_seen_field not in row or pd.isna(row[times_seen_field]):
        times_seen = 0
    else:
        times_seen = int(row[times_seen_field])
    
    if times_seen == 1:
        return 40
    elif times_seen <= 3:
        return 40 + (times_seen - 1) * 20
    elif times_seen <= 10:
        return 80 + (times_seen - 3) * 2.4
    else:
        return min(100, 97 + (times_seen - 10) * 0.3)

def calculate_pattern_score(row, address_index):
    """
    Calculate usage pattern score based on address type
    """
    address_types = get_address_types(row, address_index)
    
    type_scores = {
        'logisticsAddress': 90,        # High recency importance - active delivery
        'transportDlAddress': 75,      # Moderate - government registered, changes occasionally  
        'taxAddress': 80,              # High - tax filing addresses change when business moves
        'businessAddress': 85,         # High - business operations, moderate change frequency
        'temporaryAddress': 95,        # Highest - by definition temporary
        'billingAddress': 70,          # Moderate - billing addresses change less frequently
        'permanentAddress': 40         # Low - permanent by definition
    }
    
    if not address_types:
        return 60  
    
    scores = [type_scores.get(addr_type, 60) for addr_type in address_types]
    return max(scores)

def calculate_consistency_score(unique_types_count):
    if unique_types_count == 1:
        return 85
    elif unique_types_count == 2:
        return 70
    elif unique_types_count == 3:
        return 55
    else:
        return 40

def calculate_quality_score(row, prefix):
    completeness_field = f'{prefix}.completeAddress.addressCompletenessScore'
    
    if completeness_field not in row or pd.isna(row[completeness_field]):
        return 40  # Default low score
    
    completeness = float(row[completeness_field])
    
    if completeness >= 80:
        return 100
    elif completeness >= 60:
        return 85
    elif completeness >= 40:
        return 70
    elif completeness >= 20:
        return 55
    else:
        return 40

def analyze_results(df_with_scores):
    score_cols = [col for col in df_with_scores.columns if col.endswith('_recency_score')]
    address_text_cols = [col for col in df_with_scores.columns if col.endswith('_address_text')]
    
    if not score_cols:
        print("No recency scores found!")
        return
    
    print("=== RECENCY SCORE ANALYSIS ===\n")
    
    all_scores = []
    for col in score_cols:
        all_scores.extend(df_with_scores[col].dropna().tolist())
    
    if all_scores:
        print(f"Total addresses scored: {len(all_scores)}")
        print(f"Average recency score: {np.mean(all_scores):.2f}")
        print(f"Score range: {min(all_scores):.2f} - {max(all_scores):.2f}")
        print(f"Standard deviation: {np.std(all_scores):.2f}\n")
        
        print("Score Distribution:")
        print(f"Excellent (90-100): {sum(1 for s in all_scores if s >= 90)} addresses")
        print(f"Good (75-89): {sum(1 for s in all_scores if 75 <= s < 90)} addresses")
        print(f"Fair (60-74): {sum(1 for s in all_scores if 60 <= s < 75)} addresses")
        print(f"Poor (40-59): {sum(1 for s in all_scores if 40 <= s < 60)} addresses")
        print(f"Very Poor (0-39): {sum(1 for s in all_scores if s < 40)} addresses")
    
    print(f"\n=== SAMPLE RESULTS ===")
    
    display_cols = ['name', 'email']
    
    # Add score and address pairs in order
    for i in range(10):  # Check up to Address.9
        score_col = f'Address.{i}_recency_score'
        addr_col = f'Address.{i}_address_text'
        if score_col in df_with_scores.columns:
            display_cols.extend([score_col, addr_col])
    
    available_cols = [col for col in display_cols if col in df_with_scores.columns]
    
    sample_df = df_with_scores[available_cols].head(10)
    
    # Display in a more readable format
    for idx, row in sample_df.iterrows():
        print(f"\n--- {row['name']} ({row['email']}) ---")
        for col in available_cols:
            if col.endswith('_recency_score') and pd.notna(row[col]):
                addr_idx = col.split('.')[1].split('_')[0]
                addr_text_col = f'Address.{addr_idx}_address_text'
                addr_text = row.get(addr_text_col, 'N/A')
                print(f"  Score: {row[col]:.2f} | Address: {str(addr_text)[:80]}...")
        
        if idx >= 4:  # Show only first 5 rows in detail
            break
    
    return df_with_scores[score_cols].describe()

# Main execution function
def run_recency_analysis(file_path):
    """
    Main function to run the complete recency analysis
    """
    try:
        # Load the data
        print("Loading data...")
        df = pd.read_csv(file_path)
        print(f"Loaded {len(df)} rows and {len(df.columns)} columns")
        
        # Check for address columns
        address_cols = [col for col in df.columns if 'Address.' in col and 'completeAddress' in col]
        print(f"Found {len(address_cols)} address columns: {address_cols[:5]}...")  # Show first 5
        
        # Calculate recency scores
        print("\nCalculating recency scores...")
        df_with_scores = calculate_recency_scores(df)
        
        # Analyze results
        stats = analyze_results(df_with_scores)
        
        return df_with_scores, stats
        
    except Exception as e:
        print(f"Error: {str(e)}")
        return None, None

# Usage example:
if __name__ == "__main__":
    # Run the analysis
    file_path = "DCB_AlternateAddress.csv"
    df_result, statistics = run_recency_analysis(file_path)
    
    if df_result is not None:
        print(f"\n=== DETAILED STATISTICS ===")
        print(statistics)

Loading data...
Loaded 4455 rows and 447 columns
Found 75 address columns: ['Address.0.completeAddress', 'Address.0.completeAddress.addressCompletenessScore', 'Address.0.completeAddress.inputAddressSimilarityScore', 'Address.1.completeAddress', 'Address.1.completeAddress.addressCompletenessScore']...

Calculating recency scores...


  df = pd.read_csv(file_path)


=== RECENCY SCORE ANALYSIS ===

Total addresses scored: 9435
Average recency score: 38.50
Score range: 28.75 - 61.27
Standard deviation: 4.38

Score Distribution:
Excellent (90-100): 0 addresses
Good (75-89): 0 addresses
Fair (60-74): 6 addresses
Poor (40-59): 3046 addresses
Very Poor (0-39): 6383 addresses

=== SAMPLE RESULTS ===

--- nan (mor27nov@gmail.com) ---
  Score: 41.00 | Address: 770/28 bharat colony rohtak near shella by pass chock.,House,Rohtak,Haryana,Indi...
  Score: 44.39 | Address: Sec 36 om enclave near toll tax makroli rohtak Gohana road,Rohtak,Haryana,India,...

--- nan (sajid9350249895@gmail.com) ---
  Score: 38.25 | Address: Masjid Hadi Ali Shah,Central Delhi,Delhi,India,110055,DL,IN...

--- nan (vishalkewat9755@gmail.com) ---
  Score: 38.25 | Address: 10 kaladev,10 kaladev,Kaladev dashera madan,Vidisha,Madhya Pradesh,India,464114,...
  Score: 38.25 | Address: Vidisha tashsil lateari thana kaladev,Dasehra Medan kaladev,Vidisha,Madhya Prade...

--- nan (nan) ---
  S

In [39]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
import ast

def calculate_recency_scores(df):
    """
    Calculate recency scores for all Address.N.completeAddress fields in the dataframe
    """
    df_copy = df.copy()
    current_date = datetime.now()
    
    # Counter for detailed logging
    address_counter = 0
    
    print("\n" + "="*80)
    print("🔍 DETAILED SCORING BREAKDOWN FOR FIRST 3 ADDRESSES")
    print("="*80)
    
    # Process each row
    for idx, row in df_copy.iterrows():
        scores = calculate_row_recency_scores(row, current_date, address_counter)
        
        # Add scores as new columns
        for score_col, score_val in scores.items():
            df_copy.at[idx, score_col] = score_val
            
            # Update counter for logging
            if score_col.endswith('_recency_score'):
                address_counter += 1
    
    print("="*80)
    print("✅ DETAILED LOGGING COMPLETE")
    print("="*80 + "\n")
    
    return df_copy

def calculate_row_recency_scores(row, current_date, address_counter=0):
    """
    Calculate recency scores for all addresses in a single row
    """
    scores = {}
    
    # Find all valid addresses for this row
    address_indices = []
    n = 0
    max_checks = 10  # Reasonable limit to avoid infinite loops
    
    while n < max_checks:
        address_col = f'Address.{n}.completeAddress'
        if address_col in row and pd.notna(row[address_col]) and str(row[address_col]).strip() != '':
            address_indices.append(n)
        n += 1
    
    if not address_indices:
        return scores
    
    # Collect all address types for consistency scoring
    all_address_types = []
    for i in address_indices:
        types = get_address_types(row, i)
        all_address_types.extend(types)
    
    unique_types_count = len(set(all_address_types)) if all_address_types else 1
    
    # Calculate score and extract address text for each valid address
    for i in address_indices:
        score = calculate_single_address_score(row, i, current_date, unique_types_count, address_counter)
        address_text = str(row[f'Address.{i}.completeAddress']).strip()
        
        scores[f'Address.{i}_recency_score'] = round(score, 2)
        scores[f'Address.{i}_address_text'] = address_text
        
        address_counter += 1
    
    return scores

def get_address_types(row, address_index):
    """
    Extract address types from the addressType field
    """
    prefix = f'Address.{address_index}'
    types_field = f'{prefix}.addressType'
    
    if types_field not in row or pd.isna(row[types_field]):
        return []
    
    types_value = row[types_field]
    
    # Handle different formats
    if isinstance(types_value, list):
        return types_value
    elif isinstance(types_value, str):
        types_value = types_value.strip()
        if types_value.startswith('[') and types_value.endswith(']'):
            try:
                return ast.literal_eval(types_value)
            except:
                # If parsing fails, treat as single string
                return [types_value.strip("[]'\"")]
        else:
            return [types_value]
    else:
        return []

def calculate_single_address_score(row, address_index, current_date, unique_types_count, global_address_counter=0):
    """
    Calculate recency score for a single address
    """
    prefix = f'Address.{address_index}'
    
    # Enable detailed logging for first 3 addresses
    log_details = global_address_counter < 3
    
    if log_details:
        print(f"\n🏠 ADDRESS #{global_address_counter + 1} SCORING BREAKDOWN")
        print("-" * 60)
        # Handle potential NaN values in name and email
        user_name = row.get('name', 'Unknown')
        if pd.isna(user_name):
            user_name = row.get('nameoftheindividual', 'Unknown')
        if pd.isna(user_name):
            user_name = 'Unknown'
            
        user_email = row.get('email', 'No email')
        if pd.isna(user_email):
            user_email = 'No email'
            
        address_text = str(row.get(f'{prefix}.completeAddress', 'N/A'))
        print(f"👤 User: {user_name} ({user_email})")
        print(f"📍 Address: {address_text[:70]}...")
        print(f"🔢 Address Index: {prefix}")
        print("-" * 60)
    
    # 1. Temporal Score (40%)
    temporal_score = calculate_temporal_score(row, prefix, current_date, log_details)
    
    # 2. Frequency Score (25%)
    frequency_score = calculate_frequency_score(row, prefix, log_details)
    
    # 3. Usage Pattern Score (20%)
    pattern_score = calculate_pattern_score(row, address_index, log_details)
    
    # 4. Consistency Score (10%)
    consistency_score = calculate_consistency_score(unique_types_count, log_details)
    
    # 5. Quality Impact (5%)
    quality_score = calculate_quality_score(row, prefix, log_details)
    
    # Final weighted calculation
    final_score = (
        temporal_score * 0.40 +
        frequency_score * 0.25 +
        pattern_score * 0.20 +
        consistency_score * 0.10 +
        quality_score * 0.05
    )
    
    if log_details:
        print("\n🧮 FINAL CALCULATION:")
        print(f"   ({temporal_score:.1f} × 0.40) + ({frequency_score:.1f} × 0.25) + ({pattern_score:.1f} × 0.20) + ({consistency_score:.1f} × 0.10) + ({quality_score:.1f} × 0.05)")
        print(f"   = {temporal_score * 0.40:.2f} + {frequency_score * 0.25:.2f} + {pattern_score * 0.20:.2f} + {consistency_score * 0.10:.2f} + {quality_score * 0.05:.2f}")
        print(f"   = {final_score:.2f}")
        print(f"\n🎯 FINAL RECENCY SCORE: {final_score:.2f}")
        
        # Score interpretation
        if final_score >= 90:
            interpretation = "🌟 EXCELLENT - Highly recent and reliable"
        elif final_score >= 75:
            interpretation = "✅ GOOD - Recent and trustworthy"
        elif final_score >= 60:
            interpretation = "⚠️ FAIR - Moderately recent"
        elif final_score >= 40:
            interpretation = "🔶 POOR - Old but potentially valid"
        else:
            interpretation = "❌ VERY POOR - Very old or unreliable"
            
        print(f"📊 INTERPRETATION: {interpretation}")
        print("=" * 60)
    
    return final_score

def calculate_temporal_score(row, prefix, current_date, log_details=False):
    """
    Calculate temporal score based on last delivery date
    """
    delivery_date_field = f'{prefix}.lastDeliveryDate'
    
    if delivery_date_field not in row or pd.isna(row[delivery_date_field]):
        if log_details:
            print("⏰ 1. TEMPORAL SCORE (40% weight):")
            print(f"   ❌ No delivery date found in field: {delivery_date_field}")
            print("   📊 Score: 0.0 points")
        return 0
    
    try:
        delivery_date_raw = row[delivery_date_field]
        if log_details:
            print("⏰ 1. TEMPORAL SCORE (40% weight):")
            print(f"   📅 Raw delivery date value: '{delivery_date_raw}' (type: {type(delivery_date_raw)})")
        
        # Try multiple date parsing approaches
        delivery_date = None
        
        # Approach 1: Direct pandas parsing
        try:
            delivery_date = pd.to_datetime(delivery_date_raw)
            if log_details:
                print(f"   ✅ Successfully parsed with pd.to_datetime()")
        except:
            pass
        
        # Approach 2: Handle string dates with specific formats
        if delivery_date is None and isinstance(delivery_date_raw, str):
            try:
                delivery_date = pd.to_datetime(delivery_date_raw, format='%Y-%m-%dT%H:%M:%S.%fZ')
                if log_details:
                    print(f"   ✅ Successfully parsed with ISO format")
            except:
                try:
                    delivery_date = pd.to_datetime(delivery_date_raw, infer_datetime_format=True)
                    if log_details:
                        print(f"   ✅ Successfully parsed with inferred format")
                except:
                    pass
        
        if delivery_date is None:
            if log_details:
                print(f"   ❌ Failed to parse date: '{delivery_date_raw}'")
                print("   📊 Score: 0.0 points")
            return 0
        
        # Handle timezone issues - convert both dates to timezone-naive
        if delivery_date.tz is not None:
            # Convert timezone-aware delivery_date to UTC then remove timezone
            delivery_date = delivery_date.tz_convert('UTC').tz_localize(None)
            if log_details:
                print(f"   🌍 Converted timezone-aware date to UTC naive")
        
        # Ensure current_date is also timezone-naive
        if hasattr(current_date, 'tz') and current_date.tz is not None:
            current_date = current_date.tz_localize(None)
        
        days_diff = (current_date - delivery_date).days
        
        if days_diff <= 7:
            score = 100
            category = "Perfect - Within 7 days"
        elif days_diff <= 90:
            score = 50 + 35 * math.exp(-0.05 * (days_diff - 7))
            category = "Good - Exponential decay (8-90 days)"
        elif days_diff <= 365:
            score = max(5, 20 - 15 * ((days_diff - 90) / 275))
            category = "Fair - Linear decay (91-365 days)"
        else:
            score = 5
            category = "Poor - Over 1 year old"
            
        if log_details:
            print(f"   📅 Parsed delivery date: {delivery_date.strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"   📆 Days ago: {days_diff}")
            print(f"   📈 Category: {category}")
            print(f"   📊 Score: {score:.1f} points")
        
        return score
        
    except Exception as e:
        if log_details:
            print(f"   ❌ Error parsing delivery date: {str(e)}")
            print(f"   📅 Raw value: '{row[delivery_date_field]}'")
            print("   📊 Score: 0.0 points")
        return 0

def calculate_frequency_score(row, prefix, log_details=False):
    """
    Calculate frequency score based on times seen
    """
    times_seen_field = f'{prefix}.timesSeen'
    
    if times_seen_field not in row or pd.isna(row[times_seen_field]):
        times_seen = 0
    else:
        # Handle both int64 and float64 data types
        try:
            times_seen = int(float(row[times_seen_field]))
        except (ValueError, TypeError):
            times_seen = 0
    
    if times_seen == 0:
        score = 0
        category = "No usage data - zero score"
    elif times_seen == 1:
        score = 40
        category = "Single use - potentially outdated"
    elif times_seen <= 3:
        score = 40 + (times_seen - 1) * 20
        category = f"Moderate confidence - {times_seen} uses"
    elif times_seen <= 10:
        score = 80 + (times_seen - 3) * 2.4
        category = f"Regular usage - {times_seen} uses"
    else:
        score = min(100, 97 + (times_seen - 10) * 0.3)
        category = f"High confidence - {times_seen} uses"
    
    if log_details:
        print("\n🔄 2. FREQUENCY SCORE (25% weight):")
        print(f"   🔢 Times seen: {times_seen}")
        print(f"   📈 Category: {category}")
        print(f"   📊 Score: {score:.1f} points")
    
    return score

def calculate_pattern_score(row, address_index, log_details=False):
    """
    Calculate usage pattern score based on address type
    """
    address_types = get_address_types(row, address_index)
    
    type_scores = {
        'logisticsAddress': 90,        # High recency importance - active delivery
        'transportDlAddress': 75,      # Moderate - government registered, changes occasionally  
        'taxAddress': 80,              # High - tax filing addresses change when business moves
        'businessAddress': 85,         # High - business operations, moderate change frequency
        'temporaryAddress': 95,        # Highest - by definition temporary
        'billingAddress': 70,          # Moderate - billing addresses change less frequently
        'permanentAddress': 40         # Low - permanent by definition
    }
    
    if not address_types:
        score = 60
        category = "Unknown type - default score"
        type_display = "None/Unknown"
    else:
        # Return the highest score among all types for this address
        scores = [type_scores.get(addr_type, 60) for addr_type in address_types]
        score = max(scores)
        
        # Find the type that gave the max score
        max_type = None
        for addr_type in address_types:
            if type_scores.get(addr_type, 60) == score:
                max_type = addr_type
                break
        
        category = f"{max_type} - {get_type_explanation(max_type)}"
        type_display = str(address_types)
    
    if log_details:
        print("\n🏠 3. USAGE PATTERN SCORE (20% weight):")
        print(f"   🏷️ Address types: {type_display}")
        print(f"   📈 Category: {category}")
        print(f"   📊 Score: {score:.1f} points")
    
    return score

def get_type_explanation(addr_type):
    """
    Get explanation for address type scoring
    """
    explanations = {
        'temporaryAddress': 'Needs frequent updates',
        'logisticsAddress': 'High recency importance',
        'billingAddress': 'Moderate importance',
        'permanentAddress': 'Stable, less critical',
        'transportDlAddress': 'Government registered'
    }
    return explanations.get(addr_type, 'Unknown type')

def calculate_consistency_score(unique_types_count, log_details=False):
    """
    Calculate consistency score based on number of unique address types
    """
    if unique_types_count == 1:
        score = 85
        category = "Excellent - Single consistent type"
    elif unique_types_count == 2:
        score = 70
        category = "Good - Minor inconsistency"
    elif unique_types_count == 3:
        score = 55
        category = "Fair - Moderate confusion"
    else:
        score = 40
        category = "Poor - High inconsistency"
    
    if log_details:
        print("\n📊 4. CONSISTENCY SCORE (10% weight):")
        print(f"   🔢 Unique address types across all user addresses: {unique_types_count}")
        print(f"   📈 Category: {category}")
        print(f"   📊 Score: {score:.1f} points")
    
    return score

def calculate_quality_score(row, prefix, log_details=False):
    """
    Calculate quality score based on address completeness
    """
    completeness_field = f'{prefix}.completeAddress.addressCompletenessScore'
    
    if completeness_field not in row or pd.isna(row[completeness_field]):
        score = 40
        category = "No completeness data - default low score"
        completeness = "N/A"
    else:
        try:
            completeness = float(row[completeness_field])
        except (ValueError, TypeError):
            completeness = 0.0
        
        if completeness >= 80:
            score = 100
            category = "Excellent completeness"
        elif completeness >= 60:
            score = 85
            category = "Good completeness"
        elif completeness >= 40:
            score = 70
            category = "Fair completeness"
        elif completeness >= 20:
            score = 55
            category = "Poor completeness"
        else:
            score = 40
            category = "Very poor completeness"
    
    if log_details:
        print("\n✅ 5. QUALITY IMPACT SCORE (5% weight):")
        if completeness != "N/A":
            print(f"   📋 Address completeness: {completeness:.2f}%")
        else:
            print(f"   📋 Address completeness: {completeness}")
        print(f"   📈 Category: {category}")
        print(f"   📊 Score: {score:.1f} points")
    
    return score

def analyze_results(df_with_scores):
    """
    Analyze and display the recency score results
    """
    # Find all recency score columns
    score_cols = [col for col in df_with_scores.columns if col.endswith('_recency_score')]
    address_text_cols = [col for col in df_with_scores.columns if col.endswith('_address_text')]
    
    if not score_cols:
        print("No recency scores found!")
        return
    
    print("=== RECENCY SCORE ANALYSIS ===\n")
    
    # Overall statistics
    all_scores = []
    for col in score_cols:
        all_scores.extend(df_with_scores[col].dropna().tolist())
    
    if all_scores:
        print(f"Total addresses scored: {len(all_scores)}")
        print(f"Average recency score: {np.mean(all_scores):.2f}")
        print(f"Score range: {min(all_scores):.2f} - {max(all_scores):.2f}")
        print(f"Standard deviation: {np.std(all_scores):.2f}\n")
        
        # Score distribution
        print("Score Distribution:")
        print(f"Excellent (90-100): {sum(1 for s in all_scores if s >= 90)} addresses")
        print(f"Good (75-89): {sum(1 for s in all_scores if 75 <= s < 90)} addresses")
        print(f"Fair (60-74): {sum(1 for s in all_scores if 60 <= s < 75)} addresses")
        print(f"Poor (40-59): {sum(1 for s in all_scores if 40 <= s < 60)} addresses")
        print(f"Very Poor (0-39): {sum(1 for s in all_scores if s < 40)} addresses")
    
    print(f"\n=== SAMPLE RESULTS ===")
    
    # Show sample results with both scores and addresses
    display_cols = ['name', 'email']
    
    # Add score and address pairs in order
    for i in range(10):  # Check up to Address.9
        score_col = f'Address.{i}_recency_score'
        addr_col = f'Address.{i}_address_text'
        if score_col in df_with_scores.columns:
            display_cols.extend([score_col, addr_col])
    
    available_cols = [col for col in display_cols if col in df_with_scores.columns]
    
    sample_df = df_with_scores[available_cols].head(10)
    
    # Display in a more readable format
    for idx, row in sample_df.iterrows():
        print(f"\n--- {row['name']} ({row['email']}) ---")
        for col in available_cols:
            if col.endswith('_recency_score') and pd.notna(row[col]):
                addr_idx = col.split('.')[1].split('_')[0]
                addr_text_col = f'Address.{addr_idx}_address_text'
                addr_text = row.get(addr_text_col, 'N/A')
                print(f"  Score: {row[col]:.2f} | Address: {str(addr_text)[:80]}...")
        
        if idx >= 4:  # Show only first 5 rows in detail
            break
    
    return df_with_scores[score_cols].describe()

# Main execution function
def run_recency_analysis(file_path):
    """
    Main function to run the complete recency analysis
    """
    try:
        # Load the data
        print("Loading data...")
        df = pd.read_csv(file_path)
        print(f"Loaded {len(df)} rows and {len(df.columns)} columns")
        
        # Check for address columns
        address_cols = [col for col in df.columns if 'Address.' in col and 'completeAddress' in col]
        print(f"Found {len(address_cols)} address columns: {address_cols[:5]}...")  # Show first 5
        
        # Calculate recency scores
        print("\nCalculating recency scores...")
        df_with_scores = calculate_recency_scores(df)
        
        # Analyze results
        stats = analyze_results(df_with_scores)
        
        return df_with_scores, stats
        
    except Exception as e:
        print(f"Error: {str(e)}")
        return None, None

# Usage example:
if __name__ == "__main__":
    # Run the analysis
    file_path = "DCB_AlternateAddress.csv"
    df_result, statistics = run_recency_analysis(file_path)
    
    if df_result is not None:
        print(f"\n=== DETAILED STATISTICS ===")
        print(statistics)

Loading data...
Loaded 4455 rows and 447 columns
Found 75 address columns: ['Address.0.completeAddress', 'Address.0.completeAddress.addressCompletenessScore', 'Address.0.completeAddress.inputAddressSimilarityScore', 'Address.1.completeAddress', 'Address.1.completeAddress.addressCompletenessScore']...

Calculating recency scores...

🔍 DETAILED SCORING BREAKDOWN FOR FIRST 3 ADDRESSES

🏠 ADDRESS #1 SCORING BREAKDOWN
------------------------------------------------------------
👤 User:   mohit (mor27nov@gmail.com)
📍 Address: 770/28 bharat colony rohtak near shella by pass chock.,House,Rohtak,Ha...
🔢 Address Index: Address.0
------------------------------------------------------------
⏰ 1. TEMPORAL SCORE (40% weight):
   📅 Raw delivery date value: '2024-03-02T12:01:20.000Z' (type: <class 'str'>)
   ✅ Successfully parsed with pd.to_datetime()
   🌍 Converted timezone-aware date to UTC naive
   📅 Parsed delivery date: 2024-03-02 12:01:20
   📆 Days ago: 564
   📈 Category: Poor - Over 1 year old


  df = pd.read_csv(file_path)


✅ DETAILED LOGGING COMPLETE

=== RECENCY SCORE ANALYSIS ===

Total addresses scored: 9435
Average recency score: 39.83
Score range: 31.75 - 62.27
Standard deviation: 4.00

Score Distribution:
Excellent (90-100): 0 addresses
Good (75-89): 0 addresses
Fair (60-74): 12 addresses
Poor (40-59): 4041 addresses
Very Poor (0-39): 5382 addresses

=== SAMPLE RESULTS ===

--- nan (mor27nov@gmail.com) ---
  Score: 42.00 | Address: 770/28 bharat colony rohtak near shella by pass chock.,House,Rohtak,Haryana,Indi...
  Score: 45.39 | Address: Sec 36 om enclave near toll tax makroli rohtak Gohana road,Rohtak,Haryana,India,...

--- nan (sajid9350249895@gmail.com) ---
  Score: 39.25 | Address: Masjid Hadi Ali Shah,Central Delhi,Delhi,India,110055,DL,IN...

--- nan (vishalkewat9755@gmail.com) ---
  Score: 39.25 | Address: 10 kaladev,10 kaladev,Kaladev dashera madan,Vidisha,Madhya Pradesh,India,464114,...
  Score: 39.25 | Address: Vidisha tashsil lateari thana kaladev,Dasehra Medan kaladev,Vidisha,Madhya P

In [None]:
file_path = "DCB_AlternateAddress.csv"

df_with_scores, statistics = run_recency_analysis(file_path)

if df_with_scores is not None:
    
    print("\n=== DETAILED BREAKDOWN FOR FIRST 3 ROWS ===")
    
    score_cols = [col for col in df_with_scores.columns if col.endswith('_recency_score')]
    
    for idx in range(min(3, len(df_with_scores))):
        row = df_with_scores.iloc[idx]
        print(f"\nRow {idx + 1} - {row.get('name', 'Unknown')} ({row.get('email', 'No email')}):")
        
        for score_col in score_cols:
            if pd.notna(row[score_col]):
                addr_idx = score_col.split('.')[1].split('_')[0]
                
                # Get the address text from new column
                addr_text_col = f'Address.{addr_idx}_address_text'
                address_text = row.get(addr_text_col, 'N/A')
                
                delivery_date = row.get(f'Address.{addr_idx}.lastDeliveryDate', 'No date')
                times_seen = row.get(f'Address.{addr_idx}.timesSeen', 0)
                addr_type = row.get(f'Address.{addr_idx}.addressType', 'Unknown')
                
                print(f"  {score_col}: {row[score_col]:.2f}")
                print(f"    Address: {str(address_text)[:80]}...")
                print(f"    Last delivery: {delivery_date}")
                print(f"    Times seen: {times_seen}")
                print(f"    Type: {addr_type}")
    
    print("\n=== CLEAN SUMMARY TABLE ===")
    summary_data = []
    
    for idx, row in df_with_scores.iterrows():
        base_info = {
            'Row': idx + 1,
            'Name': row.get('name', 'Unknown'),
            'Email': row.get('email', 'No email')
        }
        
        has_addresses = False
        for score_col in score_cols:
            if pd.notna(row[score_col]):
                has_addresses = True
                addr_idx = score_col.split('.')[1].split('_')[0]
                addr_text_col = f'Address.{addr_idx}_address_text'
                
                row_data = base_info.copy()
                row_data.update({
                    'Address_Index': f'Address.{addr_idx}',
                    'Recency_Score': row[score_col],
                    'Address_Text': str(row.get(addr_text_col, 'N/A'))[:60] + '...',
                    'Last_Delivery': row.get(f'Address.{addr_idx}.lastDeliveryDate', 'No date'),
                    'Times_Seen': row.get(f'Address.{addr_idx}.timesSeen', 0)
                })
                summary_data.append(row_data)
        
        if not has_addresses:
            summary_data.append(base_info)
    
    summary_df = pd.DataFrame(summary_data)
    print(summary_df.head(20).to_string(index=False))
    
    # Save complete results to CSV
    output_file = "DCB_AlternateAddress_with_recency_scores.csv"
    df_with_scores.to_csv(output_file, index=False)
    print(f"\n✅ Complete results saved to: {output_file}")
    
    # Create and save clean final dataframe with only essential columns
    essential_cols = ['email']
    
    # Add all recency score and address text columns
    score_cols = [col for col in df_with_scores.columns if col.endswith('_recency_score')]
    address_cols = [col for col in df_with_scores.columns if col.endswith('_address_text')]
    
    # Combine in pairs (score, address) for each address index
    for i in range(10):  # Check up to Address.9
        score_col = f'Address.{i}_recency_score'
        addr_col = f'Address.{i}_address_text'
        if score_col in df_with_scores.columns:
            essential_cols.extend([score_col, addr_col])
    
    # Create clean dataframe
    df_clean = df_with_scores[essential_cols].copy()
    
    # Save clean version
    clean_output_file = "DCB_Recency_Scores_FINAL.csv"
    df_clean.to_csv(clean_output_file, index=False)
    print(f"✅ Clean final dataframe saved to: {clean_output_file}")
    
    # Show preview of clean dataframe
    print(f"\n=== CLEAN FINAL DATAFRAME PREVIEW ===")
    print(f"Columns: {list(df_clean.columns)}")
    print(f"Shape: {df_clean.shape}")
    print("\nFirst 5 rows:")
    for idx, row in df_clean.head().iterrows():
        print(f"\nRow {idx + 1}: {row['email']}")
        for col in df_clean.columns:
            if col.endswith('_recency_score') and pd.notna(row[col]):
                addr_idx = col.split('.')[1].split('_')[0]
                addr_col = f'Address.{addr_idx}_address_text'
                print(f"  Score: {row[col]:.2f} | Address: {str(row.get(addr_col, 'N/A'))[:50]}...")
    
    # Show all new columns created
    print(f"\n=== NEW COLUMNS CREATED ===")
    new_cols = [col for col in df_with_scores.columns if '_recency_score' in col or '_address_text' in col]
    
    for col in sorted(new_cols):
        if col.endswith('_recency_score'):
            non_null_count = df_with_scores[col].count()
            avg_score = df_with_scores[col].mean() if non_null_count > 0 else 0
            print(f"{col}: {non_null_count} addresses, avg score: {avg_score:.2f}")
        elif col.endswith('_address_text'):
            non_null_count = df_with_scores[col].count()
            print(f"{col}: {non_null_count} address texts")

else:
    print("❌ Analysis failed. Please check your CSV file path and format.")

Loading data...
Loaded 4455 rows and 447 columns
Found 75 address columns: ['Address.0.completeAddress', 'Address.0.completeAddress.addressCompletenessScore', 'Address.0.completeAddress.inputAddressSimilarityScore', 'Address.1.completeAddress', 'Address.1.completeAddress.addressCompletenessScore']...

Calculating recency scores...


  df = pd.read_csv(file_path)


=== RECENCY SCORE ANALYSIS ===

Total addresses scored: 9435
Average recency score: 38.50
Score range: 28.75 - 61.27
Standard deviation: 4.38

Score Distribution:
Excellent (90-100): 0 addresses
Good (75-89): 0 addresses
Fair (60-74): 6 addresses
Poor (40-59): 3046 addresses
Very Poor (0-39): 6383 addresses

=== SAMPLE RESULTS ===

--- nan (mor27nov@gmail.com) ---
  Score: 41.00 | Address: 770/28 bharat colony rohtak near shella by pass chock.,House,Rohtak,Haryana,Indi...
  Score: 44.39 | Address: Sec 36 om enclave near toll tax makroli rohtak Gohana road,Rohtak,Haryana,India,...

--- nan (sajid9350249895@gmail.com) ---
  Score: 38.25 | Address: Masjid Hadi Ali Shah,Central Delhi,Delhi,India,110055,DL,IN...

--- nan (vishalkewat9755@gmail.com) ---
  Score: 38.25 | Address: 10 kaladev,10 kaladev,Kaladev dashera madan,Vidisha,Madhya Pradesh,India,464114,...
  Score: 38.25 | Address: Vidisha tashsil lateari thana kaladev,Dasehra Medan kaladev,Vidisha,Madhya Prade...

--- nan (nan) ---
  S

In [22]:
df1=pd.read_csv('DCB_AlternateAddress.csv')

  df1=pd.read_csv('DCB_AlternateAddress.csv')


In [23]:
df1

Unnamed: 0,Address,dob,email,name,nameoftheindividual,pan,pannumber,phoneNumber,Address.0.addressLine1,Address.0.addressLine2,...,Address.27.completeAddress,Address.27.completeAddress.addressCompletenessScore,Address.27.country,Address.27.countryCode,Address.27.email,Address.27.lastDeliveryDate,Address.27.pinCode,Address.27.state,Address.27.stateCode,Address.27.timesSeen
0,,,mor27nov@gmail.com,,mohit,,BBHPM7213G,919854005000,770/28 bharat colony rohtak near shella by pas...,"House,Rohtak,Haryana,India,124001,HR,IN",...,,,,,,,,,,
1,,,sajid9350249895@gmail.com,,sajid,,NVHPS1509D,919350249895,Masjid Hadi Ali Shah,"Central Delhi,Delhi,India,110055,DL,IN",...,,,,,,,,,,
2,,,vishalkewat9755@gmail.com,,vishal,,CIKPV3762Q,918519095564,10 kaladev,"10 kaladev,Kaladev dashera madan,Vidisha,Madhy...",...,,,,,,,,,,
3,EASLAND ENCLAVE BLOCK 5FLAT NO 502 ELAMKULAM K...,,,,a n abhimanyu,,BJYPA8676E,917012405231,EASLAND ENCLAVE,"BLOCK-5,FLAT-502,FIFTH FLOOR,S.A. ROAD,ELAMKUL...",...,,,,,,,,,,
4,,,,,a naveen,,BUWPN0014N,919080872911,Golden City Road,"1st cross,Collector office back side,Thanthoni...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4450,,,,,zulaikha naaz j,,AAIPZ7235Q,919940588865,owners court 6th floor 6E Montieth lane egmore...,"Chennai,Tamil Nadu,India,600008,TN,IN",...,,,,,,,,,,
4451,,,info@opal-stone.com,,zulfikar kasam momin,,AAHPM4235E,919137590133,101,LOVELY HOME SOCIETY VAISHALI NAGAR NEAR RUBY H...,...,,,,,,,,,,
4452,NOORANI BLDG 3RD FLR B WING FLAT NO 134THAKUPA...,,zulfikar.kerai@yahoo.in,,zulfikar shabbir kerai,,AYXPK8406D,919892457147,Poonam Residency Bldg no 104,"6th floor Room no 603 Shanti park,Near Balaji ...",...,,,,,,,,,,
4453,"VASANT OASIS/14/1406, MAKWANA ROADSEVEN HILLS ...",,z.dahodwala@microinks.com,,zuzer akberali dahodwala,,AAHPD0277F,919820031768,Zuzer Akberali Dahodwala,"Vasant Oasis/14/1406,Seven Hills Hospital,Mumb...",...,,,,,,,,,,


In [26]:
pd.set_option('display.max_rows', None)  
print(df1.dtypes)

Address                                                    object
dob                                                       float64
email                                                      object
name                                                      float64
nameoftheindividual                                        object
pan                                                       float64
pannumber                                                  object
phoneNumber                                                 int64
Address.0.addressLine1                                     object
Address.0.addressLine2                                     object
Address.0.addressType                                      object
Address.0.associatedName                                   object
Address.0.city                                             object
Address.0.completeAddress                                  object
Address.0.completeAddress.addressCompletenessScore        float64
Address.0.

In [38]:
address_type_counts = df1['Address.3.addressType'].value_counts()

print(address_type_counts)

Address.3.addressType
['logisticsAddress']                          511
['transportDlAddress']                         90
['taxAddress']                                 44
['businessAddress']                             3
['logisticsAddress', 'transportDlAddress']      1
Name: count, dtype: int64
