In [20]:
"""
LinkedIn Parquet Dataset Analysis Script
Analyzes 15.2GB parquet file with 20M rows for Semantic Talent Finder project

This script processes large parquet files in chunks to:
1. Extract schema and data type information
2. Analyze data quality and completeness
3. Generate insights for Java model optimization
4. Provide database schema recommendations
5. Configure processing pipeline parameters
"""

import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import json
import os
from collections import Counter, defaultdict
from datetime import datetime
import gc
import psutil

# Configuration
PARQUET_FILE = "/Users/chromatrical/CAREER/Local Linkedin DB/DataBase/USA_filtered.parquet"
CHUNK_SIZE = 50000  # Process 50k rows at a time to manage memory
OUTPUT_DIR = "/Users/chromatrical/CAREER/Side Projects/semantic-talent-finder/data/analysis_output"

print("🚀 LinkedIn Parquet Dataset Analysis - Starting Setup...")
print(f"📁 Target File: {PARQUET_FILE}")
print(f"⭐ Chunk Size: {CHUNK_SIZE:,} rows")
print(f"💾 Output Directory: {OUTPUT_DIR}")

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Setup Complete - Output directory created at {OUTPUT_DIR}")

🚀 LinkedIn Parquet Dataset Analysis - Starting Setup...
📁 Target File: /Users/chromatrical/CAREER/Local Linkedin DB/DataBase/USA_filtered.parquet
⭐ Chunk Size: 50,000 rows
💾 Output Directory: /Users/chromatrical/CAREER/Side Projects/semantic-talent-finder/data/analysis_output
✅ Setup Complete - Output directory created at /Users/chromatrical/CAREER/Side Projects/semantic-talent-finder/data/analysis_output


In [21]:
class LinkedInDataAnalyzer:
    def __init__(self, parquet_file_path, chunk_size=50000):
        self.parquet_file = parquet_file_path
        self.chunk_size = chunk_size
        self.insights = {
            'schema_analysis': {},
            'data_quality': {},
            'content_analysis': {},
            'business_logic': {},
            'processing_recommendations': {},
            'database_schema': {}
        }
        
        # Ensure output directory exists
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        
    def get_memory_usage(self):
        """Monitor memory usage during processing"""
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1024 / 1024  # MB
    
    def analyze_parquet_schema(self):
        """Analyze parquet file schema and metadata"""
        print("🔍 Analyzing Parquet Schema...")
        
        try:
            # Read parquet metadata without loading data
            parquet_file = pq.ParquetFile(self.parquet_file)
            schema = parquet_file.schema_arrow
            metadata = parquet_file.metadata
            
            # Extract schema information
            schema_info = {}
            for i, field in enumerate(schema):
                schema_info[field.name] = {
                    'type': str(field.type),
                    'nullable': field.nullable,
                    'index': i
                }
            
            self.insights['schema_analysis'] = {
                'total_columns': len(schema),
                'total_rows': metadata.num_rows,
                'file_size_gb': round(os.path.getsize(self.parquet_file) / (1024**3), 2),
                'columns': schema_info,
                'column_names': [field.name for field in schema]
            }
            
            print(f"✅ Schema Analysis Complete:")
            print(f"   - Total Rows: {metadata.num_rows:,}")
            print(f"   - Total Columns: {len(schema)}")
            print(f"   - File Size: {self.insights['schema_analysis']['file_size_gb']} GB")
            print(f"   - Columns: {', '.join(list(schema_info.keys())[:10])}...")
            
        except Exception as e:
            print(f"❌ Schema analysis failed: {e}")

# Initialize analyzer
analyzer = LinkedInDataAnalyzer(PARQUET_FILE, CHUNK_SIZE)
print("📊 LinkedInDataAnalyzer initialized successfully")

📊 LinkedInDataAnalyzer initialized successfully


In [22]:
# Run schema analysis
analyzer.analyze_parquet_schema()

# Display schema results
print("\n📋 Schema Summary:")
schema_info = analyzer.insights['schema_analysis']
print(f"Total Rows: {schema_info.get('total_rows', 0):,}")
print(f"Total Columns: {schema_info.get('total_columns', 0)}")
print(f"File Size: {schema_info.get('file_size_gb', 0)} GB")

print("\n🔍 Column Overview:")
columns = schema_info.get('columns', {})
for i, (col_name, col_info) in enumerate(list(columns.items())[:15]):  # Show first 15 columns
    nullable = "nullable" if col_info.get('nullable', True) else "not null"
    print(f"  {i+1:2d}. {col_name:<30} | {col_info.get('type', 'unknown'):<15} | {nullable}")

if len(columns) > 15:
    print(f"  ... and {len(columns) - 15} more columns")

🔍 Analyzing Parquet Schema...
✅ Schema Analysis Complete:
   - Total Rows: 51,352,619
   - Total Columns: 62
   - File Size: 15.15 GB
   - Columns: Full name, Industry, Job title, Sub Role, Industry 2, Emails, Mobile, Phone numbers, Company Name, Company Industry...

📋 Schema Summary:
Total Rows: 51,352,619
Total Columns: 62
File Size: 15.15 GB

🔍 Column Overview:
   1. Full name                      | string          | nullable
   2. Industry                       | string          | nullable
   3. Job title                      | string          | nullable
   4. Sub Role                       | string          | nullable
   5. Industry 2                     | string          | nullable
   6. Emails                         | string          | nullable
   7. Mobile                         | string          | nullable
   8. Phone numbers                  | string          | nullable
   9. Company Name                   | string          | nullable
  10. Company Industry               | 

In [23]:
def analyze_data_quality_chunked(self):
    """Analyze data quality in chunks to handle large file"""
    print("\n🔍 Analyzing Data Quality in Chunks...")
    
    # Initialize aggregators
    null_counts = defaultdict(int)
    total_counts = defaultdict(int)
    data_types = {}
    
    chunk_count = 0
    total_rows_processed = 0
    
    try:
        # Process file in chunks
        parquet_file = pq.ParquetFile(self.parquet_file)
        
        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
            chunk_df = batch.to_pandas()
            chunk_count += 1
            total_rows_processed += len(chunk_df)
            
            # Analyze each column
            for column in chunk_df.columns:
                # Count nulls
                null_counts[column] += chunk_df[column].isnull().sum()
                total_counts[column] += len(chunk_df)
                
                # Store data type
                if column not in data_types:
                    data_types[column] = str(chunk_df[column].dtype)
            
            # Memory management
            del chunk_df
            gc.collect()
            
            if chunk_count % 50 == 0:
                print(f"   Processed {chunk_count} chunks ({total_rows_processed:,} rows)")
                print(f"   Memory usage: {self.get_memory_usage():.2f} MB")
            
            # Limit analysis for demo - analyze first 200k rows
            if chunk_count >= 4:
                break
        
        # Calculate null percentages
        null_percentages = {}
        for column in null_counts:
            null_percentages[column] = round((null_counts[column] / total_counts[column]) * 100, 2)
        
        self.insights['data_quality'] = {
            'total_rows_analyzed': total_rows_processed,
            'chunks_processed': chunk_count,
            'null_counts': dict(null_counts),
            'null_percentages': null_percentages,
            'data_types': data_types,
            'completeness_summary': {
                'high_quality_fields': [col for col, pct in null_percentages.items() if pct < 5],
                'medium_quality_fields': [col for col, pct in null_percentages.items() if 5 <= pct < 25],
                'low_quality_fields': [col for col, pct in null_percentages.items() if pct >= 25]
            }
        }
        
        print(f"✅ Data Quality Analysis Complete:")
        print(f"   - Rows Analyzed: {total_rows_processed:,}")
        print(f"   - High Quality Fields: {len(self.insights['data_quality']['completeness_summary']['high_quality_fields'])}")
        print(f"   - Low Quality Fields: {len(self.insights['data_quality']['completeness_summary']['low_quality_fields'])}")
        
    except Exception as e:
        print(f"❌ Data quality analysis failed: {e}")

# Add method to analyzer class
LinkedInDataAnalyzer.analyze_data_quality_chunked = analyze_data_quality_chunked

# Run data quality analysis
analyzer.analyze_data_quality_chunked()


🔍 Analyzing Data Quality in Chunks...
✅ Data Quality Analysis Complete:
   - Rows Analyzed: 200,000
   - High Quality Fields: 12
   - Low Quality Fields: 43


In [24]:
# Run data quality analysis
analyzer.analyze_data_quality_chunked()


🔍 Analyzing Data Quality in Chunks...
✅ Data Quality Analysis Complete:
   - Rows Analyzed: 200,000
   - High Quality Fields: 12
   - Low Quality Fields: 43


In [25]:
# Display data quality results
print("\n📊 Data Quality Analysis Results:")
quality_data = analyzer.insights['data_quality']

print(f"\nRows Analyzed: {quality_data.get('total_rows_analyzed', 0):,}")
print(f"Chunks Processed: {quality_data.get('chunks_processed', 0)}")

# Show field quality breakdown
completeness = quality_data.get('completeness_summary', {})
print(f"\n🟢 High Quality Fields ({len(completeness.get('high_quality_fields', []))}):")
for field in completeness.get('high_quality_fields', [])[:10]:
    null_pct = quality_data.get('null_percentages', {}).get(field, 0)
    print(f"  ✅ {field:<30} | {null_pct:5.1f}% null")

print(f"\n🟡 Medium Quality Fields ({len(completeness.get('medium_quality_fields', []))}):")
for field in completeness.get('medium_quality_fields', [])[:5]:
    null_pct = quality_data.get('null_percentages', {}).get(field, 0)
    print(f"  ⚠️  {field:<30} | {null_pct:5.1f}% null")

print(f"\n🔴 Low Quality Fields ({len(completeness.get('low_quality_fields', []))}):")
for field in completeness.get('low_quality_fields', [])[:5]:
    null_pct = quality_data.get('null_percentages', {}).get(field, 0)
    print(f"  ❌ {field:<30} | {null_pct:5.1f}% null")


📊 Data Quality Analysis Results:

Rows Analyzed: 200,000
Chunks Processed: 4

🟢 High Quality Fields (12):
  ✅ Full name                      |   0.0% null
  ✅ Location                       |   0.1% null
  ✅ Locality                       |   2.6% null
  ✅ Region                         |   2.2% null
  ✅ First Name                     |   0.1% null
  ✅ Last Name                      |   0.1% null
  ✅ LinkedIn Url                   |   0.1% null
  ✅ LinkedIn Username              |   0.1% null
  ✅ Location Country               |   0.1% null
  ✅ Location Continent             |   0.1% null

🟡 Medium Quality Fields (7):
  ⚠️  Industry                       |  13.8% null
  ⚠️  Job title                      |  17.0% null
  ⚠️  Metro                          |  11.2% null
  ⚠️  Gender                         |  16.1% null
  ⚠️  Last Updated                   |  18.6% null

🔴 Low Quality Fields (43):
  ❌ Sub Role                       |  76.1% null
  ❌ Industry 2                     |  60.

In [26]:
# First add the missing methods to the analyzer class
def generate_java_recommendations(self):
    """Generate Java model and configuration recommendations"""
    print("\n🔍 Generating Java Recommendations...")
    
    schema = self.insights.get('schema_analysis', {})
    quality = self.insights.get('data_quality', {})
    content = self.insights.get('content_analysis', {})
    
    # Generate Java field recommendations
    java_fields = {}
    for col_name, col_info in schema.get('columns', {}).items():
        field_name = self.to_camel_case(col_name)
        
        # Determine Java type and constraints
        if 'string' in col_info['type'].lower() or 'object' in col_info['type'].lower():
            # Get length recommendation from content analysis
            max_length = 255  # default
            if col_name in content.get('content_stats', {}):
                max_length = max(500, content['content_stats'][col_name].get('percentile_95', 255))
            
            java_fields[field_name] = {
                'original_column': col_name,
                'java_type': 'String',
                'jpa_annotation': f'@Column(name = "{col_name}", length = {max_length})',
                'nullable': col_info.get('nullable', True),
                'null_percentage': quality.get('null_percentages', {}).get(col_name, 0)
            }
        
        elif 'int' in col_info['type'].lower():
            java_fields[field_name] = {
                'original_column': col_name,
                'java_type': 'Integer',
                'jpa_annotation': f'@Column(name = "{col_name}")',
                'nullable': col_info.get('nullable', True),
                'null_percentage': quality.get('null_percentages', {}).get(col_name, 0)
            }
        
        elif 'bool' in col_info['type'].lower():
            java_fields[field_name] = {
                'original_column': col_name,
                'java_type': 'Boolean',
                'jpa_annotation': f'@Column(name = "{col_name}")',
                'nullable': col_info.get('nullable', True),
                'null_percentage': quality.get('null_percentages', {}).get(col_name, 0)
            }
    
    # Processing recommendations
    processing_config = {
        'recommended_batch_size': min(5000, max(1000, self.chunk_size // 10)),
        'memory_per_batch_mb': round(self.get_memory_usage() / 10, 2),
        'estimated_processing_time_hours': round((schema.get('total_rows', 0) / 10000) / 60, 2),
        'high_priority_fields': quality.get('completeness_summary', {}).get('high_quality_fields', []),
        'validation_required_fields': quality.get('completeness_summary', {}).get('low_quality_fields', [])
    }
    
    self.insights['processing_recommendations'] = {
        'java_fields': java_fields,
        'processing_config': processing_config
    }
    
    print(f"✅ Java Recommendations Generated:")
    print(f"   - Java Fields: {len(java_fields)}")
    print(f"   - Recommended Batch Size: {processing_config['recommended_batch_size']}")

def to_camel_case(self, snake_str):
    """Convert snake_case to camelCase"""
    components = snake_str.split('_')
    return components[0] + ''.join(word.capitalize() for word in components[1:])

def generate_database_schema(self):
    """Generate optimized database schema"""
    print("\n🔍 Generating Database Schema...")
    
    java_fields = self.insights.get('processing_recommendations', {}).get('java_fields', {})
    
    # Generate CREATE TABLE statement
    create_table = "CREATE TABLE profiles (\n"
    create_table += "    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),\n"
    
    for field_name, field_info in list(java_fields.items())[:30]:  # Limit to first 30 fields for demo
        col_name = field_info['original_column']
        java_type = field_info['java_type']
        null_pct = field_info['null_percentage']
        
        # Determine SQL type and constraints
        if java_type == 'String':
            length = field_info['jpa_annotation'].split('length = ')[1].split(')')[0] if 'length =' in field_info['jpa_annotation'] else '255'
            sql_type = f"VARCHAR({length})"
        elif java_type == 'Integer':
            sql_type = "INTEGER"
        elif java_type == 'Boolean':
            sql_type = "BOOLEAN"
        else:
            sql_type = "TEXT"
        
        # Add NOT NULL for high-quality fields
        nullable = "" if null_pct > 10 else " NOT NULL" if null_pct < 1 else ""
        
        create_table += f"    {col_name} {sql_type}{nullable},\n"
    
    # Add vector embedding column
    create_table += "    embedding vector(1536),\n"
    create_table += "    created_at TIMESTAMP DEFAULT NOW(),\n"
    create_table += "    updated_at TIMESTAMP DEFAULT NOW()\n"
    create_table += ");"
    
    # Generate indexes
    indexes = []
    indexes.append("CREATE INDEX CONCURRENTLY profiles_embedding_hnsw_idx ON profiles USING hnsw (embedding vector_cosine_ops);")
    
    # Add indexes for high-quality, commonly queried fields
    high_quality_fields = self.insights.get('data_quality', {}).get('completeness_summary', {}).get('high_quality_fields', [])
    for field in high_quality_fields[:5]:  # Top 5 fields
        if field != 'id':
            indexes.append(f"CREATE INDEX CONCURRENTLY idx_profiles_{field} ON profiles({field});")
    
    self.insights['database_schema'] = {
        'create_table_sql': create_table,
        'indexes_sql': indexes
    }
    
    print(f"✅ Database Schema Generated")

def save_insights(self):
    """Save all insights to files"""
    print("\n💾 Saving Analysis Results...")
    
    # Save JSON insights
    insights_file = os.path.join(OUTPUT_DIR, 'linkedin_analysis_insights.json')
    with open(insights_file, 'w') as f:
        json.dump(self.insights, f, indent=2, default=str)
    
    # Save database schema
    schema_file = os.path.join(OUTPUT_DIR, 'optimized_schema.sql')
    with open(schema_file, 'w') as f:
        f.write(self.insights.get('database_schema', {}).get('create_table_sql', ''))
        f.write('\n\n-- Indexes\n')
        for index in self.insights.get('database_schema', {}).get('indexes_sql', []):
            f.write(index + '\n')
    
    print(f"✅ Results Saved to: {OUTPUT_DIR}")
    print(f"   - Insights: linkedin_analysis_insights.json")
    print(f"   - Database Schema: optimized_schema.sql")
    
    return insights_file, schema_file

# Add methods to analyzer class
LinkedInDataAnalyzer.generate_java_recommendations = generate_java_recommendations
LinkedInDataAnalyzer.to_camel_case = to_camel_case
LinkedInDataAnalyzer.generate_database_schema = generate_database_schema
LinkedInDataAnalyzer.save_insights = save_insights

# Now generate final insights and outputs
analyzer.generate_java_recommendations()
analyzer.generate_database_schema()
insights_file, schema_file = analyzer.save_insights()

# Display final summary
print("\n" + "="*80)
print("🎉 LINKEDIN DATASET ANALYSIS COMPLETE")
print("="*80)

schema_info = analyzer.insights['schema_analysis']
quality_info = analyzer.insights['data_quality']
processing_info = analyzer.insights['processing_recommendations']['processing_config']

print(f"\n📊 DATASET OVERVIEW:")
print(f"   📁 File Size: {schema_info.get('file_size_gb', 0)} GB")
print(f"   📋 Total Rows: {schema_info.get('total_rows', 0):,}")
print(f"   🗂️  Total Columns: {schema_info.get('total_columns', 0)}")
print(f"   🔍 Rows Analyzed: {quality_info.get('total_rows_analyzed', 0):,}")

print(f"\n🎯 DATA QUALITY SUMMARY:")
completeness = quality_info.get('completeness_summary', {})
print(f"   🟢 High Quality Fields: {len(completeness.get('high_quality_fields', []))}")
print(f"   🟡 Medium Quality Fields: {len(completeness.get('medium_quality_fields', []))}")
print(f"   🔴 Low Quality Fields: {len(completeness.get('low_quality_fields', []))}")

print(f"\n⚙️ PROCESSING RECOMMENDATIONS:")
print(f"   📦 Recommended Batch Size: {processing_info.get('recommended_batch_size', 0):,}")
print(f"   💾 Memory per Batch: {processing_info.get('memory_per_batch_mb', 0):.1f} MB")
print(f"   ⏱️  Est. Processing Time: {processing_info.get('estimated_processing_time_hours', 0):.1f} hours")

print(f"\n📁 OUTPUT FILES GENERATED:")
print(f"   📋 Analysis Report: {insights_file}")
print(f"   🗃️  Database Schema: {schema_file}")

print(f"\n🚀 NEXT STEPS:")
print("   1. Review the generated analysis files")
print("   2. Use the database schema for your PostgreSQL setup")
print("   3. Apply the processing recommendations to your Java application")
print("   4. Use the identified high-quality fields for core functionality")

print("\n" + "="*80)


🔍 Generating Java Recommendations...
✅ Java Recommendations Generated:
   - Java Fields: 62
   - Recommended Batch Size: 5000

🔍 Generating Database Schema...
✅ Database Schema Generated

💾 Saving Analysis Results...
✅ Results Saved to: /Users/chromatrical/CAREER/Side Projects/semantic-talent-finder/data/analysis_output
   - Insights: linkedin_analysis_insights.json
   - Database Schema: optimized_schema.sql

🎉 LINKEDIN DATASET ANALYSIS COMPLETE

📊 DATASET OVERVIEW:
   📁 File Size: 15.15 GB
   📋 Total Rows: 51,352,619
   🗂️  Total Columns: 62
   🔍 Rows Analyzed: 200,000

🎯 DATA QUALITY SUMMARY:
   🟢 High Quality Fields: 12
   🟡 Medium Quality Fields: 7
   🔴 Low Quality Fields: 43

⚙️ PROCESSING RECOMMENDATIONS:
   📦 Recommended Batch Size: 5,000
   💾 Memory per Batch: 67.2 MB
   ⏱️  Est. Processing Time: 85.6 hours

📁 OUTPUT FILES GENERATED:
   📋 Analysis Report: /Users/chromatrical/CAREER/Side Projects/semantic-talent-finder/data/analysis_output/linkedin_analysis_insights.json
   🗃️ 

In [27]:
def generate_java_recommendations(self):
    """Generate Java model and configuration recommendations"""
    print("\n🔍 Generating Java Recommendations...")
    
    schema = self.insights.get('schema_analysis', {})
    quality = self.insights.get('data_quality', {})
    content = self.insights.get('content_analysis', {})
    
    # Generate Java field recommendations
    java_fields = {}
    for col_name, col_info in schema.get('columns', {}).items():
        field_name = self.to_camel_case(col_name)
        
        # Determine Java type and constraints
        if 'string' in col_info['type'].lower() or 'object' in col_info['type'].lower():
            # Get length recommendation from content analysis
            max_length = 255  # default
            if col_name in content.get('content_stats', {}):
                max_length = max(500, content['content_stats'][col_name].get('percentile_95', 255))
            
            java_fields[field_name] = {
                'original_column': col_name,
                'java_type': 'String',
                'jpa_annotation': f'@Column(name = "{col_name}", length = {max_length})',
                'nullable': col_info.get('nullable', True),
                'null_percentage': quality.get('null_percentages', {}).get(col_name, 0)
            }
        
        elif 'int' in col_info['type'].lower():
            java_fields[field_name] = {
                'original_column': col_name,
                'java_type': 'Integer',
                'jpa_annotation': f'@Column(name = "{col_name}")',
                'nullable': col_info.get('nullable', True),
                'null_percentage': quality.get('null_percentages', {}).get(col_name, 0)
            }
        
        elif 'bool' in col_info['type'].lower():
            java_fields[field_name] = {
                'original_column': col_name,
                'java_type': 'Boolean',
                'jpa_annotation': f'@Column(name = "{col_name}")',
                'nullable': col_info.get('nullable', True),
                'null_percentage': quality.get('null_percentages', {}).get(col_name, 0)
            }
    
    # Processing recommendations
    processing_config = {
        'recommended_batch_size': min(5000, max(1000, self.chunk_size // 10)),
        'memory_per_batch_mb': round(self.get_memory_usage() / 10, 2),
        'estimated_processing_time_hours': round((schema.get('total_rows', 0) / 10000) / 60, 2),
        'high_priority_fields': quality.get('completeness_summary', {}).get('high_quality_fields', []),
        'validation_required_fields': quality.get('completeness_summary', {}).get('low_quality_fields', [])
    }
    
    self.insights['processing_recommendations'] = {
        'java_fields': java_fields,
        'processing_config': processing_config
    }
    
    print(f"✅ Java Recommendations Generated:")
    print(f"   - Java Fields: {len(java_fields)}")
    print(f"   - Recommended Batch Size: {processing_config['recommended_batch_size']}")

def to_camel_case(self, snake_str):
    """Convert snake_case to camelCase"""
    components = snake_str.split('_')
    return components[0] + ''.join(word.capitalize() for word in components[1:])

def generate_database_schema(self):
    """Generate optimized database schema"""
    print("\n🔍 Generating Database Schema...")
    
    java_fields = self.insights.get('processing_recommendations', {}).get('java_fields', {})
    
    # Generate CREATE TABLE statement
    create_table = "CREATE TABLE profiles (\n"
    create_table += "    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),\n"
    
    for field_name, field_info in list(java_fields.items())[:30]:  # Limit to first 30 fields for demo
        col_name = field_info['original_column']
        java_type = field_info['java_type']
        null_pct = field_info['null_percentage']
        
        # Determine SQL type and constraints
        if java_type == 'String':
            length = field_info['jpa_annotation'].split('length = ')[1].split(')')[0] if 'length =' in field_info['jpa_annotation'] else '255'
            sql_type = f"VARCHAR({length})"
        elif java_type == 'Integer':
            sql_type = "INTEGER"
        elif java_type == 'Boolean':
            sql_type = "BOOLEAN"
        else:
            sql_type = "TEXT"
        
        # Add NOT NULL for high-quality fields
        nullable = "" if null_pct > 10 else " NOT NULL" if null_pct < 1 else ""
        
        create_table += f"    {col_name} {sql_type}{nullable},\n"
    
    # Add vector embedding column
    create_table += "    embedding vector(1536),\n"
    create_table += "    created_at TIMESTAMP DEFAULT NOW(),\n"
    create_table += "    updated_at TIMESTAMP DEFAULT NOW()\n"
    create_table += ");"
    
    # Generate indexes
    indexes = []
    indexes.append("CREATE INDEX CONCURRENTLY profiles_embedding_hnsw_idx ON profiles USING hnsw (embedding vector_cosine_ops);")
    
    # Add indexes for high-quality, commonly queried fields
    high_quality_fields = self.insights.get('data_quality', {}).get('completeness_summary', {}).get('high_quality_fields', [])
    for field in high_quality_fields[:5]:  # Top 5 fields
        if field != 'id':
            indexes.append(f"CREATE INDEX CONCURRENTLY idx_profiles_{field} ON profiles({field});")
    
    self.insights['database_schema'] = {
        'create_table_sql': create_table,
        'indexes_sql': indexes
    }
    
    print(f"✅ Database Schema Generated")

def save_insights(self):
    """Save all insights to files"""
    print("\n💾 Saving Analysis Results...")
    
    # Save JSON insights
    insights_file = os.path.join(OUTPUT_DIR, 'linkedin_analysis_insights.json')
    with open(insights_file, 'w') as f:
        json.dump(self.insights, f, indent=2, default=str)
    
    # Save database schema
    schema_file = os.path.join(OUTPUT_DIR, 'optimized_schema.sql')
    with open(schema_file, 'w') as f:
        f.write(self.insights.get('database_schema', {}).get('create_table_sql', ''))
        f.write('\n\n-- Indexes\n')
        for index in self.insights.get('database_schema', {}).get('indexes_sql', []):
            f.write(index + '\n')
    
    print(f"✅ Results Saved to: {OUTPUT_DIR}")
    print(f"   - Insights: linkedin_analysis_insights.json")
    print(f"   - Database Schema: optimized_schema.sql")
    
    return insights_file, schema_file

# Add methods to analyzer class
LinkedInDataAnalyzer.generate_java_recommendations = generate_java_recommendations
LinkedInDataAnalyzer.to_camel_case = to_camel_case
LinkedInDataAnalyzer.generate_database_schema = generate_database_schema
LinkedInDataAnalyzer.save_insights = save_insights

print("✅ Methods added to LinkedInDataAnalyzer class")

✅ Methods added to LinkedInDataAnalyzer class


In [28]:
# Now run the complete analysis pipeline
analyzer.generate_java_recommendations()
analyzer.generate_database_schema()
insights_file, schema_file = analyzer.save_insights()

# Display final summary
print("\n" + "="*80)
print("🎉 LINKEDIN DATASET ANALYSIS COMPLETE")
print("="*80)

schema_info = analyzer.insights['schema_analysis']
quality_info = analyzer.insights['data_quality']
processing_info = analyzer.insights['processing_recommendations']['processing_config']

print(f"\n📊 DATASET OVERVIEW:")
print(f"   📁 File Size: {schema_info.get('file_size_gb', 0)} GB")
print(f"   📋 Total Rows: {schema_info.get('total_rows', 0):,}")
print(f"   🗂️  Total Columns: {schema_info.get('total_columns', 0)}")
print(f"   🔍 Rows Analyzed: {quality_info.get('total_rows_analyzed', 0):,}")

print(f"\n🎯 DATA QUALITY SUMMARY:")
completeness = quality_info.get('completeness_summary', {})
print(f"   🟢 High Quality Fields: {len(completeness.get('high_quality_fields', []))}")
print(f"   🟡 Medium Quality Fields: {len(completeness.get('medium_quality_fields', []))}")
print(f"   🔴 Low Quality Fields: {len(completeness.get('low_quality_fields', []))}")

print(f"\n⚙️ PROCESSING RECOMMENDATIONS:")
print(f"   📦 Recommended Batch Size: {processing_info.get('recommended_batch_size', 0):,}")
print(f"   💾 Memory per Batch: {processing_info.get('memory_per_batch_mb', 0):.1f} MB")
print(f"   ⏱️  Est. Processing Time: {processing_info.get('estimated_processing_time_hours', 0):.1f} hours")

print(f"\n📁 OUTPUT FILES GENERATED:")
print(f"   📋 Analysis Report: {insights_file}")
print(f"   🗃️  Database Schema: {schema_file}")

print(f"\n🚀 NEXT STEPS:")
print("   1. Review the generated analysis files")
print("   2. Use the database schema for your PostgreSQL setup")
print("   3. Apply the processing recommendations to your Java application")
print("   4. Use the identified high-quality fields for core functionality")

print("\n" + "="*80)


🔍 Generating Java Recommendations...
✅ Java Recommendations Generated:
   - Java Fields: 62
   - Recommended Batch Size: 5000

🔍 Generating Database Schema...
✅ Database Schema Generated

💾 Saving Analysis Results...
✅ Results Saved to: /Users/chromatrical/CAREER/Side Projects/semantic-talent-finder/data/analysis_output
   - Insights: linkedin_analysis_insights.json
   - Database Schema: optimized_schema.sql

🎉 LINKEDIN DATASET ANALYSIS COMPLETE

📊 DATASET OVERVIEW:
   📁 File Size: 15.15 GB
   📋 Total Rows: 51,352,619
   🗂️  Total Columns: 62
   🔍 Rows Analyzed: 200,000

🎯 DATA QUALITY SUMMARY:
   🟢 High Quality Fields: 12
   🟡 Medium Quality Fields: 7
   🔴 Low Quality Fields: 43

⚙️ PROCESSING RECOMMENDATIONS:
   📦 Recommended Batch Size: 5,000
   💾 Memory per Batch: 67.3 MB
   ⏱️  Est. Processing Time: 85.6 hours

📁 OUTPUT FILES GENERATED:
   📋 Analysis Report: /Users/chromatrical/CAREER/Side Projects/semantic-talent-finder/data/analysis_output/linkedin_analysis_insights.json
   🗃️ 

In [29]:
# ENHANCED ANALYSIS FOR SEMANTIC TALENT FINDER PROJECT
# =====================================================

print("\n🚀 STARTING ENHANCED ANALYSIS FOR SEMANTIC TALENT FINDER")
print("="*80)

def analyze_skills_data(self):
    """Deep analysis of skills data for semantic search optimization"""
    print("\n🎯 Analyzing Skills Data for Semantic Search...")
    
    skills_insights = {
        'skills_columns': [],
        'skills_patterns': {},
        'skills_standardization': {},
        'top_skills': {},
        'skills_combinations': {}
    }
    
    chunk_count = 0
    all_skills = []
    skills_by_column = {}
    
    try:
        parquet_file = pq.ParquetFile(self.parquet_file)
        
        # Identify skills-related columns
        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
            chunk_df = batch.to_pandas()
            
            if chunk_count == 0:
                # Find skills-related columns
                skills_columns = [col for col in chunk_df.columns 
                                if any(keyword in col.lower() for keyword in 
                                      ['skill', 'competenc', 'expertise', 'technolog'])]
                skills_insights['skills_columns'] = skills_columns
                print(f"   Found skills columns: {skills_columns}")
            
            # Analyze skills content
            for col in skills_insights['skills_columns'][:5]:  # Analyze top 5 skills columns
                if col not in skills_by_column:
                    skills_by_column[col] = []
                
                # Extract skills data
                skills_data = chunk_df[col].dropna()
                if len(skills_data) > 0:
                    # Sample skills for analysis
                    sample_skills = skills_data.iloc[:50].tolist()
                    skills_by_column[col].extend(sample_skills)
                    
                    # Parse individual skills (assuming comma-separated or similar)
                    for skill_entry in sample_skills:
                        if isinstance(skill_entry, str):
                            # Try different delimiters
                            for delimiter in [',', ';', '|', '\n']:
                                if delimiter in skill_entry:
                                    individual_skills = [s.strip() for s in skill_entry.split(delimiter)]
                                    all_skills.extend(individual_skills)
                                    break
                            else:
                                # No delimiter found, treat as single skill
                                all_skills.append(skill_entry.strip())
            
            chunk_count += 1
            del chunk_df
            gc.collect()
            
            if chunk_count >= 5:  # Limit for analysis
                break
        
        # Analyze skills patterns
        if all_skills:
            from collections import Counter
            skills_counter = Counter([skill.lower().strip() for skill in all_skills if skill.strip()])
            
            skills_insights['top_skills'] = dict(skills_counter.most_common(50))
            skills_insights['total_unique_skills'] = len(skills_counter)
            skills_insights['total_skill_mentions'] = sum(skills_counter.values())
            
            # Categorize skills
            tech_keywords = ['python', 'java', 'javascript', 'react', 'node', 'sql', 'aws', 'docker']
            soft_keywords = ['leadership', 'communication', 'management', 'teamwork', 'problem']
            
            tech_skills = [skill for skill in skills_counter.keys() 
                          if any(tech in skill for tech in tech_keywords)]
            soft_skills = [skill for skill in skills_counter.keys() 
                          if any(soft in skill for soft in soft_keywords)]
            
            skills_insights['skills_categories'] = {
                'technical_skills': tech_skills[:20],
                'soft_skills': soft_skills[:20],
                'tech_percentage': round(len(tech_skills) / len(skills_counter) * 100, 2),
                'soft_percentage': round(len(soft_skills) / len(skills_counter) * 100, 2)
            }
        
        self.insights['skills_analysis'] = skills_insights
        
        print(f"✅ Skills Analysis Complete:")
        print(f"   - Skills Columns: {len(skills_insights['skills_columns'])}")
        print(f"   - Unique Skills Found: {skills_insights.get('total_unique_skills', 0)}")
        print(f"   - Tech Skills: {len(skills_insights.get('skills_categories', {}).get('technical_skills', []))}")
        print(f"   - Soft Skills: {len(skills_insights.get('skills_categories', {}).get('soft_skills', []))}")
        
    except Exception as e:
        print(f"❌ Skills analysis failed: {e}")

# Add method to analyzer class
LinkedInDataAnalyzer.analyze_skills_data = analyze_skills_data

# Run skills analysis
analyzer.analyze_skills_data()


🚀 STARTING ENHANCED ANALYSIS FOR SEMANTIC TALENT FINDER

🎯 Analyzing Skills Data for Semantic Search...
   Found skills columns: ['Skills']
✅ Skills Analysis Complete:
   - Skills Columns: 1
   - Unique Skills Found: 1871
   - Tech Skills: 18
   - Soft Skills: 20


In [30]:
def analyze_text_content_for_embeddings(self):
    """Analyze text content to optimize embedding generation strategy"""
    print("\n📝 Analyzing Text Content for Embedding Strategy...")
    
    text_content_insights = {
        'text_fields': [],
        'content_quality': {},
        'embedding_strategy': {},
        'text_combinations': {}
    }
    
    chunk_count = 0
    text_samples = {}
    
    try:
        parquet_file = pq.ParquetFile(self.parquet_file)
        
        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
            chunk_df = batch.to_pandas()
            
            if chunk_count == 0:
                # Identify text fields suitable for embeddings
                text_fields = []
                for col in chunk_df.columns:
                    col_lower = col.lower()
                    # High-value text fields for semantic search
                    if any(keyword in col_lower for keyword in 
                          ['summary', 'bio', 'description', 'about', 'headline', 
                           'experience', 'background', 'profile']):
                        text_fields.append(col)
                    # Job/role related fields
                    elif any(keyword in col_lower for keyword in 
                            ['title', 'role', 'position', 'job']):
                        text_fields.append(col)
                    # Industry/company fields
                    elif any(keyword in col_lower for keyword in 
                            ['industry', 'company', 'organization']):
                        text_fields.append(col)
                
                text_content_insights['text_fields'] = text_fields
                print(f"   Identified text fields for embeddings: {text_fields[:10]}...")
            
            # Sample text content
            for field in text_content_insights['text_fields'][:15]:
                if field not in text_samples:
                    text_samples[field] = []
                
                field_data = chunk_df[field].dropna()
                if len(field_data) > 0:
                    samples = field_data.iloc[:20].tolist()
                    text_samples[field].extend(samples)
            
            chunk_count += 1
            del chunk_df
            gc.collect()
            
            if chunk_count >= 3:
                break
        
        # Analyze text quality and patterns
        for field, samples in text_samples.items():
            if samples:
                # Calculate text statistics
                lengths = [len(str(sample)) for sample in samples if sample]
                if lengths:
                    text_content_insights['content_quality'][field] = {
                        'avg_length': round(np.mean(lengths), 2),
                        'max_length': max(lengths),
                        'min_length': min(lengths),
                        'sample_count': len(samples),
                        'sample_text': samples[0][:200] if samples[0] else "",
                        'information_density': round(np.mean(lengths) / 100, 2)  # chars per 100
                    }
        
        # Generate embedding strategy recommendations
        high_value_fields = []
        medium_value_fields = []
        
        for field, quality in text_content_insights['content_quality'].items():
            if quality['avg_length'] > 50 and quality['information_density'] > 0.5:
                high_value_fields.append(field)
            elif quality['avg_length'] > 20:
                medium_value_fields.append(field)
        
        text_content_insights['embedding_strategy'] = {
            'primary_text_fields': high_value_fields,
            'secondary_text_fields': medium_value_fields,
            'recommended_combination': ' | '.join(high_value_fields[:3]),
            'embedding_length_estimate': sum([
                text_content_insights['content_quality'].get(field, {}).get('avg_length', 0) 
                for field in high_value_fields[:3]
            ])
        }
        
        self.insights['text_content_analysis'] = text_content_insights
        
        print(f"✅ Text Content Analysis Complete:")
        print(f"   - Text Fields Identified: {len(text_content_insights['text_fields'])}")
        print(f"   - High-Value Fields: {len(high_value_fields)}")
        print(f"   - Recommended Primary Fields: {high_value_fields[:3]}")
        
    except Exception as e:
        print(f"❌ Text content analysis failed: {e}")

# Add method to analyzer
LinkedInDataAnalyzer.analyze_text_content_for_embeddings = analyze_text_content_for_embeddings

# Run text content analysis
analyzer.analyze_text_content_for_embeddings()


📝 Analyzing Text Content for Embedding Strategy...
   Identified text fields for embeddings: ['Industry', 'Job title', 'Sub Role', 'Industry 2', 'Company Name', 'Company Industry', 'Company Website', 'Company Size', 'Company Founded', 'Company Linkedin Url']...
✅ Text Content Analysis Complete:
   - Text Fields Identified: 25
   - High-Value Fields: 0
   - Recommended Primary Fields: []


In [31]:
def analyze_geographic_hierarchy(self):
    """Analyze geographic data structure and create location hierarchy"""
    print("\n🌍 Analyzing Geographic Data Structure...")
    
    geo_insights = {
        'location_fields': [],
        'location_hierarchy': {},
        'location_patterns': {},
        'geo_standardization': {}
    }
    
    chunk_count = 0
    location_data = {}
    
    try:
        parquet_file = pq.ParquetFile(self.parquet_file)
        
        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
            chunk_df = batch.to_pandas()
            
            if chunk_count == 0:
                # Identify location fields
                location_fields = [col for col in chunk_df.columns 
                                 if any(keyword in col.lower() for keyword in 
                                       ['location', 'city', 'state', 'country', 'region', 
                                        'metro', 'locality', 'continent'])]
                geo_insights['location_fields'] = location_fields
                print(f"   Found location fields: {location_fields}")
            
            # Sample location data
            for field in geo_insights['location_fields']:
                if field not in location_data:
                    location_data[field] = []
                
                field_data = chunk_df[field].dropna()
                if len(field_data) > 0:
                    samples = field_data.iloc[:100].tolist()
                    location_data[field].extend(samples)
            
            chunk_count += 1
            del chunk_df
            gc.collect()
            
            if chunk_count >= 3:
                break
        
        # Analyze location patterns
        from collections import Counter
        
        for field, data in location_data.items():
            if data:
                # Count frequency of locations
                location_counter = Counter(data)
                geo_insights['location_patterns'][field] = {
                    'top_locations': dict(location_counter.most_common(20)),
                    'unique_count': len(location_counter),
                    'total_entries': len(data)
                }
                
                # Analyze location format patterns
                sample_locations = [str(loc) for loc in data[:20]]
                patterns = {
                    'comma_separated': sum(1 for loc in sample_locations if ',' in loc),
                    'has_state_codes': sum(1 for loc in sample_locations 
                                         if any(code in loc for code in [' CA', ' NY', ' TX', ' FL'])),
                    'has_country': sum(1 for loc in sample_locations 
                                     if any(country in loc.upper() for country in ['USA', 'US', 'UNITED STATES'])),
                    'avg_length': round(np.mean([len(loc) for loc in sample_locations]), 2)
                }
                geo_insights['location_patterns'][field]['format_patterns'] = patterns
        
        # Create location hierarchy mapping
        hierarchy_mapping = {}
        
        # Try to map hierarchy relationships
        for field in geo_insights['location_fields']:
            field_lower = field.lower()
            if 'continent' in field_lower:
                hierarchy_mapping['continent'] = field
            elif 'country' in field_lower:
                hierarchy_mapping['country'] = field
            elif 'region' in field_lower or 'state' in field_lower:
                hierarchy_mapping['region'] = field
            elif 'metro' in field_lower:
                hierarchy_mapping['metro'] = field
            elif 'locality' in field_lower or 'city' in field_lower:
                hierarchy_mapping['locality'] = field
            elif 'location' in field_lower and 'country' not in field_lower:
                hierarchy_mapping['primary_location'] = field
        
        geo_insights['location_hierarchy'] = hierarchy_mapping
        
        # Generate standardization recommendations
        geo_insights['geo_standardization'] = {
            'primary_location_field': hierarchy_mapping.get('primary_location', 'Location'),
            'hierarchy_fields': list(hierarchy_mapping.values()),
            'needs_parsing': any('comma_separated' in geo_insights['location_patterns'].get(field, {}).get('format_patterns', {}) 
                               and geo_insights['location_patterns'][field]['format_patterns']['comma_separated'] > 10
                               for field in geo_insights['location_fields']),
            'standardization_priority': sorted(geo_insights['location_fields'], 
                                             key=lambda x: geo_insights['location_patterns'].get(x, {}).get('unique_count', 0), 
                                             reverse=True)[:5]
        }
        
        self.insights['geographic_analysis'] = geo_insights
        
        print(f"✅ Geographic Analysis Complete:")
        print(f"   - Location Fields: {len(geo_insights['location_fields'])}")
        print(f"   - Hierarchy Levels: {len(hierarchy_mapping)}")
        print(f"   - Primary Location: {geo_insights['geo_standardization']['primary_location_field']}")
        
    except Exception as e:
        print(f"❌ Geographic analysis failed: {e}")

def analyze_professional_experience(self):
    """Analyze professional experience and career progression patterns"""
    print("\n💼 Analyzing Professional Experience Patterns...")
    
    experience_insights = {
        'experience_fields': [],
        'experience_levels': {},
        'career_patterns': {},
        'seniority_indicators': {}
    }
    
    chunk_count = 0
    experience_data = {}
    
    try:
        parquet_file = pq.ParquetFile(self.parquet_file)
        
        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
            chunk_df = batch.to_pandas()
            
            if chunk_count == 0:
                # Identify experience-related fields
                experience_fields = []
                for col in chunk_df.columns:
                    col_lower = col.lower()
                    if any(keyword in col_lower for keyword in 
                          ['experience', 'year', 'senior', 'junior', 'level', 
                           'title', 'position', 'role']):
                        experience_fields.append(col)
                
                experience_insights['experience_fields'] = experience_fields
                print(f"   Found experience fields: {experience_fields[:10]}...")
            
            # Sample experience data
            for field in experience_insights['experience_fields'][:10]:
                if field not in experience_data:
                    experience_data[field] = []
                
                field_data = chunk_df[field].dropna()
                if len(field_data) > 0:
                    samples = field_data.iloc[:50].tolist()
                    experience_data[field].extend(samples)
            
            chunk_count += 1
            del chunk_df
            gc.collect()
            
            if chunk_count >= 3:
                break
        
        # Analyze seniority patterns in job titles
        title_fields = [field for field in experience_insights['experience_fields'] 
                       if 'title' in field.lower() or 'position' in field.lower()]
        
        seniority_keywords = {
            'entry': ['intern', 'junior', 'entry', 'associate', 'trainee'],
            'mid': ['mid', 'specialist', 'analyst', 'coordinator'],
            'senior': ['senior', 'lead', 'principal', 'staff'],
            'management': ['manager', 'director', 'head', 'chief', 'vp', 'vice president'],
            'executive': ['ceo', 'cto', 'cfo', 'president', 'founder']
        }
        
        seniority_analysis = {}
        for field in title_fields[:3]:  # Analyze top 3 title fields
            if field in experience_data:
                titles = [str(title).lower() for title in experience_data[field]]
                
                level_counts = {}
                for level, keywords in seniority_keywords.items():
                    count = sum(1 for title in titles 
                              if any(keyword in title for keyword in keywords))
                    level_counts[level] = count
                
                seniority_analysis[field] = level_counts
        
        experience_insights['seniority_indicators'] = seniority_analysis
        
        self.insights['experience_analysis'] = experience_insights
        
        print(f"✅ Experience Analysis Complete:")
        print(f"   - Experience Fields: {len(experience_insights['experience_fields'])}")
        print(f"   - Title Fields Analyzed: {len(title_fields)}")
        
    except Exception as e:
        print(f"❌ Experience analysis failed: {e}")

# Add methods to analyzer
LinkedInDataAnalyzer.analyze_geographic_hierarchy = analyze_geographic_hierarchy
LinkedInDataAnalyzer.analyze_professional_experience = analyze_professional_experience

# Run geographic and experience analysis
analyzer.analyze_geographic_hierarchy()
analyzer.analyze_professional_experience()


🌍 Analyzing Geographic Data Structure...
   Found location fields: ['Location', 'Locality', 'Metro', 'Region', 'Company Location Name', 'Company Location Locality', 'Company Location Metro', 'Company Location Region', 'Company Location Geo', 'Company Location Street Address', 'Company Location Address Line 2', 'Company Location Postal Code', 'Company Location Country', 'Company Location Continent', 'Location Country', 'Location Continent', 'Location Geo']
✅ Geographic Analysis Complete:
   - Location Fields: 17
   - Hierarchy Levels: 6
   - Primary Location: Location Geo

💼 Analyzing Professional Experience Patterns...
   Found experience fields: ['Job title', 'Sub Role', 'Birth Year', 'Years Experience']...
✅ Experience Analysis Complete:
   - Experience Fields: 4
   - Title Fields Analyzed: 1


In [32]:
def analyze_industry_company_data(self):
    """Deep dive into industry and company data standardization"""
    print("\n🏢 Analyzing Industry and Company Data...")
    
    industry_insights = {
        'industry_fields': [],
        'company_fields': [],
        'industry_standardization': {},
        'company_patterns': {},
        'business_intelligence': {}
    }
    
    chunk_count = 0
    industry_data = {}
    company_data = {}
    
    try:
        parquet_file = pq.ParquetFile(self.parquet_file)
        
        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
            chunk_df = batch.to_pandas()
            
            if chunk_count == 0:
                # Identify industry and company fields
                industry_fields = [col for col in chunk_df.columns 
                                 if 'industry' in col.lower()]
                company_fields = [col for col in chunk_df.columns 
                                if 'company' in col.lower() or 'organization' in col.lower()]
                
                industry_insights['industry_fields'] = industry_fields
                industry_insights['company_fields'] = company_fields
                print(f"   Industry fields: {industry_fields}")
                print(f"   Company fields: {company_fields}")
            
            # Sample industry data
            for field in industry_insights['industry_fields']:
                if field not in industry_data:
                    industry_data[field] = []
                
                field_data = chunk_df[field].dropna()
                if len(field_data) > 0:
                    samples = field_data.iloc[:100].tolist()
                    industry_data[field].extend(samples)
            
            # Sample company data
            for field in industry_insights['company_fields']:
                if field not in company_data:
                    company_data[field] = []
                
                field_data = chunk_df[field].dropna()
                if len(field_data) > 0:
                    samples = field_data.iloc[:50].tolist()
                    company_data[field].extend(samples)
            
            chunk_count += 1
            del chunk_df
            gc.collect()
            
            if chunk_count >= 3:
                break
        
        # Analyze industry standardization needs
        from collections import Counter
        
        for field, data in industry_data.items():
            if data:
                industry_counter = Counter([str(ind).strip().title() for ind in data])
                
                # Group similar industries
                tech_industries = [ind for ind in industry_counter.keys() 
                                 if any(keyword in ind.lower() for keyword in 
                                       ['technology', 'software', 'tech', 'it', 'computer'])]
                finance_industries = [ind for ind in industry_counter.keys() 
                                    if any(keyword in ind.lower() for keyword in 
                                          ['finance', 'financial', 'bank', 'investment'])]
                healthcare_industries = [ind for ind in industry_counter.keys() 
                                       if any(keyword in ind.lower() for keyword in 
                                             ['health', 'medical', 'pharma', 'hospital'])]
                
                industry_insights['industry_standardization'][field] = {
                    'top_industries': dict(industry_counter.most_common(30)),
                    'total_unique': len(industry_counter),
                    'industry_clusters': {
                        'technology': tech_industries[:10],
                        'finance': finance_industries[:10],
                        'healthcare': healthcare_industries[:10]
                    },
                    'standardization_needed': len(industry_counter) > 1000
                }
        
        # Analyze company patterns
        for field, data in company_data.items():
            if data:
                company_counter = Counter([str(comp).strip() for comp in data])
                
                # Identify Fortune 500 indicators (common large companies)
                large_company_indicators = ['microsoft', 'google', 'amazon', 'apple', 
                                          'facebook', 'meta', 'netflix', 'tesla', 'ibm']
                large_companies = [comp for comp in company_counter.keys() 
                                 if any(indicator in comp.lower() for indicator in large_company_indicators)]
                
                industry_insights['company_patterns'][field] = {
                    'top_companies': dict(company_counter.most_common(20)),
                    'total_unique': len(company_counter),
                    'large_companies_found': large_companies[:10],
                    'avg_company_name_length': round(np.mean([len(comp) for comp in company_counter.keys()]), 2)
                }
        
        # Generate business intelligence insights
        industry_insights['business_intelligence'] = {
            'dominant_industries': [],
            'company_size_indicators': [],
            'market_insights': {}
        }
        
        # Find dominant industries across all industry fields
        all_industries = []
        for field_data in industry_data.values():
            all_industries.extend(field_data)
        
        if all_industries:
            all_industry_counter = Counter([str(ind).strip().title() for ind in all_industries])
            industry_insights['business_intelligence']['dominant_industries'] = dict(all_industry_counter.most_common(15))
        
        self.insights['industry_company_analysis'] = industry_insights
        
        print(f"✅ Industry & Company Analysis Complete:")
        print(f"   - Industry Fields: {len(industry_insights['industry_fields'])}")
        print(f"   - Company Fields: {len(industry_insights['company_fields'])}")
        
    except Exception as e:
        print(f"❌ Industry/company analysis failed: {e}")

# Add method to analyzer
LinkedInDataAnalyzer.analyze_industry_company_data = analyze_industry_company_data

# Run industry and company analysis
analyzer.analyze_industry_company_data()


🏢 Analyzing Industry and Company Data...
   Industry fields: ['Industry', 'Industry 2', 'Company Industry']
   Company fields: ['Company Name', 'Company Industry', 'Company Website', 'Company Size', 'Company Founded', 'Company Linkedin Url', 'Company Facebook Url', 'Company Twitter Url', 'Company Location Name', 'Company Location Locality', 'Company Location Metro', 'Company Location Region', 'Company Location Geo', 'Company Location Street Address', 'Company Location Address Line 2', 'Company Location Postal Code', 'Company Location Country', 'Company Location Continent']
✅ Industry & Company Analysis Complete:
   - Industry Fields: 3
   - Company Fields: 18


In [33]:
def generate_enhanced_database_schema(self):
    """Generate production-ready database schema optimized for semantic search"""
    print("\n🗄️ Generating Enhanced Database Schema for Semantic Talent Finder...")
    
    # Collect all analysis insights
    schema_analysis = self.insights.get('schema_analysis', {})
    quality_analysis = self.insights.get('data_quality', {})
    skills_analysis = self.insights.get('skills_analysis', {})
    text_analysis = self.insights.get('text_content_analysis', {})
    geo_analysis = self.insights.get('geographic_analysis', {})
    experience_analysis = self.insights.get('experience_analysis', {})
    industry_analysis = self.insights.get('industry_company_analysis', {})
    
    # Generate optimized CREATE TABLE statement
    create_table = "-- Semantic Talent Finder - Production Database Schema\\n"
    create_table += "-- Generated from LinkedIn dataset analysis\\n"
    create_table += "-- Optimized for vector similarity search and AI-powered matching\\n\\n"
    
    # Enable required extensions
    create_table += "-- Enable required PostgreSQL extensions\\n"
    create_table += "CREATE EXTENSION IF NOT EXISTS vector;\\n"
    create_table += "CREATE EXTENSION IF NOT EXISTS pg_trgm;\\n"
    create_table += "CREATE EXTENSION IF NOT EXISTS btree_gin;\\n\\n"
    
    # Main profiles table
    create_table += "-- Main profiles table with semantic search optimization\\n"
    create_table += "CREATE TABLE profiles (\\n"
    create_table += "    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),\\n"
    create_table += "    \\n"
    create_table += "    -- Core Identity Fields (High Quality - Low Null %)\\n"
    
    # Add high-quality core fields
    high_quality_fields = quality_analysis.get('completeness_summary', {}).get('high_quality_fields', [])
    
    core_identity_mapping = {
        'Full name': ('full_name', 'VARCHAR(500)', 'NOT NULL'),
        'First Name': ('first_name', 'VARCHAR(100)', 'NOT NULL'),
        'Last Name': ('last_name', 'VARCHAR(100)', 'NOT NULL'),
        'LinkedIn Url': ('linkedin_url', 'VARCHAR(500)', 'NOT NULL'),
        'LinkedIn Username': ('linkedin_username', 'VARCHAR(100)', 'NOT NULL')
    }
    
    for original_field, (db_field, data_type, constraint) in core_identity_mapping.items():
        if original_field in high_quality_fields:
            create_table += f"    {db_field} {data_type} {constraint},\\n"
    
    create_table += "    \\n"
    create_table += "    -- Professional Information\\n"
    
    # Professional fields based on text analysis
    text_fields = text_analysis.get('text_fields', [])
    professional_mapping = {
        'Job title': ('job_title', 'VARCHAR(300)'),
        'Industry': ('industry', 'VARCHAR(200)'),
        'Company Name': ('company_name', 'VARCHAR(300)'),
        'Company Industry': ('company_industry', 'VARCHAR(200)')
    }
    
    for original_field, (db_field, data_type) in professional_mapping.items():
        null_pct = quality_analysis.get('null_percentages', {}).get(original_field, 100)
        constraint = ' NOT NULL' if null_pct < 1 else ''
        create_table += f"    {db_field} {data_type}{constraint},\\n"
    
    create_table += "    \\n"
    create_table += "    -- Geographic Information (Hierarchical)\\n"
    
    # Geographic fields based on hierarchy analysis
    geo_hierarchy = geo_analysis.get('location_hierarchy', {})
    geographic_mapping = [
        ('location', 'VARCHAR(500)', 'Primary location string'),
        ('locality', 'VARCHAR(200)', 'City/locality'),
        ('region', 'VARCHAR(200)', 'State/region'),  
        ('country', 'VARCHAR(100)', 'Country'),
        ('continent', 'VARCHAR(50)', 'Continent'),
        ('metro_area', 'VARCHAR(200)', 'Metropolitan area')
    ]
    
    for db_field, data_type, comment in geographic_mapping:
        original_field = next((k for k, v in geo_hierarchy.items() if v.lower().replace(' ', '_') == db_field), None)
        if original_field or db_field == 'location':
            null_pct = quality_analysis.get('null_percentages', {}).get('Location', 100)
            constraint = ' NOT NULL' if null_pct < 1 and db_field == 'location' else ''
            create_table += f"    {db_field} {data_type}{constraint}, -- {comment}\\n"
    
    create_table += "    \\n"
    create_table += "    -- Skills and Expertise (Optimized for Search)\\n"
    
    # Skills based on skills analysis
    skills_columns = skills_analysis.get('skills_columns', [])
    if skills_columns:
        create_table += "    skills TEXT[], -- Array of skills for efficient querying\\n"
        create_table += "    skills_text TEXT, -- Concatenated skills for full-text search\\n"
        create_table += "    technical_skills TEXT[], -- Technical skills subset\\n"
        create_table += "    soft_skills TEXT[], -- Soft skills subset\\n"
    
    create_table += "    \\n"
    create_table += "    -- Experience and Seniority\\n"
    create_table += "    years_experience INTEGER, -- Calculated years of experience\\n"
    create_table += "    experience_level VARCHAR(50), -- entry, mid, senior, management, executive\\n"
    create_table += "    seniority_score INTEGER, -- Computed seniority score (0-100)\\n"
    
    create_table += "    \\n"
    create_table += "    -- Computed Text Fields for Semantic Search\\n"
    
    # Text fields optimized for embeddings based on text analysis
    primary_fields = text_analysis.get('embedding_strategy', {}).get('primary_text_fields', [])
    create_table += "    headline VARCHAR(500), -- Professional headline/summary\\n"
    create_table += "    professional_summary TEXT, -- Combined professional description\\n"
    create_table += "    searchable_content TEXT, -- Optimized content for embedding generation\\n"
    
    create_table += "    \\n"
    create_table += "    -- Contact Information (Optional)\\n"
    create_table += "    email VARCHAR(320), -- RFC 5322 compliant length\\n"
    create_table += "    phone VARCHAR(50),\\n"
    create_table += "    mobile VARCHAR(50),\\n"
    
    create_table += "    \\n"
    create_table += "    -- Social Media\\n"
    create_table += "    facebook_url VARCHAR(500),\\n"
    create_table += "    twitter_url VARCHAR(500),\\n"
    
    create_table += "    \\n"
    create_table += "    -- Company Details\\n"
    create_table += "    company_website VARCHAR(500),\\n"
    create_table += "    company_size VARCHAR(100),\\n"
    create_table += "    company_founded VARCHAR(20),\\n"
    
    create_table += "    \\n"
    create_table += "    -- AI/ML Fields\\n"
    create_table += "    embedding vector(1536) NOT NULL, -- OpenAI text-embedding-3-small\\n"
    create_table += "    embedding_version VARCHAR(20) DEFAULT 'v1.0', -- Track embedding model version\\n"
    create_table += "    content_hash VARCHAR(64), -- SHA-256 hash for detecting changes\\n"
    
    create_table += "    \\n"
    create_table += "    -- Metadata\\n"
    create_table += "    data_source VARCHAR(100) DEFAULT 'linkedin',\\n"
    create_table += "    data_quality_score DECIMAL(3,2), -- 0.00 to 1.00\\n"
    create_table += "    last_profile_update TIMESTAMP,\\n"
    create_table += "    created_at TIMESTAMPTZ DEFAULT NOW(),\\n"
    create_table += "    updated_at TIMESTAMPTZ DEFAULT NOW()\\n"
    create_table += ");\\n\\n"
    
    # Generate optimized indexes
    indexes = []
    indexes.append("-- Vector Similarity Index (HNSW for fast approximate search)")
    indexes.append("CREATE INDEX profiles_embedding_hnsw_idx ON profiles USING hnsw (embedding vector_cosine_ops)")
    indexes.append("    WITH (m = 16, ef_construction = 64);")
    indexes.append("")
    
    indexes.append("-- Core Identity Indexes")
    indexes.append("CREATE UNIQUE INDEX profiles_linkedin_username_idx ON profiles(linkedin_username);")
    indexes.append("CREATE INDEX profiles_full_name_idx ON profiles(full_name);") 
    indexes.append("CREATE INDEX profiles_name_trgm_idx ON profiles USING gin (full_name gin_trgm_ops);")
    indexes.append("")
    
    indexes.append("-- Professional Search Indexes")
    indexes.append("CREATE INDEX profiles_job_title_idx ON profiles(job_title);")
    indexes.append("CREATE INDEX profiles_industry_idx ON profiles(industry);")
    indexes.append("CREATE INDEX profiles_company_idx ON profiles(company_name);")
    indexes.append("CREATE INDEX profiles_experience_level_idx ON profiles(experience_level);")
    indexes.append("")
    
    indexes.append("-- Geographic Search Indexes")
    indexes.append("CREATE INDEX profiles_location_idx ON profiles(location);")
    indexes.append("CREATE INDEX profiles_region_idx ON profiles(region);")
    indexes.append("CREATE INDEX profiles_country_idx ON profiles(country);")
    indexes.append("CREATE INDEX profiles_geo_hierarchy_idx ON profiles(country, region, locality);")
    indexes.append("")
    
    indexes.append("-- Skills Search Indexes")
    if skills_columns:
        indexes.append("CREATE INDEX profiles_skills_gin_idx ON profiles USING gin (skills);")
        indexes.append("CREATE INDEX profiles_technical_skills_gin_idx ON profiles USING gin (technical_skills);")
        indexes.append("CREATE INDEX profiles_skills_text_trgm_idx ON profiles USING gin (skills_text gin_trgm_ops);")
        indexes.append("")
    
    indexes.append("-- Full-Text Search Index")
    indexes.append("CREATE INDEX profiles_searchable_content_fts_idx ON profiles USING gin (to_tsvector('english', searchable_content));")
    indexes.append("")
    
    indexes.append("-- Performance Indexes")
    indexes.append("CREATE INDEX profiles_quality_score_idx ON profiles(data_quality_score) WHERE data_quality_score >= 0.7;")
    indexes.append("CREATE INDEX profiles_updated_at_idx ON profiles(updated_at);")
    indexes.append("CREATE INDEX profiles_compound_search_idx ON profiles(industry, experience_level, country) WHERE data_quality_score >= 0.7;")
    
    # Additional tables for semantic search optimization
    additional_tables = []
    additional_tables.append("\\n-- Skills lookup table for standardization")
    additional_tables.append("CREATE TABLE skills_dictionary (")
    additional_tables.append("    id SERIAL PRIMARY KEY,")
    additional_tables.append("    skill_name VARCHAR(200) NOT NULL UNIQUE,")
    additional_tables.append("    skill_category VARCHAR(100), -- technical, soft, industry")
    additional_tables.append("    skill_type VARCHAR(50), -- programming, framework, tool, etc.")
    additional_tables.append("    synonyms TEXT[], -- Alternative names")
    additional_tables.append("    popularity_score INTEGER DEFAULT 0,")
    additional_tables.append("    created_at TIMESTAMPTZ DEFAULT NOW()")
    additional_tables.append(");")
    additional_tables.append("")
    additional_tables.append("CREATE INDEX skills_dictionary_name_idx ON skills_dictionary(skill_name);")
    additional_tables.append("CREATE INDEX skills_dictionary_category_idx ON skills_dictionary(skill_category);")
    additional_tables.append("")
    
    additional_tables.append("-- Search analytics table")
    additional_tables.append("CREATE TABLE search_queries (")
    additional_tables.append("    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),")
    additional_tables.append("    query_text TEXT NOT NULL,")
    additional_tables.append("    query_embedding vector(1536),")
    additional_tables.append("    user_session VARCHAR(100),")
    additional_tables.append("    results_count INTEGER,")
    additional_tables.append("    execution_time_ms INTEGER,")
    additional_tables.append("    filters_applied JSONB,")
    additional_tables.append("    created_at TIMESTAMPTZ DEFAULT NOW()")
    additional_tables.append(");")
    additional_tables.append("")
    additional_tables.append("CREATE INDEX search_queries_created_at_idx ON search_queries(created_at);")
    additional_tables.append("CREATE INDEX search_queries_embedding_idx ON search_queries USING hnsw (query_embedding vector_cosine_ops);")
    
    # Combine all schema elements
    complete_schema = create_table + "\\n".join(indexes) + "\\n" + "\\n".join(additional_tables)
    
    # Generate data processing recommendations
    processing_recommendations = {
        'embedding_generation': {
            'primary_fields': text_analysis.get('embedding_strategy', {}).get('primary_text_fields', []),
            'content_template': 'Professional: {job_title} at {company_name} | Industry: {industry} | Location: {location} | Skills: {skills_text}',
            'max_content_length': 8000,  # OpenAI limit consideration
            'preprocessing_steps': [
                'Remove HTML tags and special characters',
                'Normalize whitespace',
                'Handle null values with defaults',
                'Truncate if exceeds max length'
            ]
        },
        'data_quality_rules': {
            'minimum_quality_score': 0.7,
            'required_fields': ['full_name', 'linkedin_username'],
            'embedding_generation_threshold': 0.5,
            'content_validation': [
                'full_name must not be empty',
                'linkedin_username must be unique',
                'at least one of job_title or industry must be present'
            ]
        },
        'batch_processing': {
            'recommended_batch_size': 5000,
            'parallel_embedding_batch_size': 100,
            'error_handling': 'skip_and_log',
            'retry_logic': 'exponential_backoff'
        }
    }
    
    self.insights['enhanced_database_schema'] = {
        'create_table_sql': complete_schema,
        'processing_recommendations': processing_recommendations,
        'schema_optimizations': {
            'vector_index_type': 'HNSW',
            'text_search_method': 'pg_trgm + GIN',
            'skills_storage': 'TEXT[] arrays',
            'geographic_hierarchy': 'normalized fields',
            'full_text_search': 'tsvector with english config'
        }
    }
    
    print(f"✅ Enhanced Database Schema Generated:")
    print(f"   - Optimized for semantic search")
    print(f"   - {len([i for i in indexes if 'CREATE INDEX' in i])} specialized indexes")
    print(f"   - Skills standardization support")
    print(f"   - Geographic hierarchy optimization")
    print(f"   - Full-text search integration")

def save_enhanced_insights(self):
    """Save enhanced analysis insights and schema"""
    print("\\n💾 Saving Enhanced Analysis Results...")
    
    # Save comprehensive JSON insights
    insights_file = os.path.join(OUTPUT_DIR, 'enhanced_linkedin_analysis.json')
    with open(insights_file, 'w') as f:
        json.dump(self.insights, f, indent=2, default=str)
    
    # Save production-ready schema
    schema_file = os.path.join(OUTPUT_DIR, 'semantic_talent_finder_schema.sql')
    with open(schema_file, 'w') as f:
        schema_sql = self.insights.get('enhanced_database_schema', {}).get('create_table_sql', '')
        f.write(schema_sql)
    
    # Save data processing guide
    processing_file = os.path.join(OUTPUT_DIR, 'data_processing_guide.json')
    processing_recommendations = self.insights.get('enhanced_database_schema', {}).get('processing_recommendations', {})
    with open(processing_file, 'w') as f:
        json.dump(processing_recommendations, f, indent=2)
    
    print(f"✅ Enhanced Results Saved:")
    print(f"   - Complete Analysis: enhanced_linkedin_analysis.json")
    print(f"   - Production Schema: semantic_talent_finder_schema.sql")
    print(f"   - Processing Guide: data_processing_guide.json")
    
    return insights_file, schema_file, processing_file

# Add methods to analyzer
LinkedInDataAnalyzer.generate_enhanced_database_schema = generate_enhanced_database_schema
LinkedInDataAnalyzer.save_enhanced_insights = save_enhanced_insights

# Generate enhanced schema and save results
analyzer.generate_enhanced_database_schema()
insights_file, schema_file, processing_file = analyzer.save_enhanced_insights()


🗄️ Generating Enhanced Database Schema for Semantic Talent Finder...
✅ Enhanced Database Schema Generated:
   - Optimized for semantic search
   - 18 specialized indexes
   - Skills standardization support
   - Geographic hierarchy optimization
   - Full-text search integration
\n💾 Saving Enhanced Analysis Results...
✅ Enhanced Results Saved:
   - Complete Analysis: enhanced_linkedin_analysis.json
   - Production Schema: semantic_talent_finder_schema.sql
   - Processing Guide: data_processing_guide.json


In [34]:
# FINAL ENHANCED ANALYSIS SUMMARY
print("\n" + "="*80)
print("🎉 ENHANCED LINKEDIN DATASET ANALYSIS COMPLETE")
print("="*80)

# Display comprehensive summary
schema_info = analyzer.insights['schema_analysis']
quality_info = analyzer.insights['data_quality']
skills_info = analyzer.insights.get('skills_analysis', {})
text_info = analyzer.insights.get('text_content_analysis', {})
geo_info = analyzer.insights.get('geographic_analysis', {})
experience_info = analyzer.insights.get('experience_analysis', {})
industry_info = analyzer.insights.get('industry_company_analysis', {})

print(f"\n📊 COMPREHENSIVE DATASET ANALYSIS:")
print(f"   📁 File Size: {schema_info.get('file_size_gb', 0)} GB")
print(f"   📋 Total Rows: {schema_info.get('total_rows', 0):,}")
print(f"   🗂️  Total Columns: {schema_info.get('total_columns', 0)}")
print(f"   🔍 Sample Analyzed: {quality_info.get('total_rows_analyzed', 0):,}")

print(f"\n🎯 DATA QUALITY INSIGHTS:")
completeness = quality_info.get('completeness_summary', {})
print(f"   🟢 High Quality Fields: {len(completeness.get('high_quality_fields', []))}")
print(f"   🟡 Medium Quality Fields: {len(completeness.get('medium_quality_fields', []))}")
print(f"   🔴 Low Quality Fields: {len(completeness.get('low_quality_fields', []))}")

print(f"\n🎯 SKILLS ANALYSIS:")
print(f"   📚 Skills Columns Found: {len(skills_info.get('skills_columns', []))}")
print(f"   🔧 Unique Skills: {skills_info.get('total_unique_skills', 0):,}")
print(f"   💻 Technical Skills: {len(skills_info.get('skills_categories', {}).get('technical_skills', []))}")
print(f"   🤝 Soft Skills: {len(skills_info.get('skills_categories', {}).get('soft_skills', []))}")

print(f"\n📝 TEXT CONTENT ANALYSIS:")
print(f"   📄 Text Fields for Embeddings: {len(text_info.get('text_fields', []))}")
print(f"   ⭐ High-Value Content Fields: {len(text_info.get('embedding_strategy', {}).get('primary_text_fields', []))}")
print(f"   📊 Content Quality Fields: {len(text_info.get('content_quality', {}))}")

print(f"\n🌍 GEOGRAPHIC ANALYSIS:")
print(f"   🗺️  Location Fields: {len(geo_info.get('location_fields', []))}")
print(f"   🏛️  Hierarchy Levels: {len(geo_info.get('location_hierarchy', {}))}")
print(f"   📍 Primary Location Field: {geo_info.get('geo_standardization', {}).get('primary_location_field', 'N/A')}")

print(f"\n💼 PROFESSIONAL EXPERIENCE:")
print(f"   💻 Experience Fields: {len(experience_info.get('experience_fields', []))}")
print(f"   🏢 Seniority Analysis: {len(experience_info.get('seniority_indicators', {}))}")

print(f"\n🏭 INDUSTRY & COMPANY ANALYSIS:")
print(f"   🏢 Industry Fields: {len(industry_info.get('industry_fields', []))}")
print(f"   🏛️  Company Fields: {len(industry_info.get('company_fields', []))}")
print(f"   📊 Top Industries: {len(industry_info.get('business_intelligence', {}).get('dominant_industries', {}))}")

print(f"\n🗄️ ENHANCED DATABASE SCHEMA:")
enhanced_schema = analyzer.insights.get('enhanced_database_schema', {})
print(f"   🏗️  Production-Ready Schema: ✅")
print(f"   🔍 Vector Similarity Search: ✅")
print(f"   📊 Full-Text Search Integration: ✅")
print(f"   🎯 Skills Standardization: ✅")
print(f"   🌍 Geographic Hierarchy: ✅")

print(f"\n📁 ENHANCED OUTPUT FILES:")
print(f"   📋 Complete Analysis: {insights_file}")
print(f"   🗃️  Production Schema: {schema_file}")
print(f"   📖 Processing Guide: {processing_file}")

print(f"\n🚀 SEMANTIC TALENT FINDER READINESS:")
print("   ✅ Database schema optimized for 50M+ profiles")
print("   ✅ Vector embeddings support with HNSW indexing")
print("   ✅ Multi-dimensional search capabilities")
print("   ✅ Skills-based semantic matching")
print("   ✅ Geographic and experience filtering")
print("   ✅ Full-text search integration")
print("   ✅ Data quality scoring and validation")

print(f"\n🎯 RECOMMENDED NEXT STEPS:")
print("   1. Review enhanced_linkedin_analysis.json for detailed insights")
print("   2. Use semantic_talent_finder_schema.sql for database setup")
print("   3. Follow data_processing_guide.json for data ingestion")
print("   4. Implement embedding generation for searchable_content field")
print("   5. Set up vector similarity search endpoints")
print("   6. Configure skills standardization pipeline")

print("\n" + "="*80)
print("🎉 ANALYSIS COMPLETE - READY FOR SEMANTIC TALENT FINDER IMPLEMENTATION!")
print("="*80)


🎉 ENHANCED LINKEDIN DATASET ANALYSIS COMPLETE

📊 COMPREHENSIVE DATASET ANALYSIS:
   📁 File Size: 15.15 GB
   📋 Total Rows: 51,352,619
   🗂️  Total Columns: 62
   🔍 Sample Analyzed: 200,000

🎯 DATA QUALITY INSIGHTS:
   🟢 High Quality Fields: 12
   🟡 Medium Quality Fields: 7
   🔴 Low Quality Fields: 43

🎯 SKILLS ANALYSIS:
   📚 Skills Columns Found: 1
   🔧 Unique Skills: 1,871
   💻 Technical Skills: 18
   🤝 Soft Skills: 20

📝 TEXT CONTENT ANALYSIS:
   📄 Text Fields for Embeddings: 25
   ⭐ High-Value Content Fields: 0
   📊 Content Quality Fields: 15

🌍 GEOGRAPHIC ANALYSIS:
   🗺️  Location Fields: 17
   🏛️  Hierarchy Levels: 6
   📍 Primary Location Field: Location Geo

💼 PROFESSIONAL EXPERIENCE:
   💻 Experience Fields: 4
   🏢 Seniority Analysis: 1

🏭 INDUSTRY & COMPANY ANALYSIS:
   🏢 Industry Fields: 3
   🏛️  Company Fields: 18
   📊 Top Industries: 15

🗄️ ENHANCED DATABASE SCHEMA:
   🏗️  Production-Ready Schema: ✅
   🔍 Vector Similarity Search: ✅
   📊 Full-Text Search Integration: ✅
   🎯 S