In [1]:
"""
LinkedIn Parquet Dataset Analysis Script
Analyzes a 15.2GB parquet file (~51M rows) for the Semantic Talent Finder project

This script processes large parquet files in chunks to:
1. Extract schema and data type information
2. Analyze data quality and completeness
3. Generate insights for Java model optimization
4. Provide database schema recommendations
5. Configure processing pipeline parameters
"""

import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import json
import os
from collections import Counter, defaultdict
from datetime import datetime
import gc
import psutil

# Configuration
# The defaults below are the original author's local paths. To run the notebook on
# another machine, set LINKEDIN_PARQUET_FILE / LINKEDIN_ANALYSIS_OUTPUT_DIR in the
# environment instead of editing this cell.
PARQUET_FILE = os.environ.get(
    "LINKEDIN_PARQUET_FILE",
    "/Users/chromatrical/CAREER/Local Linkedin DB/DataBase/USA_filtered.parquet",
)
CHUNK_SIZE = 50000  # Process 50k rows at a time to manage memory
OUTPUT_DIR = os.environ.get(
    "LINKEDIN_ANALYSIS_OUTPUT_DIR",
    "/Users/chromatrical/CAREER/Side Projects/semantic-talent-finder/data/analysis_output",
)

print("🚀 LinkedIn Parquet Dataset Analysis - Starting Setup...")
print(f"📁 Target File: {PARQUET_FILE}")
print(f"⭐ Chunk Size: {CHUNK_SIZE:,} rows")
print(f"💾 Output Directory: {OUTPUT_DIR}")

# Create output directory (no error if it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"✅ Setup Complete - Output directory created at {OUTPUT_DIR}")

🚀 LinkedIn Parquet Dataset Analysis - Starting Setup...
📁 Target File: /Users/chromatrical/CAREER/Local Linkedin DB/DataBase/USA_filtered.parquet
⭐ Chunk Size: 50,000 rows
💾 Output Directory: /Users/chromatrical/CAREER/Side Projects/semantic-talent-finder/data/analysis_output
✅ Setup Complete - Output directory created at /Users/chromatrical/CAREER/Side Projects/semantic-talent-finder/data/analysis_output


In [2]:
class LinkedInDataAnalyzer:
    """Collects schema and data-quality insights about a parquet file.

    Each analysis method stores its results under its own key of
    ``self.insights`` and prints a short progress/summary report.
    """

    def __init__(self, parquet_file_path, chunk_size=50000, output_dir=None):
        """Store the analysis settings and make sure the output directory exists.

        Args:
            parquet_file_path: Path of the parquet file to analyze.
            chunk_size: Number of rows per batch for chunked processing.
            output_dir: Directory to create for analysis output. When omitted,
                the module-level ``OUTPUT_DIR`` is used (the original behavior).
        """
        self.parquet_file = parquet_file_path
        self.chunk_size = chunk_size
        # Resolve the global only as a fallback so the class can be used
        # without the notebook's configuration cell having defined it.
        self.output_dir = OUTPUT_DIR if output_dir is None else output_dir
        self.insights = {
            'schema_analysis': {},
            'data_quality': {},
            'content_analysis': {},
            'business_logic': {},
            'processing_recommendations': {},
            'database_schema': {}
        }

        # Ensure output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

    def get_memory_usage(self):
        """Return the resident set size (RSS) of the current process in MB."""
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1024 / 1024  # bytes -> MB

    def analyze_parquet_schema(self):
        """Record the file's schema and size under ``insights['schema_analysis']``.

        Stores the column count, row count, file size in GB, per-column
        type/nullability/position and the ordered column names, then prints a
        summary. Any exception is caught and reported with a printed message,
        in which case ``insights['schema_analysis']`` is left as it was.
        """
        print("🔍 Analyzing Parquet Schema...")

        try:
            # Read parquet metadata without loading data
            parquet_file = pq.ParquetFile(self.parquet_file)
            schema = parquet_file.schema_arrow
            metadata = parquet_file.metadata

            # Extract schema information
            schema_info = {}
            for i, field in enumerate(schema):
                schema_info[field.name] = {
                    'type': str(field.type),
                    'nullable': field.nullable,
                    'index': i
                }

            self.insights['schema_analysis'] = {
                'total_columns': len(schema),
                'total_rows': metadata.num_rows,
                'file_size_gb': round(os.path.getsize(self.parquet_file) / (1024**3), 2),
                'columns': schema_info,
                'column_names': [field.name for field in schema]
            }

            # Show at most 10 column names; add the ellipsis only when some were left out
            preview = list(schema_info.keys())[:10]
            suffix = "..." if len(schema_info) > len(preview) else ""

            print("✅ Schema Analysis Complete:")
            print(f"   - Total Rows: {metadata.num_rows:,}")
            print(f"   - Total Columns: {len(schema)}")
            print(f"   - File Size: {self.insights['schema_analysis']['file_size_gb']} GB")
            print(f"   - Columns: {', '.join(preview)}{suffix}")

        except Exception as e:
            # Best-effort: report the failure and let the notebook keep running
            print(f"❌ Schema analysis failed: {e}")

# Build the analyzer for the configured dataset and batch size
analyzer = LinkedInDataAnalyzer(parquet_file_path=PARQUET_FILE, chunk_size=CHUNK_SIZE)
print("📊 LinkedInDataAnalyzer initialized successfully")

📊 LinkedInDataAnalyzer initialized successfully


In [3]:
# Populate analyzer.insights['schema_analysis']
analyzer.analyze_parquet_schema()

# Headline figures; the .get() defaults cover the case where the analysis
# step failed and left the dict empty
schema_info = analyzer.insights['schema_analysis']
print("\n📋 Schema Summary:")
print(
    f"Total Rows: {schema_info.get('total_rows', 0):,}\n"
    f"Total Columns: {schema_info.get('total_columns', 0)}\n"
    f"File Size: {schema_info.get('file_size_gb', 0)} GB"
)

# Per-column listing, capped at the first 15 columns
print("\n🔍 Column Overview:")
columns = schema_info.get('columns', {})
for position, (col_name, col_info) in enumerate(columns.items(), start=1):
    if position > 15:
        break
    nullable = "not null" if not col_info.get('nullable', True) else "nullable"
    col_type = col_info.get('type', 'unknown')
    print(f"  {position:2d}. {col_name:<30} | {col_type:<15} | {nullable}")

if len(columns) > 15:
    print(f"  ... and {len(columns) - 15} more columns")

🔍 Analyzing Parquet Schema...
✅ Schema Analysis Complete:
   - Total Rows: 51,352,619
   - Total Columns: 62
   - File Size: 15.15 GB
   - Columns: Full name, Industry, Job title, Sub Role, Industry 2, Emails, Mobile, Phone numbers, Company Name, Company Industry...

📋 Schema Summary:
Total Rows: 51,352,619
Total Columns: 62
File Size: 15.15 GB

🔍 Column Overview:
   1. Full name                      | string          | nullable
   2. Industry                       | string          | nullable
   3. Job title                      | string          | nullable
   4. Sub Role                       | string          | nullable
   5. Industry 2                     | string          | nullable
   6. Emails                         | string          | nullable
   7. Mobile                         | string          | nullable
   8. Phone numbers                  | string          | nullable
   9. Company Name                   | string          | nullable
  10. Company Industry               | 

In [4]:
def analyze_data_quality_chunked(self, max_chunks=4):
    """Estimate per-column completeness by streaming the parquet file in batches.

    Reads ``self.parquet_file`` in batches of ``self.chunk_size`` rows, counts
    null values per column, and stores the result in
    ``self.insights['data_quality']``: rows and chunks processed, null counts,
    null percentages, the dtype seen for each column in the first batch that
    contains it, and a high/medium/low completeness bucketing (<5%, 5-25%,
    >=25% nulls).

    Args:
        max_chunks: Stop after this many batches. The default of 4 keeps the
            original demo behavior (200k rows at the default chunk size).
            Pass ``None`` to scan the whole file. Expected to be a positive
            integer or ``None``.

    Any exception is caught and reported with a printed message, in which case
    ``self.insights['data_quality']`` is left as it was.
    """
    print("\n🔍 Analyzing Data Quality in Chunks...")

    # Initialize aggregators
    null_counts = defaultdict(int)
    total_counts = defaultdict(int)
    data_types = {}

    chunk_count = 0
    total_rows_processed = 0

    try:
        # Process file in chunks
        parquet_file = pq.ParquetFile(self.parquet_file)

        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
            chunk_df = batch.to_pandas()
            chunk_count += 1
            chunk_rows = len(chunk_df)
            total_rows_processed += chunk_rows

            # Analyze each column
            for column in chunk_df.columns:
                # int(): .sum() yields numpy.int64, which json cannot serialize
                null_counts[column] += int(chunk_df[column].isnull().sum())
                total_counts[column] += chunk_rows

                # Store data type (first batch in which the column appears)
                if column not in data_types:
                    data_types[column] = str(chunk_df[column].dtype)

            # Memory management
            del chunk_df
            gc.collect()

            if chunk_count % 50 == 0:
                print(f"   Processed {chunk_count} chunks ({total_rows_processed:,} rows)")
                print(f"   Memory usage: {self.get_memory_usage():.2f} MB")

            # Optional cap on the number of batches analyzed
            if max_chunks is not None and chunk_count >= max_chunks:
                break

        # Calculate null percentages (0.0 when no rows were seen for a column)
        null_percentages = {}
        for column in null_counts:
            seen = total_counts[column]
            null_percentages[column] = round((null_counts[column] / seen) * 100, 2) if seen else 0.0

        self.insights['data_quality'] = {
            'total_rows_analyzed': total_rows_processed,
            'chunks_processed': chunk_count,
            'null_counts': dict(null_counts),
            'null_percentages': null_percentages,
            'data_types': data_types,
            'completeness_summary': {
                'high_quality_fields': [col for col, pct in null_percentages.items() if pct < 5],
                'medium_quality_fields': [col for col, pct in null_percentages.items() if 5 <= pct < 25],
                'low_quality_fields': [col for col, pct in null_percentages.items() if pct >= 25]
            }
        }

        print("✅ Data Quality Analysis Complete:")
        print(f"   - Rows Analyzed: {total_rows_processed:,}")
        print(f"   - High Quality Fields: {len(self.insights['data_quality']['completeness_summary']['high_quality_fields'])}")
        print(f"   - Low Quality Fields: {len(self.insights['data_quality']['completeness_summary']['low_quality_fields'])}")

    except Exception as e:
        # Best-effort: report the failure and let the notebook keep running
        print(f"❌ Data quality analysis failed: {e}")

# Attach the function to the class so existing instances gain it as a method
setattr(LinkedInDataAnalyzer, "analyze_data_quality_chunked", analyze_data_quality_chunked)

# Execute the data quality pass on the analyzer created earlier
analyzer.analyze_data_quality_chunked()


🔍 Analyzing Data Quality in Chunks...
✅ Data Quality Analysis Complete:
   - Rows Analyzed: 200,000
   - High Quality Fields: 12
   - Low Quality Fields: 43
