In [2]:
from web3 import Web3
import pandas as pd
import numpy as np
import time
import os

# Step 1: Retrieve your Milestone 1 output to start the retrieval process
print("📥 Starting Week 6: Data Retrieval and Processing")
print("🔗 Connecting to blockchain...")

# Connect to the local Ganache JSON-RPC endpoint
ganache_url = "http://127.0.0.1:7545"
web3 = Web3(Web3.HTTPProvider(ganache_url))

if not web3.is_connected():
    print("❌ Connection failed. Make sure Ganache is running.")
    # `exit()` is an interactive-shell helper added by the `site` module and is
    # not meant for programs; raising SystemExit stops the run reliably and the
    # non-zero status marks it as a failure.
    raise SystemExit(1)

print("✅ Connected to Ganache successfully!")

# Contract address (from your Week 5 deployment), normalised to checksum form
contract_address = web3.to_checksum_address("0x2df65fed5311cc0349b4967d5fefd91e39a8866e")
def _abi_param(sol_type, name=""):
    """Build one ABI input/output entry; every entry here has internalType == type."""
    return {"internalType": sol_type, "name": name, "type": sol_type}


def _abi_function(name, inputs, outputs, mutability):
    """Build the ABI entry for one contract function."""
    return {
        "inputs": inputs,
        "name": name,
        "outputs": outputs,
        "stateMutability": mutability,
        "type": "function",
    }


# Contract ABI (same as Week 5), listed in the same order as the deployment.
contract_abi = [
    _abi_function("authorizeDevice", [_abi_param("address", "device")], [], "nonpayable"),
    _abi_function("revokeDevice", [_abi_param("address", "device")], [], "nonpayable"),
    _abi_function(
        "storeData",
        [
            _abi_param("string", "deviceID"),
            _abi_param("string", "dataType"),
            _abi_param("string", "value"),
        ],
        [],
        "nonpayable",
    ),
    # The constructor entry has neither a name nor outputs.
    {"inputs": [], "stateMutability": "nonpayable", "type": "constructor"},
    _abi_function(
        "authorizedDevices",
        [_abi_param("address")],
        [_abi_param("bool")],
        "view",
    ),
    _abi_function(
        "getDataByIndex",
        [_abi_param("string", "deviceID"), _abi_param("uint256", "index")],
        [_abi_param("string"), _abi_param("string"), _abi_param("uint256")],
        "view",
    ),
    _abi_function(
        "getDataCount",
        [_abi_param("string", "deviceID")],
        [_abi_param("uint256")],
        "view",
    ),
    _abi_function("owner", [], [_abi_param("address")], "view"),
]

# Bind the ABI to the deployed address, then send every transaction from the
# first account the node exposes.
contract = web3.eth.contract(abi=contract_abi, address=contract_address)
first_account = web3.eth.accounts[0]
web3.eth.default_account = first_account
print("✅ Connected to Smart Contract")

# Step 1.5: Find and load CSV file
print("\n📁 Looking for logistics data file...")

# File this script writes at the end of the run; it must never be used as input.
output_filename = "cleaned_iot_data.csv"

# Preferred input file names, tried in this order
possible_files = [
    "LogisticsData.csv",
    "Logistics-Data.csv",
    "logistics_data.csv",
    "logistics-data.csv",
    "IoT_Data.csv",
    "iot_data.csv"
]


def _try_load_csv(filename):
    """Return the DataFrame read from *filename*, or None if it cannot be read.

    A failure is reported on stdout rather than raised so the caller can move
    on to the next candidate file.
    """
    try:
        return pd.read_csv(filename)
    except Exception as e:
        print(f"❌ Error loading {filename}: {e}")
        return None


def _describe_loaded(frame):
    """Print the shape and column names of a freshly loaded DataFrame."""
    print(f"📊 Data shape: {frame.shape}")
    print(f"📋 Columns: {list(frame.columns)}")


df_original = None
csv_filename = None

# Check current directory for CSV files (extension matched case-insensitively)
current_files = [f for f in os.listdir('.') if f.lower().endswith('.csv')]
print(f"📋 CSV files found in current directory: {current_files}")

# Try the well-known logistics file names first
for filename in possible_files:
    if not os.path.exists(filename):
        continue
    loaded = _try_load_csv(filename)
    if loaded is not None:
        df_original, csv_filename = loaded, filename
        print(f"✅ Found and loaded: {filename}")
        _describe_loaded(df_original)
        break

# Otherwise fall back to any other CSV in the directory. The script's own
# output file and the names already tried above are excluded, and the list is
# sorted because os.listdir() returns entries in arbitrary order.
if df_original is None:
    fallback_candidates = sorted(
        f for f in current_files
        if f != output_filename and f not in possible_files
    )
    for filename in fallback_candidates:
        loaded = _try_load_csv(filename)
        if loaded is not None:
            df_original, csv_filename = loaded, filename
            print(f"✅ Using first available CSV: {csv_filename}")
            _describe_loaded(df_original)
            break

if df_original is None:
    print("❌ No logistics data file found. Please ensure you have LogisticsData.csv in your directory.")
    # SystemExit instead of the interactive-only exit() helper; non-zero = failure
    raise SystemExit(1)

print(f"📄 Using data file: {csv_filename}")
print("📋 First few rows:")
print(df_original.head())

# Verify required columns exist: the package key, the timestamp and the three
# sensor readings that the later steps read from each CSV row.
required_columns = ['package_id', 'timestamp', 'temperature', 'humidity', 'shock']
missing_columns = [col for col in required_columns if col not in df_original.columns]

if missing_columns:
    print(f"❌ Missing required columns: {missing_columns}")
    print("📋 Available columns:", list(df_original.columns))
    # `exit()` is the interactive-shell helper from `site`; raise SystemExit
    # with a non-zero status so the run stops and is reported as failed.
    raise SystemExit(1)

print("✅ All required columns found!")

# Step 2: Check blockchain data and determine what to store
print("\n📊 Analyzing data requirements...")

total_csv_records = len(df_original)
device_ids = df_original["package_id"].unique()
# Every CSV row is written as three on-chain records (temperature, humidity, shock)
expected_blockchain_records = 3 * total_csv_records

print(f"📦 Total packages in CSV: {len(device_ids)}")
print(f"📊 Total CSV records: {total_csv_records}")
print(f"🎯 Expected blockchain records: {expected_blockchain_records} (3 sensors × {total_csv_records} packages)")

# Add up the per-package record counts currently held by the contract
total_records = 0
for package in device_ids:
    try:
        total_records += contract.functions.getDataCount(str(package)).call()
    except Exception as err:
        print(f"⚠️ Error checking records for {package}: {err}")

print(f"📈 Current blockchain records: {total_records}")

# Storage is needed when the chain is empty or holds fewer records than expected
store_data = total_records == 0 or total_records < expected_blockchain_records

if not store_data:
    print(f"\n✅ Complete data already on blockchain! ({total_records} records)")
elif total_records == 0:
    print(f"\n⚠️ No data on blockchain! Need to store ALL {expected_blockchain_records} records...")
else:
    print(f"\n⚠️ Incomplete data on blockchain! Have {total_records}, need {expected_blockchain_records}")

# Store the CSV readings on chain if needed
def _existing_readings(device_id):
    """Return {(data_type, data_value): count} for records already on chain.

    Used to skip readings that an earlier (possibly interrupted) run already
    stored, so that re-running the script does not duplicate them.
    Relies on getDataByIndex returning (data type, value, ...) in that order,
    which is also how the retrieval step of this script reads it.
    If the chain cannot be read, whatever was collected so far is returned and
    the remaining readings are simply stored again.
    """
    readings = {}
    try:
        count = contract.functions.getDataCount(device_id).call()
        for i in range(count):
            record = contract.functions.getDataByIndex(device_id, i).call()
            key = (record[0], record[1])
            readings[key] = readings.get(key, 0) + 1
    except Exception as e:
        print(f"⚠️ Could not read existing records for {device_id}: {e}")
    return readings


if store_data:
    print(f"\n📤 Storing ALL IoT data to blockchain...")
    print(f"🚀 Up to {expected_blockchain_records} records will be written (readings already on chain are skipped)...")

    # Authorize the current account
    try:
        print("🔑 Authorizing device...")
        auth_txn = contract.functions.authorizeDevice(web3.eth.default_account).transact({
            'from': web3.eth.default_account,
            'gas': 3000000
        })
        web3.eth.wait_for_transaction_receipt(auth_txn)
        print("✅ Device authorized successfully!")
    except Exception as e:
        print(f"⚠️ Authorization issue (may already exist): {e}")

    stored_count = 0
    failed_count = 0
    skipped_count = 0
    # device_id -> readings found on chain that have not yet been matched to a CSV row
    existing_by_device = {}
    total_rows = len(df_original)

    print(f"📦 Processing ALL {total_rows} packages...")

    # enumerate() gives the row position, so the progress output does not
    # depend on the DataFrame's index labels.
    for position, (_, row) in enumerate(df_original.iterrows()):
        device_id = str(row["package_id"])
        if device_id not in existing_by_device:
            existing_by_device[device_id] = _existing_readings(device_id)
        unmatched = existing_by_device[device_id]

        # Store temperature, humidity, and shock data
        sensors = [
            ("temperature", str(row["temperature"])),
            ("humidity", str(row["humidity"])),
            ("shock", str(row["shock"]))
        ]

        # Show progress every 10 packages
        if position % 10 == 0:
            print(f"📊 Progress: {position + 1}/{total_rows} packages processed...")

        for sensor_type, sensor_value in sensors:
            reading = (sensor_type, sensor_value)
            if unmatched.get(reading, 0) > 0:
                # Already on chain from a previous run: consume the match and skip
                unmatched[reading] -= 1
                skipped_count += 1
                continue

            try:
                txn = contract.functions.storeData(device_id, sensor_type, sensor_value).transact({
                    'from': web3.eth.default_account,
                    'gas': 3000000
                })
                web3.eth.wait_for_transaction_receipt(txn)
            except Exception as e:
                failed_count += 1
                print(f"   ❌ Failed to store {sensor_type} for {device_id}: {e}")
                continue

            stored_count += 1

            # Show detailed progress for first few records
            if position < 3:
                print(f"   🛰️ Stored [{device_id}] - {sensor_type}: {sensor_value}")

            # Small delay to prevent overwhelming Ganache
            time.sleep(0.1)

    print(f"\n✅ Data storage completed!")
    print(f"📊 Successfully stored: {stored_count} records")
    print(f"⏭️ Skipped (already on blockchain): {skipped_count} records")
    print(f"❌ Failed to store: {failed_count} records")
    attempted_count = stored_count + failed_count
    if attempted_count:
        print(f"🎯 Success rate: {(stored_count/attempted_count*100):.1f}%")
    else:
        # Nothing was attempted (empty CSV or everything skipped): avoid dividing by zero
        print("🎯 Success rate: n/a (no records needed storing)")

    # Verify final count
    final_count = 0
    for device_id in device_ids:
        try:
            final_count += contract.functions.getDataCount(str(device_id)).call()
        except Exception as e:
            # Report instead of silently dropping the device from the total
            print(f"⚠️ Error checking records for {device_id}: {e}")
    print(f"📈 Final blockchain record count: {final_count}")

# Step 3: Retrieve ALL data from blockchain
print(f"\n📥 Retrieving ALL IoT records from blockchain...")

# Re-count the on-chain records per package (the storage step may have added some)
total_records = 0
for device_id in device_ids:
    try:
        total_records += contract.functions.getDataCount(str(device_id)).call()
    except Exception as e:
        # A bare `except: pass` here would also swallow KeyboardInterrupt and
        # hide why a package was left out of the total; report it instead.
        print(f"⚠️ Error checking records for {device_id}: {e}")

if total_records == 0:
    print("❌ No data on blockchain to retrieve!")
    # SystemExit instead of the interactive-only exit() helper; non-zero = failure
    raise SystemExit(1)

print(f"📊 Found {total_records} total records to retrieve...")

# Retrieve ALL records
data = []
retrieved_count = 0

# CSV columns copied onto every record when present, "Unknown" otherwise
optional_fields = ("location", "closest_city", "status", "origin", "destination")

for device_id in device_ids:
    try:
        device_count = contract.functions.getDataCount(str(device_id)).call()
        if device_count <= 0:
            continue

        # The first CSV row of this package supplies the timestamp and metadata
        # for all of its records.
        # NOTE(review): the third value returned by getDataByIndex is not used.
        csv_row = df_original[df_original["package_id"] == device_id].iloc[0]

        for i in range(device_count):
            try:
                blockchain_record = contract.functions.getDataByIndex(str(device_id), i).call()

                # Key order fixes the column order of the resulting DataFrame
                entry = {
                    "timestamp": csv_row["timestamp"],
                    "device_id": device_id,
                    "data_type": blockchain_record[0],
                    "data_value": blockchain_record[1],
                }
                for field in optional_fields:
                    entry[field] = csv_row.get(field, "Unknown")

                data.append(entry)
                retrieved_count += 1

                # Progress line every 50 records
                if retrieved_count % 50 == 0:
                    print(f"📥 Retrieved {retrieved_count}/{total_records} records...")

            except Exception as e:
                print(f"❌ Error retrieving record {i} for {device_id}: {e}")
    except Exception as e:
        print(f"❌ Error processing device {device_id}: {e}")

print(f"✅ Successfully retrieved {len(data)} records from blockchain!")

# One DataFrame row per retrieved record
df = pd.DataFrame(data)

# Step 4: Data preprocessing and cleaning
print(f"\n🔧 Processing {len(df)} records for analysis...")

# With no retrieved records the DataFrame has no columns at all, so the column
# accesses below would fail with a KeyError; stop with a clear message instead.
if df.empty:
    print("❌ No records were retrieved from the blockchain; nothing to process.")
    raise SystemExit(1)

# Convert timestamp to datetime
df["timestamp"] = pd.to_datetime(df["timestamp"])

# Extract the first number in each stored value. The optional sign keeps
# negative readings negative (e.g. sub-zero temperatures), and the optional
# exponent covers values Python renders like "1e-05".
numeric_pattern = r'([-+]?\d+\.?\d*(?:[eE][-+]?\d+)?)'
df["numeric_value"] = (
    df["data_value"].str.extract(numeric_pattern, expand=False).astype(float)
)

# Handle missing values
missing_count = df["numeric_value"].isna().sum()
if missing_count > 0:
    print(f"⚠️ Found {missing_count} missing values, filling with 0")
    # Assign the result back: an in-place fillna on a column selection is a
    # chained assignment and does not update `df` under pandas copy-on-write.
    df["numeric_value"] = df["numeric_value"].fillna(0)

# Fill other missing values. The datetime and numeric columns are left out so
# the "Unknown" placeholder is never forced into a non-text column.
text_columns = [col for col in df.columns if col not in ("timestamp", "numeric_value")]
df[text_columns] = df[text_columns].fillna("Unknown")

# Step 5: Data quality summary
print(f"\n📋 COMPLETE Data Processing Summary:")
record_total = len(df)
print(f"   📦 Total records processed: {record_total}")
package_total = df['device_id'].nunique()
print(f"   📱 Unique packages: {package_total}")
sensor_names = df['data_type'].unique()
print(f"   📈 Sensor types: {', '.join(sensor_names)}")
city_total = df['closest_city'].nunique()
print(f"   🌍 Cities covered: {city_total}")
timestamps = df['timestamp']
print(f"   📅 Date range: {timestamps.min()} to {timestamps.max()}")

# Per-sensor statistics; sort=False keeps the sensors in order of first appearance
print(f"\n📊 Complete Sensor Data Analysis:")
for sensor_name, readings in df.groupby('data_type', sort=False)['numeric_value']:
    print(f"   {sensor_name}: {len(readings)} readings")
    lowest, highest = readings.min(), readings.max()
    print(f"     Range: {lowest:.2f} - {highest:.2f}")
    average, spread = readings.mean(), readings.std()
    print(f"     Average: {average:.2f} ± {spread:.2f}")

# Step 6: Save the complete cleaned dataset
print(f"\n💾 Saving complete cleaned dataset...")

try:
    # index=False: write only the data columns, not the DataFrame's row labels
    df.to_csv("cleaned_iot_data.csv", index=False)
    print("✅ Complete cleaned IoT data saved successfully as cleaned_iot_data.csv")
    saved_rows = len(df)
    saved_packages = df['device_id'].nunique()
    print(f"📊 File contains {saved_rows} records from {saved_packages} packages")
except Exception as save_error:
    print(f"❌ Error saving CSV: {save_error}")

# Step 7: Final verification
print(f"\n📈 Data ready for line chart visualization:")
print(f"   ✅ Total records: {len(df)}")
print(f"   ✅ Unique packages: {df['device_id'].nunique()}")
print(f"   ✅ Records per sensor type:")

# Count the rows of each sensor type, in order of first appearance
sensor_column = df['data_type']
for sensor in sensor_column.unique():
    count = int((sensor_column == sensor).sum())
    print(f"     {sensor}: {count} records")

print(f"\n🎉 SUCCESS! Complete dataset with {len(df)} records ready for Week 7 visualization!")
print(f"📁 Output file: cleaned_iot_data.csv")
print(f"🎯 This represents ALL your IoT data from blockchain storage!")

📥 Starting Week 6: Data Retrieval and Processing
🔗 Connecting to blockchain...
✅ Connected to Ganache successfully!
✅ Connected to Smart Contract

📁 Looking for logistics data file...
📋 CSV files found in current directory: ['cleaned_iot_data.csv', 'Logistics-Data.csv', 'W6-Cleaned_Iot_Data.csv']
✅ Found and loaded: Logistics-Data.csv
📊 Data shape: (100, 10)
📋 Columns: ['timestamp', 'package_id', 'origin', 'destination', 'location', 'closest_city', 'status', 'temperature', 'humidity', 'shock']
📄 Using data file: Logistics-Data.csv
📋 First few rows:
                    timestamp package_id     origin destination  \
0  2025-04-29 12:08:30.336512   PKG85046  São Paulo      Moscow   
1  2025-04-29 13:17:30.336512   PKG59811      Tokyo       Seoul   
2  2025-04-29 14:36:30.336512   PKG80313  Cape Town       Seoul   
3  2025-04-29 15:23:30.336512   PKG90642      Tokyo   São Paulo   
4  2025-04-29 16:06:30.336512   PKG86797     Moscow      Berlin   

            location closest_city      sta