IMPORT CSV FILES FROM HDFS


In [1]:
import pandas as pd
from hdfs import InsecureClient
import glob

# HDFS client for the local endpoint, issuing requests as user "root"
hdfs_client = InsecureClient("http://localhost:9870", user="root")

print("=== Importing Data from HDFS ===\n")

# One HDFS folder per dataset, all located under /bigdata/Datapack
delivery_path, pickup_path, roadmap_path = (
    "/bigdata/Datapack/Delivery",
    "/bigdata/Datapack/PickUp",
    "/bigdata/Datapack/Roadmap",
)

def _read_hdfs_csv(file_path, **read_csv_kwargs):
    """Open one HDFS file and parse it with ``pd.read_csv``."""
    with hdfs_client.read(file_path) as reader:
        return pd.read_csv(reader, **read_csv_kwargs)


def _read_with_fallback_delimiters(file_path, delimiters=('\t', ';', '|')):
    """Retry a file with each alternative delimiter; return the first usable parse or None."""
    for alt_delim in delimiters:
        try:
            df = _read_hdfs_csv(file_path, delimiter=alt_delim)
        except Exception as alt_err:
            # Report the failure instead of hiding it, then try the next candidate
            print(f"  ✗ Delimiter {alt_delim!r} failed: {alt_err}")
            continue
        if len(df.columns) < 2:
            # A wrong separator normally parses without error but leaves each
            # row in a single column, so such a result is not accepted
            print(f"  ✗ Delimiter {alt_delim!r} produced a single column")
            continue
        print(f"  ✓ ({len(df)} rows with delimiter '{alt_delim}')")
        return df
    print(f"  ✗ Skipped {file_path}: no alternative delimiter worked")
    return None


def load_csv_files_from_hdfs(hdfs_path, folder_name, delimiter=',', sep_inference=False):
    """Load every ``.csv`` file of an HDFS folder into a single DataFrame.

    Progress and errors are printed; failures are reported through the
    return value rather than raised.

    Args:
        hdfs_path: HDFS directory whose entries are listed.
        folder_name: Label used in progress messages. The value "Roadmap"
            additionally enables a retry with the delimiters tab, ';' and '|'
            when the first parse of a file fails.
        delimiter: Column separator handed to ``pd.read_csv`` when
            ``sep_inference`` is false.
        sep_inference: When true, read with ``sep=None`` and the Python
            engine instead of using ``delimiter``.

    Returns:
        The concatenation (fresh integer index) of all files that were read,
        or ``None`` when the folder cannot be accessed, contains no ``.csv``
        entries, or none of its files could be parsed.
    """
    print(f"Loading {folder_name}...")
    dfs = []

    try:
        # Keep only the CSV entries of the folder listing
        csv_files = [f for f in hdfs_client.list(hdfs_path) if f.endswith('.csv')]

        if not csv_files:
            print(f"  ✗ No CSV files found in {hdfs_path}")
            return None

        print(f"  Found {len(csv_files)} CSV file(s): {csv_files}")

        if sep_inference:
            read_kwargs = {'sep': None, 'engine': 'python'}
        else:
            read_kwargs = {'delimiter': delimiter}

        for csv_file in csv_files:
            file_path = f"{hdfs_path}/{csv_file}"
            print(f"    Reading {csv_file}...", end=" ")
            try:
                df = _read_hdfs_csv(file_path, **read_kwargs)
            except Exception as e:
                print(f"✗ Error: {e}")
                # Only Roadmap files are retried with other delimiters;
                # a failing file in any other folder is skipped
                if folder_name == "Roadmap":
                    df = _read_with_fallback_delimiters(file_path)
                    if df is not None:
                        dfs.append(df)
            else:
                dfs.append(df)
                print(f"✓ ({len(df)} rows)")

        if not dfs:
            return None

        combined_df = pd.concat(dfs, ignore_index=True)
        print(f"  ✓ Combined: {len(combined_df)} total rows, {len(combined_df.columns)} columns")
        return combined_df

    except Exception as e:
        # Best-effort boundary: listing or combining failed, report and give up
        print(f"  ✗ Error accessing {hdfs_path}: {e}")
        return None

# Pull each dataset from its HDFS folder; a failed load yields None
print("1. Delivery Data:")
df_delivery = load_csv_files_from_hdfs(delivery_path, "Delivery")

print("\n2. PickUp Data:")
df_pickup = load_csv_files_from_hdfs(pickup_path, "PickUp")

print("\n3. Roadmap Data:")
df_roadmap = load_csv_files_from_hdfs(roadmap_path, "Roadmap")

print("\n✓ Data Import Complete!")
print("\n=== Data Summary ===")
# One summary line (shape plus the first three column names) per loaded dataset
for label, frame in (
    ("Delivery", df_delivery),
    ("PickUp", df_pickup),
    ("Roadmap", df_roadmap),
):
    if frame is None:
        continue
    print(f"{label} - Shape: {frame.shape}, Columns: {list(frame.columns[:3])}...")

=== Importing Data from HDFS ===

1. Delivery Data:
Loading Delivery...
  Found 5 CSV file(s): ['delivery_cq.csv', 'delivery_hz.csv', 'delivery_jl.csv', 'delivery_sh.csv', 'delivery_yt.csv']
    Reading delivery_cq.csv... ✓ (931351 rows)
    Reading delivery_hz.csv... ✓ (1861600 rows)
    Reading delivery_jl.csv... ✓ (31415 rows)
    Reading delivery_sh.csv... ✓ (1483864 rows)
    Reading delivery_yt.csv... ✓ (206431 rows)
  ✓ Combined: 4514661 total rows, 17 columns

2. PickUp Data:
Loading PickUp...
  Found 5 CSV file(s): ['pickup_cq.csv', 'pickup_hz.csv', 'pickup_jl.csv', 'pickup_sh.csv', 'pickup_yt.csv']
    Reading pickup_cq.csv... ✓ (1172703 rows)
    Reading pickup_hz.csv... ✓ (2130456 rows)
    Reading pickup_jl.csv... ✓ (261801 rows)
    Reading pickup_sh.csv... ✓ (1424406 rows)
    Reading pickup_yt.csv... ✓ (1146781 rows)
  ✓ Combined: 6136147 total rows, 19 columns

3. Roadmap Data:
Loading Roadmap...
  Found 1 CSV file(s): ['roads.csv']
    Reading roads.csv... ✗ Error: Er

QUICK DATA PREVIEW

In [2]:
import pandas as pd

print("=== Quick Preview: First 5 Rows All Columns (from HDFS) ===\n")

# Display options: no column limit, auto-detected width, cell text capped at 30 characters
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 30)

# Preview each dataset the same way: shape, full column list, first 5 rows
preview_targets = (
    ("DELIVERY", df_delivery),
    ("PICKUP", df_pickup),
    ("ROADMAP", df_roadmap),
)
for position, (title, frame) in enumerate(preview_targets, start=1):
    if position > 1:
        # Separator line between consecutive dataset sections
        print("\n" + "="*100)
    print(f"{position}. {title} DATA - First 5 rows, ALL columns:")
    if frame is None:
        # The import step leaves a dataset as None when it could not be loaded;
        # report that here instead of failing on frame.shape
        print("   ✗ Not loaded - check the output of the import step")
        continue
    print(f"   Total Shape: {frame.shape}")
    print(f"   Columns: {list(frame.columns)}\n")
    print(frame.head(5).to_string())

=== Quick Preview: First 5 Rows All Columns (from HDFS) ===

1. DELIVERY DATA - First 5 rows, ALL columns:
   Total Shape: (4514661, 17)
   Columns: ['order_id', 'region_id', 'city', 'courier_id', 'lng', 'lat', 'aoi_id', 'aoi_type', 'accept_time', 'accept_gps_time', 'accept_gps_lng', 'accept_gps_lat', 'delivery_time', 'delivery_gps_time', 'delivery_gps_lng', 'delivery_gps_lat', 'ds']

   order_id  region_id       city  courier_id        lng       lat  aoi_id  aoi_type     accept_time accept_gps_time  accept_gps_lng  accept_gps_lat   delivery_time delivery_gps_time  delivery_gps_lng  delivery_gps_lat    ds
0   2031782         10  Chongqing          73  108.71571  30.90228      50        14  10-22 10:26:00  10-22 10:26:00       108.71826        30.95587  10-22 17:04:00    10-22 17:04:00         108.66361          30.96702  1022
1   4285071         10  Chongqing        3605  108.71639  30.90269      50        14  09-07 10:13:00  09-07 10:13:00       108.71791        30.95635  09-09 15:44:

ROUTE PREDICTION

In [4]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

print("=== ROUTE PREDICTION (1/5 ORDERS, OPTIMIZED) ===\n")

# 1. SAMPLE 1/5 ORDERS FROM EACH CITY
print("1. Sampling 1/5 Orders from Each City...")
# Rows without a city cannot be matched by the equality filter below, so
# missing values are left out of the list of cities to sample
all_cities = df_delivery['city'].dropna().unique()

samples = []
for city in all_cities:
    city_data = df_delivery[df_delivery['city'] == city]
    city_size = len(city_data)
    # Target one fifth of the city's orders with a floor of 20, but never more
    # than the city has: sampling without replacement raises ValueError otherwise
    sample_size = min(city_size, max(20, city_size // 5))
    sample = city_data.sample(n=sample_size, random_state=42)
    samples.append(sample)
    print(f"   {city:15} → {sample_size:5} orders sampled")

df_sample = pd.concat(samples, ignore_index=True)
print(f"\n   ✓ Total sample: {len(df_sample)} orders\n")

# 2. ROAD NETWORK ANALYSIS
print("2. Analyzing Road Network...")
total_roads = len(df_roadmap)
print(f"   Total roads: {total_roads:,}")
print(f"   Road classes: {df_roadmap['fclass'].nunique()}")
print(f"   Cities in roadmap: {df_roadmap['city'].nunique()}\n")

# 3. ROUTE OPTIMIZATION PARAMETERS
print("3. Setting Route Optimization Parameters...")

# Base speed per city; combined later with distance (km) and a factor of 60
# to obtain minutes, i.e. it is treated as km/h
city_speed = {
    'Chongqing': 25, 'Hangzhou': 28, 'Shanghai': 30,
    'Jilin': 32, 'Yantai': 28
}

# Per-city (min, max) bounds used when generating a distance for each order
city_dist = {
    'Chongqing': (3, 15), 'Hangzhou': (2, 12), 'Shanghai': (2, 20),
    'Jilin': (2, 18), 'Yantai': (2, 10)
}

# Route types: multiplier applied to the city speed, plus a display priority
route_types = {
    'primary': {'speed_factor': 1.0, 'priority': 1},      # Highways
    'secondary': {'speed_factor': 0.85, 'priority': 2},   # Main roads
    'tertiary': {'speed_factor': 0.70, 'priority': 3},    # Regional roads
    'service': {'speed_factor': 0.50, 'priority': 4},     # Service roads
    'residential': {'speed_factor': 0.60, 'priority': 5}  # Residential
}

print("   ✓ Parameters ready\n")

# 4. CALCULATE OPTIMIZED ROUTES
print("4. Calculating Optimized Routes...")

# Keep only orders that have both coordinates
df_valid = df_sample.dropna(subset=['lat', 'lng']).copy()
print(f"   Valid records: {len(df_valid)}")

# Attach per-city parameters; cities missing from the tables fall back to
# a speed of 28 and a (2, 15) distance range
df_valid['city_speed'] = df_valid['city'].map(city_speed).fillna(28)
df_valid['dist_min'] = df_valid['city'].map(lambda x: city_dist.get(x, (2, 15))[0])
df_valid['dist_max'] = df_valid['city'].map(lambda x: city_dist.get(x, (2, 15))[1])

# Distances are simulated, not measured: a uniform draw between the city's
# bounds. The global NumPy seed is fixed here so the draws are reproducible.
np.random.seed(42)
df_valid['distance_km'] = (df_valid['dist_min'] +
                           np.random.uniform(0, 1, len(df_valid)) *
                           (df_valid['dist_max'] - df_valid['dist_min'])).round(2)
# Assign route types based on distance (simulate optimal routing)
def assign_route_type(distance):
    """Pick the road class used for a route of the given distance.

    Distances above 10 map to 'primary', distances above 5 (up to and
    including 10) map to 'secondary', and everything else maps to
    'residential'.
    """
    # Thresholds are checked from the longest distance down; the first one
    # that the distance strictly exceeds decides the class
    for lower_bound, road_class in ((10, 'primary'), (5, 'secondary')):
        if distance > lower_bound:
            return road_class
    return 'residential'

# Classify every order by its generated distance
df_valid['route_type'] = df_valid['distance_km'].apply(assign_route_type)

# Effective speed = city speed scaled by the route type's factor
# (a route type without a configured factor counts as 1.0)
df_valid['route_factor'] = df_valid['route_type'].map(
    {name: spec.get('speed_factor', 1.0) for name, spec in route_types.items()}
).fillna(1.0)
df_valid['actual_speed'] = df_valid['city_speed'].mul(df_valid['route_factor']).round(1)
df_valid['travel_minutes'] = (
    df_valid['distance_km'].div(df_valid['actual_speed']).mul(60).round(1)
)

# Random handling overheads: processing is drawn first, waiting second,
# both from the global NumPy random state
df_valid['processing_minutes'] = np.random.uniform(3, 5, len(df_valid)).round(1)
df_valid['waiting_minutes'] = np.random.uniform(2, 4, len(df_valid)).round(1)
df_valid['total_route_time'] = (
    df_valid['travel_minutes']
    .add(df_valid['processing_minutes'])
    .add(df_valid['waiting_minutes'])
    .round(1)
)

# Keep only the columns that describe the resulting route
df_routes = df_valid.loc[:, [
    'order_id', 'city', 'distance_km', 'route_type', 'city_speed', 'actual_speed',
    'travel_minutes', 'processing_minutes', 'waiting_minutes', 'total_route_time',
]].copy()

print(f"   ✓ Routes calculated for {len(df_routes)} orders\n")

# 5. DISPLAY RESULTS
print("5. Route Prediction Results:")
print(f"\n   Overall Statistics:")
print(f"   - Avg Route Time: {df_routes['total_route_time'].mean():.1f} min")
print(f"   - Min/Max: {df_routes['total_route_time'].min():.1f} / {df_routes['total_route_time'].max():.1f} min")
print(f"   - Avg Travel Speed: {df_routes['actual_speed'].mean():.1f} km/h\n")

# The listing shows the first 15 rows in their existing order (no ranking)
print("   Top 15 Routes:")
print(
    df_routes.head(15)[
        ['order_id', 'city', 'distance_km', 'route_type', 'actual_speed', 'total_route_time']
    ].to_string(index=False)
)

# 6. ROUTE TYPE DISTRIBUTION
print("\n6. Route Type Distribution:")
route_dist = df_routes['route_type'].value_counts()
for route_type, count in route_dist.items():
    pct = (count / len(df_routes) * 100)
    priority = route_types[route_type]['priority']
    print(f"   {route_type:12} Priority-{priority}: {count:6,} routes ({pct:5.1f}%)")

# 7. BY CITY SUMMARY
print("\n7. Routes Summary by City:")
# Named aggregation produces the flat, labelled columns directly
city_summary = (
    df_routes.groupby('city')
    .agg(**{
        'Routes': ('order_id', 'count'),
        'Avg Dist': ('distance_km', 'mean'),
        'Avg Speed': ('actual_speed', 'mean'),
        'Avg Time': ('total_route_time', 'mean'),
        'Min Time': ('total_route_time', 'min'),
        'Max Time': ('total_route_time', 'max'),
    })
    .round(1)
)
print(city_summary)

# 8. ROUTE EFFICIENCY METRICS
print("\n8. Route Efficiency Metrics:")
# Kilometres covered per minute of total route time
df_routes['efficiency_ratio'] = (
    df_routes['distance_km'].div(df_routes['total_route_time']).round(2)
)
print(f"   Avg Efficiency (km/min): {df_routes['efficiency_ratio'].mean():.2f}")
print(f"   Best Efficiency: {df_routes['efficiency_ratio'].max():.2f}")
print(f"   Worst Efficiency: {df_routes['efficiency_ratio'].min():.2f}\n")

print("✓ ROUTE PREDICTION COMPLETE!")

=== ROUTE PREDICTION (1/5 ORDERS, OPTIMIZED) ===

1. Sampling 1/5 Orders from Each City...
   Chongqing       → 186270 orders sampled
   Hangzhou        → 372320 orders sampled
   Jilin           →  6283 orders sampled
   Shanghai        → 296772 orders sampled
   Yantai          → 41286 orders sampled

   ✓ Total sample: 902931 orders

2. Analyzing Road Network...
   Total roads: 531,280
   Road classes: 27
   Cities in roadmap: 5

3. Setting Route Optimization Parameters...
   ✓ Parameters ready

4. Calculating Optimized Routes...
   Valid reco

Visualize