In [2]:
# Project root directory; db_path and the schema output path are built from it.
# NOTE(review): this is a machine-specific absolute path, so anyone else running
# the notebook has to edit it first.
base_folder = '/Users/chengxiaowei/Desktop/Python/eas503/Xiaowei_finalProject'
# IPython magic: change the kernel's working directory to the project root.
%cd "{base_folder}"

import sqlite3
import pandas as pd

# Location of the SQLite database; change this line if your .db file lives
# somewhere else (keeping it in a data folder under base_folder is recommended).
db_path = f"{base_folder}/xiaowei_data/flightdelays.db"

# Join the flight table to its carrier, airport (once for origin, once for
# destination) and tail lookup tables so the result carries readable codes
# instead of the *_id keys. These are inner joins: a flight row whose key has
# no match in a lookup table is left out of the result.
flight_query = """
    SELECT
        f.flight_id,
        f.date,
        f.schedtime,
        f.deptime,
        f.distance,
        f.flightnumber,
        f.weather,
        f.dayweek,
        f.daymonth,
        c.code  AS carrier,
        a1.code AS origin,
        a2.code AS dest,
        t.tailnu AS tailnu,
        f.delay
    FROM flight AS f
    JOIN dim_carrier AS c ON c.carrier_id = f.carrier_id
    JOIN dim_airport  AS a1 ON a1.airport_id = f.origin_airport_id
    JOIN dim_airport  AS a2 ON a2.airport_id = f.dest_airport_id
    JOIN dim_tail     AS t ON t.tail_id = f.tail_id
    ORDER BY f.flight_id
    """

conn = sqlite3.connect(db_path)
try:
    flight = pd.read_sql_query(flight_query, conn)
finally:
    # Release the database handle even when the query raises (missing table,
    # wrong path, ...); otherwise the open connection lingers in the kernel.
    # try/finally is used because `with conn:` only scopes a transaction and
    # does not close the connection.
    conn.close()

flight.head()


/Users/chengxiaowei/Desktop/Python/eas503/Xiaowei_finalProject


Unnamed: 0,flight_id,date,schedtime,deptime,distance,flightnumber,weather,dayweek,daymonth,carrier,origin,dest,tailnu,delay
0,0,1/1/2004,1455,1455,184,5935,0,4,1,OH,BWI,JFK,N940CA,ontime
1,1,1/1/2004,1640,1640,213,6155,0,4,1,DH,DCA,JFK,N405FJ,ontime
2,2,1/1/2004,1245,1245,229,7208,0,4,1,DH,IAD,LGA,N695BR,ontime
3,3,1/1/2004,1715,1709,229,7215,0,4,1,DH,IAD,LGA,N662BR,ontime
4,4,1/1/2004,1039,1035,229,7792,0,4,1,DH,IAD,LGA,N698BR,ontime


In [3]:
import json

# Banner announcing the schema-analysis run.
print("=" * 80, "ANALYZING FLIGHT DELAYS DATA FOR STREAMLIT APP", "=" * 80, sep="\n")

# ---- Adjust these lists to match the columns your final model uses ----
# Columns summarised with min / max / mean / median.
numerical_features = ["schedtime", "distance", "dayweek", "daymonth", "flightnumber"]

# Columns summarised by their distinct values and counts.
# "weather" is a 0/1 flag and is listed here rather than with the numeric columns.
categorical_features = ["weather", "carrier", "origin", "dest"]

# Schema container: one sub-dictionary per feature kind, filled in further down.
data_schema = dict(numerical={}, categorical={})

# Numerical features: record summary statistics for each column in the schema
# and print them as one aligned table row per feature.
print("\n" + "-" * 80, "NUMERICAL FEATURES", "-" * 80, sep="\n")
print(f"{'Feature':<25} {'Min':<15} {'Max':<15} {'Mean':<15} {'Median':<15}")
print("-" * 80)

for feature in numerical_features:
    # The column may hold numbers stored as strings, so force a numeric dtype
    # first; anything that cannot be parsed becomes NaN.
    col = pd.to_numeric(flight[feature], errors="coerce")

    # Convert each statistic to a built-in float so json can serialise it.
    min_val, max_val, mean_val, median_val = (
        float(stat()) for stat in (col.min, col.max, col.mean, col.median)
    )

    data_schema["numerical"][feature] = dict(
        min=min_val,
        max=max_val,
        mean=mean_val,
        median=median_val,
    )

    cells = (min_val, max_val, mean_val, median_val)
    print(f"{feature:<25} " + " ".join(f"{value:<15.2f}" for value in cells))

# Analyze categorical features: for each column record its distinct values and
# how often each one occurs, then print the counts with percentages.
print("\n" + "-" * 80)
print("CATEGORICAL FEATURES")
print("-" * 80)

# Denominator for the percentages printed below: every row of `flight`,
# including rows whose value for the feature is missing. It does not change
# per feature, so it is computed once.
total_rows = len(flight)

for feature in categorical_features:
    # Drop missing values and convert to strings (more robust for Streamlit
    # select boxes).
    series = flight[feature].dropna().astype(str)

    if feature == "weather":
        # weather is a 0/1 flag: turn it back into integers (entries that do
        # not parse as numbers are dropped) and always offer both choices,
        # even if only one of them occurs in the data.
        series = pd.to_numeric(series, errors="coerce").dropna().astype(int)
        unique_values = [0, 1]
    else:
        # Values are already strings here; only surrounding whitespace needs
        # removing so that e.g. "JFK " and "JFK" count as the same category.
        series = series.str.strip()
        unique_values = sorted(series.unique().tolist())

    # Computed once, after the branch-specific cleaning above.
    value_counts = series.value_counts().to_dict()

    data_schema["categorical"][feature] = {
        "unique_values": unique_values,
        "value_counts": value_counts
    }

    print(f"\n{feature}:")
    print(f"  Unique values: {unique_values}")
    print("  Value counts:")
    for value, count in value_counts.items():
        print(f"    {value}: {count} ({count/total_rows*100:.1f}%)")

# Save the schema (recommended: keep it in its own file instead of overwriting
# the housing schema).
output_file = f"{base_folder}/xiaowei_data/flightdelays_schema.json"
with open(output_file, "w") as schema_file:
    json.dump(data_schema, schema_file, indent=2)

# Separator lines reused by the status messages below.
heavy_rule = "=" * 80
light_rule = "-" * 80

# Confirm where the file was written.
print("\n" + heavy_rule, f"✓ Data schema saved to {output_file}", heavy_rule, sep="\n")

# Echo the generated schema so it can be inspected in the notebook.
print("\n" + light_rule, "GENERATED SCHEMA (flightdelays_schema.json)", light_rule, sep="\n")
print(json.dumps(data_schema, indent=2))

# Closing banner.
print("\n" + heavy_rule, "DONE! Use flightdelays_schema.json in your Streamlit app", heavy_rule, sep="\n")


ANALYZING FLIGHT DELAYS DATA FOR STREAMLIT APP

--------------------------------------------------------------------------------
NUMERICAL FEATURES
--------------------------------------------------------------------------------
Feature                   Min             Max             Mean            Median         
--------------------------------------------------------------------------------
schedtime                 600.00          2130.00         1371.94         1455.00        
distance                  169.00          229.00          211.87          214.00         
dayweek                   1.00            7.00            3.91            4.00           
daymonth                  1.00            31.00           16.02           16.00          
flightnumber              746.00          7924.00         3815.09         2385.00        

--------------------------------------------------------------------------------
CATEGORICAL FEATURES
-----------------------------------------------