In [1]:
import pandas as pd
import sqlite3
import random
from contextlib import contextmanager

def generate_sample_data(num_records=1000, csv_path='/content/weather_data.csv'):
    """Generate random weather records and write them to a CSV file.

    Each record has a sequential ``record_id`` starting at 1, a ``year``
    picked at random from 1900 through 2020, and a ``temperature_c`` drawn
    with ``random.uniform(-50, 50)``.

    Args:
        num_records: Number of rows to generate.
        csv_path: Where to write the CSV. The default is the location that
            used to be hard-coded, so existing callers are unaffected; pass
            another path when that directory does not exist on the host.

    Returns:
        The path the CSV file was written to.
    """
    years = list(range(1900, 2021))
    # All years are drawn before any temperature so that a seeded ``random``
    # yields the same dataset as earlier versions of this function.
    data = {
        'record_id': range(1, num_records + 1),
        'year': [random.choice(years) for _ in range(num_records)],
        'temperature_c': [random.uniform(-50, 50) for _ in range(num_records)],
    }
    df = pd.DataFrame(data)
    df.to_csv(csv_path, index=False)
    print(f"Sample data generated and saved to {csv_path} (simulating HDFS file).")
    return csv_path

@contextmanager
def sqlite_connection(db_name):
    """Open a SQLite connection for the duration of a ``with`` block.

    Args:
        db_name: Database file path (or any target ``sqlite3.connect``
            accepts).

    Yields:
        The open ``sqlite3.Connection``. It is closed when the block exits,
        whether normally or through an exception. No commit is issued here.
    """
    handle = sqlite3.connect(db_name)
    try:
        yield handle
    finally:
        # Runs on both the success and the error path.
        handle.close()

def sqoop_like_import(csv_path, db_name, table_name):
    """Load a CSV file into a SQLite table and index its ``year`` column.

    The table is replaced if it already exists. The index is named
    ``idx_<table_name>_year`` so that importing several tables into the same
    database file does not fail on a duplicate index name.

    Args:
        csv_path: Path of the CSV file to read.
        db_name: SQLite database file to write into.
        table_name: Name of the destination table.
    """
    def quote(identifier):
        # Double-quote an SQL identifier, escaping embedded double quotes.
        return '"' + identifier.replace('"', '""') + '"'

    df = pd.read_csv(csv_path)
    print(f"Sqoop-like export: Read {len(df)} records from {csv_path} (HDFS).")

    # SQLite index names are unique per database, not per table, so a fixed
    # name would clash on the second table imported into the same file.
    index_name = f'idx_{table_name}_year'

    with sqlite_connection(db_name) as conn:
        df.to_sql(table_name, conn, if_exists='replace', index=False)
        print(f"Sqoop-like import: Loaded data into {db_name}.{table_name} (Hive table).")

        conn.execute(
            f'CREATE INDEX IF NOT EXISTS {quote(index_name)} '
            f'ON {quote(table_name)}(year)'
        )
        # The context manager only closes the connection, so commit
        # explicitly rather than relying on the driver's implicit behaviour.
        conn.commit()
        print(f"Index '{index_name}' created on {table_name}.year.")

def generate_weather_report(db_name, table_name):
    """Return the per-year minimum and maximum temperature of a table.

    Args:
        db_name: SQLite database file to query.
        table_name: Table holding ``year`` and ``temperature_c`` columns.

    Returns:
        A DataFrame with columns ``year``, ``min_temp_c`` and ``max_temp_c``,
        ordered by year, with both temperature columns rounded to one
        decimal place.
    """
    # The table name is interpolated directly; SQL placeholders cannot stand
    # in for identifiers.
    stats_sql = f'''
            SELECT year,
                   MIN(temperature_c) AS min_temp_c,
                   MAX(temperature_c) AS max_temp_c
            FROM {table_name}
            GROUP BY year
            ORDER BY year
        '''
    with sqlite_connection(db_name) as conn:
        yearly = pd.read_sql_query(stats_sql, conn)

    for column in ('min_temp_c', 'max_temp_c'):
        yearly[column] = yearly[column].round(1)
    return yearly

if __name__ == "__main__":
    # Demo driver: generate a CSV, load it into SQLite, print the per-year
    # min/max report, then show the first few raw rows.
    print("=== Simulating Sqoop Export/Import to Hive ===")

    csv_path = generate_sample_data(1000)

    db_name = 'weather_hive.db'
    table_name = 'weather_data'
    sqoop_like_import(csv_path, db_name, table_name)

    print("\nGenerating Weather Temperature Statistics Report...")
    report = generate_weather_report(db_name, table_name)

    print("\n=== Weather Report ===")
    # The degree sign had been mis-decoded into "째"; restored to "°".
    print("Year\tMin Temp (°C)\tMax Temp (°C)")
    print("-" * 35)
    # itertuples keeps each column's own dtype; iterrows would upcast the
    # whole row to float.
    for row in report.itertuples(index=False):
        print(f"{int(row.year)}\t{row.min_temp_c}\t\t{row.max_temp_c}")

    print(f"\nSample data from {table_name} (first 5 rows):")
    with sqlite_connection(db_name) as conn:
        sample_data = pd.read_sql_query(f'SELECT * FROM {table_name} LIMIT 5', conn)
        print(sample_data)

=== Simulating Sqoop Export/Import to Hive ===
Sample data generated and saved to /content/weather_data.csv (simulating HDFS file).
Sqoop-like export: Read 1000 records from /content/weather_data.csv (HDFS).
Sqoop-like import: Loaded data into weather_hive.db.weather_data (Hive table).
Index 'idx_year' created on weather_data.year.

Generating Weather Temperature Statistics Report...

=== Weather Report ===
Year	Min Temp (°C)	Max Temp (°C)
-----------------------------------
1900	-47.8		44.0
1901	-16.1		37.7
1902	-48.3		38.5
1903	-45.6		47.0
1904	-45.7		34.8
1905	-29.1		28.3
1906	-45.0		47.8
1907	-27.9		46.6
1908	-46.7		30.7
1909	-40.5		45.0
1910	-23.7		26.0
1911	-38.6		36.6
1912	-42.5		49.1
1913	-24.0		39.2
1914	-48.2		40.7
1915	-37.5		38.6
1916	-22.0		47.3
1917	-43.4		50.0
1918	-38.0		40.1
1919	-46.2		26.4
1920	-47.0		34.0
1921	-32.0		45.8
1922	-32.0		46.9
1923	-41.2		33.2
1924	-46.4		48.2
1925	-47.4		6.1
1926	-47.7		44.7
1927	-42.2		48.1
1928	-43.8		39.4
1929	-25.6		48.6
1930	-23.9	