In [1]:
# File: small_data_analysis.py
import pandas as pd
import time
import random

# Create small e-commerce data (like a local shop)
def create_small_dataset(n_records=100, seed=None):
    """Build a small synthetic e-commerce sales table.

    Args:
        n_records: Number of sales rows to generate. Defaults to 100, the
            size used by the "local shop" example.
        seed: Optional seed for a private ``random.Random`` generator. When
            ``None`` (the default) the module-level ``random`` functions are
            used, so results differ between runs unless ``random.seed`` was
            called beforehand.

    Returns:
        pandas.DataFrame with the columns ``order_id``, ``product``,
        ``price``, ``customer_city`` and ``rating``, one row per order.
    """
    products = ['iPhone', 'Samsung', 'OnePlus', 'Vivo', 'Oppo']
    cities = ['Bangalore', 'Mumbai', 'Delhi', 'Chennai']
    columns = ['order_id', 'product', 'price', 'customer_city', 'rating']

    # A private generator keeps seeded runs reproducible without touching the
    # global random state; the module itself exposes the same choice/randint API.
    rng = random if seed is None else random.Random(seed)

    data = []
    for i in range(n_records):
        data.append({
            'order_id': i + 1,
            'product': rng.choice(products),
            'price': rng.randint(15000, 80000),
            'customer_city': rng.choice(cities),
            'rating': rng.randint(1, 5)
        })

    # Explicit columns keep the schema intact even when n_records is 0.
    return pd.DataFrame(data, columns=columns)

# Create and analyze small dataset
print("📊 SMALL DATA ANALYSIS")
df_small = create_small_dataset()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can jump or be too coarse
# for a millisecond-scale analysis like this one.
start_time = time.perf_counter()

# Simple analysis
print(f"Total orders: {len(df_small)}")
print(f"Average price: ₹{df_small['price'].mean():.0f}")
print("\nTop products:")
print(df_small['product'].value_counts())
print("\nCity-wise orders:")
print(df_small['customer_city'].value_counts())

end_time = time.perf_counter()
print(f"\n⏱️ Processing time: {end_time - start_time:.3f} seconds")

📊 SMALL DATA ANALYSIS
Total orders: 100
Average price: ₹50917

Top products:
product
Oppo       24
iPhone     24
Samsung    22
OnePlus    15
Vivo       15
Name: count, dtype: int64

City-wise orders:
customer_city
Mumbai       31
Chennai      26
Bangalore    22
Delhi        21
Name: count, dtype: int64

⏱️ Processing time: 0.011 seconds


In [2]:
# File: lab1_setup.py
import random
import sys
import time

import pandas as pd

print("✅ Big Data Analytics Lab 1")
# Report the version of the interpreter that is actually running; the first
# whitespace-separated token of sys.version is the "X.Y.Z" version string.
print(f"✅ Python version check: {sys.version.split()[0]}")
print("✅ Pandas imported successfully!")

# Create a tiny dataset to start
small_store = {
    'product': ['iPhone', 'Samsung', 'OnePlus', 'iPhone', 'Samsung'],
    'price': [80000, 60000, 45000, 80000, 65000],
    'sold_date': ['2024-01-01', '2024-01-02', '2024-01-01', '2024-01-03', '2024-01-02']
}

df = pd.DataFrame(small_store)
print("\n📱 Small Electronics Store Data:")
print(df)
print(f"Dataset size: {len(df)} records")

✅ Big Data Analytics Lab 1
✅ Python version check:
✅ Pandas imported successfully!

📱 Small Electronics Store Data:
   product  price   sold_date
0   iPhone  80000  2024-01-01
1  Samsung  60000  2024-01-02
2  OnePlus  45000  2024-01-01
3   iPhone  80000  2024-01-03
4  Samsung  65000  2024-01-02
Dataset size: 5 records


In [3]:
# File: scaling_challenge.py
import pandas as pd
import time
import random

def create_dataset(size, seed=None):
    """Create a synthetic order dataset with ``size`` rows.

    Args:
        size: Number of order records to generate.
        seed: Optional seed for a private ``random.Random`` generator. When
            ``None`` (the default) the module-level ``random`` functions are
            used, so results differ between runs unless ``random.seed`` was
            called beforehand.

    Returns:
        pandas.DataFrame with the columns ``order_id``, ``product``,
        ``price``, ``customer_city``, ``rating`` and ``order_date`` (a
        ``2024-MM-DD`` string). The columns are present even when ``size``
        is 0, so downstream column access does not raise ``KeyError``.
    """
    products = ['iPhone', 'Samsung', 'OnePlus', 'Vivo', 'Oppo', 'Xiaomi', 'Realme']
    cities = ['Bangalore', 'Mumbai', 'Delhi', 'Chennai', 'Pune', 'Hyderabad', 'Kolkata']
    columns = ['order_id', 'product', 'price', 'customer_city', 'rating', 'order_date']

    # A private generator keeps seeded runs reproducible without touching the
    # global random state; the module itself exposes the same choice/randint API.
    rng = random if seed is None else random.Random(seed)

    print(f"🔄 Creating {size:,} records...")

    # Build all records first and construct the frame once at the end.
    # The day is capped at 28 so every month yields a valid calendar date.
    data = [
        {
            'order_id': i + 1,
            'product': rng.choice(products),
            'price': rng.randint(5000, 100000),
            'customer_city': rng.choice(cities),
            'rating': rng.randint(1, 5),
            'order_date': f"2024-{rng.randint(1, 12):02d}-{rng.randint(1, 28):02d}"
        }
        for i in range(size)
    ]

    # Explicit columns keep the schema intact for an empty record list.
    return pd.DataFrame(data, columns=columns)

def analyze_data(df, dataset_name):
    """Run basic analytics on an orders frame and report how long they took.

    Prints the order count, the mean of the ``price`` column and the most
    frequent value of the ``product`` column, followed by the elapsed time.
    An empty frame is reported as having no data instead of raising.

    Args:
        df: pandas.DataFrame with ``price`` and ``product`` columns (the
            columns are only read when the frame has at least one row).
        dataset_name: Label printed in the section header.

    Returns:
        float: Seconds spent computing the analytics (printing excluded).
    """
    print(f"\n📈 ANALYZING {dataset_name}")
    # perf_counter() is monotonic and high-resolution, which matters because
    # these operations finish in a few milliseconds.
    start_time = time.perf_counter()

    # Basic analytics
    total_orders = len(df)
    has_rows = total_orders > 0
    if has_rows:
        avg_price = df['price'].mean()
        top_product = df['product'].value_counts().head(1)
    else:
        # Nothing to aggregate; indexing the first value count would fail.
        avg_price = None
        top_product = None

    processing_time = time.perf_counter() - start_time

    print(f"Total orders: {total_orders:,}")
    if has_rows:
        print(f"Average price: ₹{avg_price:.0f}")
        print(f"Top product: {top_product.index[0]} ({top_product.iloc[0]} orders)")
    else:
        print("Average price: n/a (no orders)")
        print("Top product: n/a (no orders)")
    print(f"⏱️ Processing time: {processing_time:.3f} seconds")

    return processing_time

# Progressive challenge: (record count, label) pairs, smallest to largest
datasets = [
    (1000, "1K Dataset (Small Shop)"),
    (10000, "10K Dataset (Chain Store)"),
    (50000, "50K Dataset (E-commerce Site)"),
    (100000, "100K Dataset (Big E-commerce)")
]

# Collected (size, seconds) pairs for the summary table below
times = []
for size, name in datasets:
    df = create_dataset(size)
    process_time = analyze_data(df, name)
    times.append((size, process_time))

    print("="*50)

    # Ask students to observe
    if size == 50000:
        try:
            input("👀 Notice the processing time increasing? Press Enter to continue...")
        except (EOFError, NotImplementedError):
            # No interactive stdin is available (e.g. an automated "Run All"
            # or a piped script run): the pause is only a teaching aid, so
            # skip it rather than abort the whole cell.
            print("(non-interactive run: skipping pause)")

# Show the performance degradation
print("\n📊 PERFORMANCE SUMMARY:")
print("Dataset Size | Processing Time")
print("-" * 30)
for size, ptime in times:
    print(f"{size:,} records | {ptime:.3f} seconds")

🔄 Creating 1,000 records...

📈 ANALYZING 1K Dataset (Small Shop)
Total orders: 1,000
Average price: ₹54324
Top product: Vivo (159 orders)
⏱️ Processing time: 0.001 seconds
🔄 Creating 10,000 records...

📈 ANALYZING 10K Dataset (Chain Store)
Total orders: 10,000
Average price: ₹52702
Top product: OnePlus (1492 orders)
⏱️ Processing time: 0.002 seconds
🔄 Creating 50,000 records...

📈 ANALYZING 50K Dataset (E-commerce Site)
Total orders: 50,000
Average price: ₹52505
Top product: OnePlus (7239 orders)
⏱️ Processing time: 0.002 seconds
🔄 Creating 100,000 records...

📈 ANALYZING 100K Dataset (Big E-commerce)
Total orders: 100,000
Average price: ₹52453
Top product: Samsung (14410 orders)
⏱️ Processing time: 0.004 seconds

📊 PERFORMANCE SUMMARY:
Dataset Size | Processing Time
------------------------------
1,000 records | 0.001 seconds
10,000 records | 0.002 seconds
50,000 records | 0.002 seconds
100,000 records | 0.004 seconds


In [4]:
# File: reality_check.py
import pandas as pd

# Show them the reality
print("🌍 BIG DATA REALITY CHECK")
print("="*40)

# Illustrative figures, kept as display strings because they are only printed
real_world_data = {
    'Company': ['Netflix', 'Amazon', 'Flipkart', 'Google', 'Facebook'],
    'Daily_Records': ['500 Million', '1 Billion', '100 Million', '8 Billion', '4 Billion'],
    'Total_Storage': ['15 Petabytes', '100+ Petabytes', '10+ Petabytes', '15+ Exabytes', '300+ Petabytes']
}

df_reality = pd.DataFrame(real_world_data)
print(df_reality.to_string(index=False))

# Single source of truth for the numbers quoted in the comparison and used in
# the calculation, so the printed text cannot drift away from the math.
laptop_records = 100_000       # size of the largest dataset in the scaling exercise
netflix_daily = 500_000_000    # matches the '500 Million' entry in the table above
scale_factor = netflix_daily // laptop_records

print("\n🤯 SCALE COMPARISON:")
print(f"Your laptop just processed: {laptop_records:,} records")
print(f"Netflix processes daily: {netflix_daily:,} records")
print(f"That's {scale_factor:,} times more data EVERY DAY!")

print("\n❓ QUESTIONS FOR YOU:")
print("1. How long would your laptop take to process Netflix's daily data?")
print("2. What if Netflix needed results in real-time (seconds)?")
print("3. What if your laptop crashes halfway through?")

# Simple calculation: linear extrapolation from the per-100K cost
your_time_per_100k = 0.1  # Assume 0.1 seconds for 100K records
time_needed = (netflix_daily / laptop_records) * your_time_per_100k

print("\n🔢 MATH:")
print("Time to process Netflix's daily data on your laptop:")
print(f"{time_needed:.0f} seconds = {time_needed/60:.0f} minutes = {time_needed/3600:.1f} hours")

🌍 BIG DATA REALITY CHECK
 Company Daily_Records  Total_Storage
 Netflix   500 Million   15 Petabytes
  Amazon     1 Billion 100+ Petabytes
Flipkart   100 Million  10+ Petabytes
  Google     8 Billion   15+ Exabytes
Facebook     4 Billion 300+ Petabytes

🤯 SCALE COMPARISON:
Your laptop just processed: 100,000 records
Netflix processes daily: 500,000,000 records
That's 5,000 times more data EVERY DAY!

❓ QUESTIONS FOR YOU:
1. How long would your laptop take to process Netflix's daily data?
2. What if Netflix needed results in real-time (seconds)?
3. What if your laptop crashes halfway through?

🔢 MATH:
Time to process Netflix's daily data on your laptop:
500 seconds = 8 minutes = 0.1 hours
