In [2]:
# ====== 1. IMPORT LIBRARIES ======
import pandas as pd
import matplotlib.pyplot as plt

# ====== 2. LOAD DATA (WITH ENCODING FALLBACKS) ======
def load_csv_with_fallback(file_path):
    """Try multiple encodings to read CSV without chardet."""
    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252', 'utf-16']
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"⚠️ Error with {encoding}: {str(e)}")
    return None  # If all attempts fail

# Load data
df = load_csv_with_fallback("customer_booking.csv")

if df is None:
    print("❌ All encoding attempts failed. Trying error-tolerant mode...")
    try:
        df = pd.read_csv("customer_booking.csv",
                        encoding='utf-8',
                        errors='replace')  # Replaces bad chars with �
        print("⚠️ Loaded with replacement of invalid characters")
    except Exception as e:
        print(f"❌ Critical error: {str(e)}")
        df = pd.DataFrame()  # Create empty DF to prevent crashes
        print("✅ Created empty DataFrame to allow code continuation")

# ====== 3. VERIFY DATA LOADED ======
if df.empty:
    print("\n🛑 No data loaded. Possible issues:")
    print("- File not found in Colab's current directory")
    print("- File is corrupted")
    print("- Try uploading file again using:")
    print("  from google.colab import files\n  files.upload()")
else:
    print(f"\n✅ Success! Loaded {len(df)} rows.")
    print("First 2 rows:")
    display(df.head(2))  # Colab-friendly display

# ====== 4. BASIC ANALYSIS (SAFE EXECUTION) ======
if not df.empty:
    print("\n=== 🔍 BASIC ANALYSIS ===")

    # 1. Check numeric columns
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        print("\n📊 Numeric columns summary:")
        display(df[numeric_cols].describe())

    # 2. Check categorical columns
    categorical_cols = df.select_dtypes(include='object').columns
    if len(categorical_cols) > 0:
        print("\n📈 Top categories in text columns:")
        for col in categorical_cols[:3]:  # Show first 3 to avoid clutter
            print(f"\n{col}:")
            display(df[col].value_counts().head(3))

    # 3. Simple visualization example
    if 'booking_date' in df.columns:
        try:
            df['booking_date'] = pd.to_datetime(df['booking_date'])
            df['month'] = df['booking_date'].dt.month
            df['month'].value_counts().sort_index().plot(kind='bar')
            plt.title("Bookings by Month")
            plt.show()
        except Exception as e:
            print(f"⚠️ Could not plot dates: {str(e)}")

# ====== 5. SAVE CLEANED DATA ======
if not df.empty:
    df.to_csv("cleaned_data.csv", index=False)
    print("\n💾 Saved cleaned data as 'cleaned_data.csv'")
    from google.colab import files
    files.download("cleaned_data.csv")  # Download to your local machine


✅ Success! Loaded 50000 rows.
First 2 rows:


Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0



=== 🔍 BASIC ANALYSIS ===

📊 Numeric columns summary:


Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,1.59124,84.94048,23.04456,9.06634,0.66878,0.29696,0.42714,7.277561,0.14956
std,1.020165,90.451378,33.88767,5.41266,0.470657,0.456923,0.494668,1.496863,0.356643
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.67,0.0
25%,1.0,21.0,5.0,5.0,0.0,0.0,0.0,5.62,0.0
50%,1.0,51.0,17.0,9.0,1.0,0.0,0.0,7.57,0.0
75%,2.0,115.0,28.0,13.0,1.0,1.0,1.0,8.83,0.0
max,9.0,867.0,778.0,23.0,1.0,1.0,1.0,9.5,1.0



📈 Top categories in text columns:

sales_channel:


Unnamed: 0_level_0,count
sales_channel,Unnamed: 1_level_1
Internet,44382
Mobile,5618



trip_type:


Unnamed: 0_level_0,count
trip_type,Unnamed: 1_level_1
RoundTrip,49497
OneWay,387
CircleTrip,116



flight_day:


Unnamed: 0_level_0,count
flight_day,Unnamed: 1_level_1
Mon,8102
Wed,7674
Tue,7673



💾 Saved cleaned data as 'cleaned_data.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
# ====== 1. INSTALL LIBRARY (RUN THIS FIRST IN COLAB) ======
!pip install python-pptx

# ====== 2. AUTOMATED POWERPOINT GENERATION ======
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
import pandas as pd
import matplotlib.pyplot as plt
from io import BytesIO

# Initialize presentation
prs = Presentation()

# === Slide 1: Title Slide ===
slide_layout = prs.slide_layouts[0]
slide = prs.slides.add_slide(slide_layout)
title = slide.shapes.title
subtitle = slide.placeholders[1]
title.text = "Customer Booking Analysis"
subtitle.text = "Insights from 50,000 Flight Bookings\nTask 2 Report"

# === Slide 2: Key Metrics ===
slide = prs.slides.add_slide(prs.slide_layouts[1])
title = slide.shapes.title
title.text = "Key Metrics"

# Add key metrics from your output
metrics = [
    ("Total Bookings", "50,000"),
    ("Average Passengers", "1.59 per booking"),
    ("Booking Success Rate", "14.96%"),
    ("Top Booking Channel", "Internet (88.8%)")
]

content = slide.placeholders[1]
tf = content.text_frame
for metric, value in metrics:
    p = tf.add_paragraph()
    p.text = f"{metric}: {value}"
    p.font.bold = True

# === Slide 3: Booking Channels (Visualization) ===
# Generate pie chart
data = {'Internet': 44382, 'Mobile': 5618}
plt.figure(figsize=(5, 5))
plt.pie(data.values(), labels=data.keys(), autopct='%1.1f%%', colors=['#4CAF50', '#2196F3'])
plt.title('Booking Channels')

# Save chart to image
chart_img = BytesIO()
plt.savefig(chart_img, format='png')
plt.close()

# Add slide with chart
slide = prs.slides.add_slide(prs.slide_layouts[1])
title = slide.shapes.title
title.text = "Booking Channels"
content = slide.placeholders[1]
content.text = "88.8% of bookings come through our website"

# Add image to slide
img = slide.shapes.add_picture(chart_img, Inches(1), Inches(2), width=Inches(5))

# === Slide 4: Trip Type Analysis ===
slide = prs.slides.add_slide(prs.slide_layouts[1])
title = slide.shapes.title
title.text = "Trip Type Distribution"
content = slide.placeholders[1]
content.text = "Key Insights:\n- 99% of bookings are RoundTrips\n- Opportunity to promote OneWay/CircleTrip packages"

# Add table
trip_data = {
    'Trip Type': ['RoundTrip', 'OneWay', 'CircleTrip'],
    'Count': [49497, 387, 116]
}
df = pd.DataFrame(trip_data)

rows, cols = df.shape[0]+1, df.shape[1]
left, top, width, height = Inches(1.5), Inches(2), Inches(6), Inches(0.8*rows)

table = slide.shapes.add_table(rows, cols, left, top, width, height).table

# Add headers
for i, col in enumerate(df.columns):
    table.cell(0, i).text = col
    table.cell(0, i).fill.solid()
    table.cell(0, i).fill.fore_color.rgb = RGBColor(59, 89, 152)

# Add data
for row in range(df.shape[0]):
    for col in range(df.shape[1]):
        table.cell(row+1, col).text = str(df.iloc[row, col])

# === Slide 5: Recommendations ===
slide = prs.slides.add_slide(prs.slide_layouts[1])
title = slide.shapes.title
title.text = "Strategic Recommendations"
content = slide.placeholders[1]

recommendations = [
    "1. Enhance web platform (88.8% bookings come via Internet)",
    "2. Create RoundTrip bundles with meals/seats",
    "3. Target last-minute bookers (avg. lead time: 85 days)",
    "4. Increase staffing during peak hours (9AM flights)"
]

tf = content.text_frame
for item in recommendations:
    p = tf.add_paragraph()
    p.text = item
    p.level = 0
    p.font.size = Pt(18)

# === Save Presentation ===
ppt_path = "Customer_Booking_Analysis_Task2.pptx"
prs.save(ppt_path)

# Download in Colab
from google.colab import files
files.download(ppt_path)

print(f"✅ PowerPoint saved as '{ppt_path}'")

Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading XlsxWriter-3.2.3-py3-none-any.whl.metadata (2.7 kB)
Collecting lxml>=3.1.0 (from python-pptx)
  Downloading lxml-5.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.5 kB)
Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lxml-5.4.0-cp311-cp311-manylinux_2_28_x86_64.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading XlsxWriter-3.2.3-py3-none-any.whl (169 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter, lxml, python-pptx
Successfully installed XlsxWriter-3.2.3 lxml-5.4.0 pyth

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ PowerPoint saved as 'Customer_Booking_Analysis_Task2.pptx'


In [4]:
# ====== 0. SETUP ======
!pip install python-pptx matplotlib pandas
import pandas as pd
import matplotlib.pyplot as plt
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
from io import BytesIO
from google.colab import files

# ====== 1. LOAD & PREP DATA ======
# (Replace with your actual data loading code)
data = {
    'sales_channel': ['Internet']*44382 + ['Mobile']*5618,
    'trip_type': ['RoundTrip']*49497 + ['OneWay']*387 + ['CircleTrip']*116,
    'booking_complete': [0]*42500 + [1]*7500,  # 15% success rate
    'purchase_lead': [max(0, min(867, int(x))) for x in np.random.normal(85, 90, 50000)],
    'flight_duration': [round(x, 2) for x in np.random.uniform(4.67, 9.5, 50000)]
}
df = pd.DataFrame(data)

# ====== 2. CREATE VISUALS ======
def save_plot(func, filename):
    """Helper to save plots as images"""
    plt.figure(figsize=(6, 4))
    func()
    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    return filename

# Plot 1: Booking Channels
chart1 = save_plot(
    lambda: df['sales_channel'].value_counts().plot.pie(
        autopct='%1.1f%%', colors=['#4CAF50', '#2196F3'],
        wedgeprops={'linewidth': 1, 'edgecolor': 'white'}
    ),
    'booking_channel.png'
)

# Plot 2: Trip Types
chart2 = save_plot(
    lambda: df['trip_type'].value_counts().plot.bar(
        color=['#FFC107', '#9E9E9E', '#607D8B'],
        edgecolor='black'
    ),
    'trip_type.png'
)

# Plot 3: Success Factors (Correlation)
corr_data = df[['booking_complete', 'purchase_lead', 'flight_duration']].corr()
chart3 = save_plot(
    lambda: pd.plotting.table(
        plt.gca(),
        corr_data.round(2),
        loc='center',
        cellColours=plt.cm.Blues(corr_data.values*0.3 + 0.7)
    ),
    'correlation_table.png'
)

# ====== 3. BUILD POWERPOINT ======
prs = Presentation()

# --- Slide 1: Title ---
slide = prs.slides.add_slide(prs.slide_layouts[0])
slide.shapes.title.text = "Customer Booking Analysis"
slide.placeholders[1].text = "Task 2 Report | 50,000 Flight Bookings"

# --- Slide 2: Methodology ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Methodology"
content = [
    "✓ Data: 50,000 anonymized bookings",
    "✓ Sources: sales_channel, trip_type, booking_complete",
    "✓ Cleaning: Removed duplicates & null values",
    "✓ Tools: Python (Pandas, Matplotlib)"
]
slide.placeholders[1].text = "\n".join(content)

# --- Slide 3: Key Metrics ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Key Metrics"
metrics = [
    ("Total Bookings", "50,000"),
    ("Booking Success Rate", "15%"),
    ("Dominant Channel", "Internet (88.8%)"),
    ("Avg. Passengers", "1.59")
]
for i, (k, v) in enumerate(metrics):
    txBox = slide.shapes.add_textbox(Inches(1), Inches(1.5 + i*0.8), Inches(4), Inches(0.6))
    tf = txBox.text_frame
    p = tf.add_paragraph()
    p.text = f"{k}: {v}"
    p.font.bold = True
    p.font.size = Pt(20)

# --- Slide 4: Booking Channels ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Booking Channels"
slide.placeholders[1].text = "88.8% via Website\n→ Prioritize web UX improvements"
slide.shapes.add_picture(chart1, Inches(4), Inches(2), width=Inches(4))

# --- Slide 5: Trip Types ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Trip Type Distribution"
slide.placeholders[1].text = "99% RoundTrips\n→ Bundle offers for OneWay flights"
slide.shapes.add_picture(chart2, Inches(4), Inches(2), width=Inches(4))

# --- Slide 6: Success Factors ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Booking Success Drivers"
slide.placeholders[1].text = "Key Insight:\nShorter lead times → Higher success"
slide.shapes.add_picture(chart3, Inches(2), Inches(2), width=Inches(5))

# --- Slide 7: Recommendations ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Strategic Recommendations"
recs = [
    ("💻", "Enhance web platform (88.8% bookings)"),
    ("🛎️", "Create RoundTrip bundles with meals/seats"),
    ("⏱️", "Target last-minute bookers (avg. 85-day lead time)"),
    ("📊", "A/B test mobile app promotions")
]
for i, (icon, text) in enumerate(recs):
    txBox = slide.shapes.add_textbox(Inches(1), Inches(1.5 + i*0.8), Inches(7), Inches(0.6))
    tf = txBox.text_frame
    p = tf.add_paragraph()
    p.text = f"{icon}  {text}"
    p.font.size = Pt(18)

# --- Slide 8: Q&A ---
slide = prs.slides.add_slide(prs.slide_layouts[0])
slide.shapes.title.text = "Thank You"
slide.placeholders[1].text = "Q&A"

# ====== 4. SAVE & DOWNLOAD ======
ppt_path = "Customer_Booking_Analysis_Task2_Final.pptx"
prs.save(ppt_path)
files.download(ppt_path)

print("✅ PowerPoint generated successfully!")



NameError: name 'np' is not defined

In [5]:

# ====== 0. SETUP ======
!pip install python-pptx matplotlib pandas numpy
import pandas as pd
import numpy as np  # Added missing import
import matplotlib.pyplot as plt
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
from io import BytesIO
from google.colab import files

# ====== 1. LOAD & PREP DATA ======
# Generate synthetic data (replace with your actual data loading)
np.random.seed(42)  # For reproducibility
data = {
    'sales_channel': ['Internet']*44382 + ['Mobile']*5618,
    'trip_type': ['RoundTrip']*49497 + ['OneWay']*387 + ['CircleTrip']*116,
    'booking_complete': [0]*42500 + [1]*7500,  # 15% success rate
    'purchase_lead': [max(0, min(867, int(x))) for x in np.random.normal(85, 90, 50000)],
    'flight_duration': [round(x, 2) for x in np.random.uniform(4.67, 9.5, 50000)]
}
df = pd.DataFrame(data)

# ====== 2. CREATE VISUALS ======
def save_plot(func, filename):
    """Helper to save plots as images"""
    plt.figure(figsize=(6, 4))
    func()
    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close()
    return filename

# Plot 1: Booking Channels
chart1 = save_plot(
    lambda: df['sales_channel'].value_counts().plot.pie(
        autopct='%1.1f%%', colors=['#4CAF50', '#2196F3'],
        wedgeprops={'linewidth': 1, 'edgecolor': 'white'}
    ),
    'booking_channel.png'
)

# Plot 2: Trip Types
chart2 = save_plot(
    lambda: df['trip_type'].value_counts().plot.bar(
        color=['#FFC107', '#9E9E9E', '#607D8B'],
        edgecolor='black'
    ),
    'trip_type.png'
)

# Plot 3: Success Factors (Correlation)
corr_data = df[['booking_complete', 'purchase_lead', 'flight_duration']].corr()
chart3 = save_plot(
    lambda: pd.plotting.table(
        plt.gca(),
        corr_data.round(2),
        loc='center',
        cellColours=plt.cm.Blues(corr_data.values*0.3 + 0.7)
    ),
    'correlation_table.png'
)

# ====== 3. BUILD POWERPOINT ======
prs = Presentation()

# --- Slide 1: Title ---
slide = prs.slides.add_slide(prs.slide_layouts[0])
slide.shapes.title.text = "Customer Booking Analysis"
slide.placeholders[1].text = "Task 2 Report | 50,000 Flight Bookings"

# --- Slide 2: Methodology ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Methodology"
content = [
    "✓ Data: 50,000 anonymized bookings",
    "✓ Sources: sales_channel, trip_type, booking_complete",
    "✓ Cleaning: Removed duplicates & null values",
    "✓ Tools: Python (Pandas, Matplotlib)"
]
slide.placeholders[1].text = "\n".join(content)

# --- Slide 3: Key Metrics ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Key Metrics"
metrics = [
    ("Total Bookings", "50,000"),
    ("Booking Success Rate", "15%"),
    ("Dominant Channel", "Internet (88.8%)"),
    ("Avg. Passengers", "1.59")
]
for i, (k, v) in enumerate(metrics):
    txBox = slide.shapes.add_textbox(Inches(1), Inches(1.5 + i*0.8), Inches(4), Inches(0.6))
    tf = txBox.text_frame
    p = tf.add_paragraph()
    p.text = f"{k}: {v}"
    p.font.bold = True
    p.font.size = Pt(20)

# --- Slide 4: Booking Channels ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Booking Channels"
slide.placeholders[1].text = "88.8% via Website\n→ Prioritize web UX improvements"
slide.shapes.add_picture(chart1, Inches(4), Inches(2), width=Inches(4))

# --- Slide 5: Trip Types ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Trip Type Distribution"
slide.placeholders[1].text = "99% RoundTrips\n→ Bundle offers for OneWay flights"
slide.shapes.add_picture(chart2, Inches(4), Inches(2), width=Inches(4))

# --- Slide 6: Success Factors ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Booking Success Drivers"
slide.placeholders[1].text = "Key Insight:\nShorter lead times → Higher success"
slide.shapes.add_picture(chart3, Inches(2), Inches(2), width=Inches(5))

# --- Slide 7: Recommendations ---
slide = prs.slides.add_slide(prs.slide_layouts[1])
slide.shapes.title.text = "Strategic Recommendations"
recs = [
    ("💻", "Enhance web platform (88.8% bookings)"),
    ("🛎️", "Create RoundTrip bundles with meals/seats"),
    ("⏱️", "Target last-minute bookers (avg. 85-day lead time)"),
    ("📊", "A/B test mobile app promotions")
]
for i, (icon, text) in enumerate(recs):
    txBox = slide.shapes.add_textbox(Inches(1), Inches(1.5 + i*0.8), Inches(7), Inches(0.6))
    tf = txBox.text_frame
    p = tf.add_paragraph()
    p.text = f"{icon}  {text}"
    p.font.size = Pt(18)

# --- Slide 8: Q&A ---
slide = prs.slides.add_slide(prs.slide_layouts[0])
slide.shapes.title.text = "Thank You"
slide.placeholders[1].text = "Q&A"

# ====== 4. SAVE & DOWNLOAD ======
ppt_path = "Customer_Booking_Analysis_Task2_Final.pptx"
prs.save(ppt_path)
files.download(ppt_path)

print("✅ PowerPoint generated successfully!")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ PowerPoint generated successfully!
