# In [None]:  (Jupyter cell marker, kept as a comment so the file parses as Python)
# Step 1: Set Up the Environment
# Objective: Ensure you have the right tools and packages installed.

# 1. Install the required libraries.
# 2. Verify the installation by importing the libraries in a Python script or Jupyter notebook





# Step 2: Load & Explore the Dataset
# Objective: Load data into a pandas DataFrame and obtain a basic understanding of its structure.

# 3. Load a CSV file into a DataFrame.
# 4. Display the first few records to understand the structure.
# 5. Get a summary of the dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Reaching this line means none of the imports above raised ImportError.
print("All libraries imported successfully!")

# Step 2: Load & Explore the Dataset
# 3. Load a CSV file into a DataFrame
# seaborn's `tips` example dataset stands in for a local CSV file here.
# NOTE: `sns.load_dataset` is documented to fetch the data from seaborn's
# online example-data repository (caching it locally), so the first run
# needs network access.
df = sns.load_dataset('tips')
print("\nDataset loaded successfully!")

# 4. Display the first few records
print("\nFirst 5 records:")
print(df.head())

# 5. Get a summary of the dataset
print("\nDataset summary:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()
print("\nDescriptive statistics:")
# describe() summarises the numeric columns (count, mean, std, quartiles).
print(df.describe())

# Step 3: Perform NumPy Operations
# 6. Pull the `total_bill` column out as a NumPy array and reduce it
total_bill_array = df['total_bill'].to_numpy()
print("\nNumPy array operations:")
print(f"Mean: {total_bill_array.mean():.2f}")
print(f"Sum: {total_bill_array.sum():.2f}")

# 7. Spread of ten random draws (var/std use NumPy's default ddof=0)
random_array = np.random.rand(10)
print("\nRandom array stats:")
print(f"Variance: {random_array.var():.4f}")
print(f"Standard Deviation: {random_array.std():.4f}")

# 8. Boolean-mask filtering: keep only the bills strictly above 30
large_bills = total_bill_array[np.greater(total_bill_array, 30)]
print(f"\nNumber of bills > $30: {large_bills.size}")

# Step 4: Data Manipulation with Pandas
# 9. Handle missing data (though our sample has none)
# Report the number of null entries per column.
print("\nMissing values before cleaning:")
print(df.isnull().sum())

# 10. Create new columns
# Tip expressed as a percentage of the total bill, computed row-wise.
df['tip_percentage'] = (df['tip'] / df['total_bill']) * 100
print("\nDataFrame with new tip_percentage column:")
print(df.head())

# 11. Use groupby to aggregate data
# `day` is grouped with `observed` passed explicitly: for categorical keys the
# default of this parameter is changing across pandas versions (and omitting
# it raises a FutureWarning on pandas >= 2.1). observed=False keeps the
# original behaviour of listing every category.
print("\nAverage tip percentage by day:")
print(df.groupby('day', observed=False)['tip_percentage'].mean())

# Step 5: Data Visualization
# All three charts share one 10x6-inch figure laid out on a 2x2 grid;
# only the first three cells are used.
fig = plt.figure(figsize=(10, 6))

# 12. Matplotlib basic plot: tip against total bill
scatter_ax = fig.add_subplot(2, 2, 1)
scatter_ax.scatter(df['total_bill'], df['tip'])
scatter_ax.set_title('Total Bill vs Tip')
scatter_ax.set_xlabel('Total Bill')
scatter_ax.set_ylabel('Tip')

# 13. Seaborn histogram with a KDE curve overlaid
hist_ax = fig.add_subplot(2, 2, 2)
sns.histplot(df['total_bill'], kde=True, ax=hist_ax)
hist_ax.set_title('Total Bill Distribution')

# 14. Box plot of total bill per day
box_ax = fig.add_subplot(2, 2, 3)
sns.boxplot(x='day', y='total_bill', data=df, ax=box_ax)
box_ax.set_title('Total Bill by Day')

# Adjust spacing so titles and axis labels do not overlap, then render.
fig.tight_layout()
plt.show()
print("\nAll steps completed successfully!")






# Step 3: Perform NumPy Operations
# Objective: Utilize NumPy for basic numerical operations and array manipulations.

# 6. Convert a DataFrame column to a NumPy array and perform array operations like mean and sum.
# 7. Create a NumPy array and calculate the variance and standard deviation.
# 8. Use NumPy to filter based on conditions.








# Step 4: Data Manipulation with Pandas
# Objective: Use Pandas to clean and manipulate dataset for analysis.

# 9. Handle missing data by filling or dropping.
# 10. Create new columns or modify existing ones.
# 11. Use groupby to aggregate data.







# Step 5: Data Visualization with Matplotlib & Seaborn
# Objective: Visualize the data to identify patterns, trends, and insights.

# 12. Use Matplotlib to create a basic plot.
# 13. Create a histogram using Seaborn.
# 14. Plot a box plot for a clear view of data distribution.





