# Example Exploratory Data Analysis

This notebook demonstrates a basic exploratory data analysis (EDA) workflow in Sweven Labs.

## Objectives
1. Load and inspect data
2. Perform statistical analysis
3. Create visualizations
4. Document findings

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults: seaborn's whitegrid look and a 12x6 default figure size
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

# Emitted once the import and styling statements above have run
print("Libraries imported successfully!")

## 1. Load Data

In [None]:
# Generate synthetic data for demonstration.
# Seeding NumPy's global generator makes the draws below repeat on every run of this cell.
np.random.seed(42)
n_samples = 1000

# np.random.randint excludes its upper bound, so age_max + 1 is passed below
# to make the oldest age (80) actually reachable.
age_min, age_max = 18, 80

df = pd.DataFrame({
    'age': np.random.randint(age_min, age_max + 1, n_samples),
    'income': np.random.normal(50000, 20000, n_samples),  # mean 50,000, std 20,000
    'score': np.random.uniform(0, 100, n_samples),
    'category': np.random.choice(['A', 'B', 'C'], n_samples)
})

# Report the shape, then preview the first rows as the cell's output
print(f"Dataset shape: {df.shape}")
df.head()

## 2. Basic Statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_stats = df.describe()
summary_stats

In [None]:
# Count the null entries in each column and print them under a heading
missing_per_column = df.isna().sum()
print("Missing values:", missing_per_column, sep="\n")

## 3. Visualizations

In [None]:
# Histograms of the three numeric columns, one panel each
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (column, display label, bar color); None keeps matplotlib's default color
panels = [
    ('age', 'Age', None),
    ('income', 'Income', 'green'),
    ('score', 'Score', 'orange'),
]

for ax, (column, label, bar_color) in zip(axes, panels):
    ax.hist(df[column], bins=30, edgecolor='black', color=bar_color)
    ax.set_title(f'{label} Distribution')
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
numeric_columns = ['age', 'income', 'score']
corr_matrix = df[numeric_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
# center=0 anchors the diverging colormap at zero correlation
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Bar chart of how many rows fall into each category
category_counts = df['category'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(category_counts.index, category_counts.values)
ax.set(title='Category Distribution', xlabel='Category', ylabel='Count')
plt.show()

## 4. Key Findings

### Summary
- Dataset contains 1000 samples with 4 features
- No missing values detected
- Age ranges from 18 to 80 with uniform distribution
- Income follows a normal distribution around $50,000
- Score is uniformly distributed between 0 and 100
- Categories A, B, and C are approximately evenly distributed

### Next Steps
1. Feature engineering based on correlations
2. Data preprocessing and normalization
3. Model selection and training
4. Evaluation and iteration