# Data Analysis Example

This notebook demonstrates VS Code's Jupyter integration features using our DataProcessor class.

In [None]:
# Third-party libraries used throughout the notebook
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# Import our DataProcessor class
# The parent directory is appended to the module search path so the `src`
# package can be resolved. NOTE(review): '..' is a relative path, so this
# presumably assumes the kernel's working directory is the notebook's own
# folder — confirm.
import sys
sys.path.append('..')
from src.data_processor import DataProcessor

## Creating Sample Data

Let's create some sample data to work with.

In [None]:
# Build a 100-row demo frame from seeded random draws
np.random.seed(42)  # fixed seed so the random columns repeat from run to run

# The random columns are drawn in the same order as they appear in the frame
# (value first, then category), so the seeded stream yields the same numbers.
row_ids = range(1, 101)
values = np.random.normal(100, 15, 100)
categories = np.random.choice(['A', 'B', 'C'], 100)
dates = pd.date_range(start='2024-01-01', periods=100)

sample_data = pd.DataFrame(
    {'id': row_ids, 'value': values, 'category': categories, 'date': dates}
)

# Mark 'value' as missing in 10 randomly sampled rows so the cleaning step
# later in the notebook has something to remove
rows_to_blank = sample_data.sample(10).index
sample_data.loc[rows_to_blank, 'value'] = None

# Preview the first rows as the cell output
sample_data.head()

## Initialize DataProcessor

Create an instance of our DataProcessor class with the sample data.

In [None]:
# Wrap the sample frame in the project's DataProcessor
processor = DataProcessor(sample_data)

# Ask the processor for its summary and print one "name: value" line per entry
summary = processor.generate_summary()
print("Data Summary:")
for field, field_value in summary.items():
    print(f"{field}: {field_value}")

## Data Cleaning

Clean the data by removing null values and duplicates.

In [None]:
# Per-column null counts before the cleaning step
print("Missing values before cleaning:")
nulls_before = processor.data.isnull().sum()
print(nulls_before)

# Run the processor's cleaning step. The counts below are re-read from
# processor.data, so this presumably updates that frame in place — confirm
# against DataProcessor.clean_data.
processor.clean_data()

# Per-column null counts after the cleaning step
print("\nMissing values after cleaning:")
nulls_after = processor.data.isnull().sum()
print(nulls_after)

## Statistical Analysis

Calculate summary statistics for the 'value' column and visualize its distribution with a histogram.

In [None]:
# Compute the statistics for 'value' and print each one to two decimals
stats = processor.calculate_statistics('value')
print("Statistics for 'value' column:")
for stat_name, stat_value in stats.items():
    print(f"{stat_name}: {stat_value:.2f}")

# Histogram of the 'value' column, drawn through an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(processor.data['value'], bins=20, edgecolor='black')
ax.set_title('Distribution of Values')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

## Data Filtering

Demonstrate the processor's filtering capabilities by applying several conditions at once.

In [None]:
# Keep category 'A' rows whose value exceeds the mean computed earlier.
# Each condition is written as a (column, operator, operand) tuple; how
# filter_data combines them is defined by DataProcessor — presumably a
# logical AND, confirm there.
mean_value = stats['mean']
conditions = [
    ('value', 'greater_than', mean_value),
    ('category', 'equals', 'A'),
]

filtered_data = processor.filter_data(conditions)
match_count = len(filtered_data)
print(f"Found {match_count} records matching the conditions")

# Preview the first matching rows as the cell output
filtered_data.head()

## Interactive Features

This section creates a few variables of different types to demonstrate VS Code's variable explorer and data viewer capabilities.

In [None]:
# One small object of each kind (NumPy array, dict, pandas Series) to inspect
# in the variable explorer
array_example = np.arange(1, 6)
dict_example = dict(a=1, b=2, c=[3, 4, 5])
series_example = pd.Series(range(1, 6), index=list('abcde'))

# Point the reader at the Variables window, where the objects above should appear
print("Check the Variables window to explore these objects!")