# Notebook 01: Example Data Analysis

This notebook demonstrates how to use the common libraries and work with datasets.

In [None]:
# Add the project root to the path to import common libraries
import sys
from pathlib import Path

# Take the directory two levels above the current working directory as the
# project root and give it top priority on the module search path.
# NOTE(review): this assumes the kernel's working directory is the notebook's
# own folder — confirm for your launch setup.
project_root = Path.cwd().parent.parent
sys.path.insert(0, str(project_root))

# Import common libraries
from common.lib import data_processing, visualization, utils

In [None]:
# Import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the project configuration through the shared `utils` helper and echo
# it so the settings in effect are visible in the notebook output.
# NOTE(review): a later cell reads it with `config.get(...)`, so it is
# presumably a dict-like mapping — confirm against utils.load_config.
config = utils.load_config()
print("Project configuration:", config)

In [None]:
# Resolve this notebook's dataset directory via the shared `utils` helper
dataset_path = utils.get_dataset_path('notebook_01')
print(f"Dataset path: {dataset_path}")

# List the entries of the dataset directory.
# - is_dir() rather than exists(): a path that points at a regular file is
#   reported as "not found" instead of producing an empty listing.
# - sorted(): directory entries come back in no guaranteed order, so sorting
#   keeps the printed listing identical across runs and machines.
if dataset_path.is_dir():
    files = sorted(dataset_path.glob('*'))
    print(f"Files in dataset: {files}")
else:
    print("Dataset directory not found. Please add your data files to the dataset folder.")

## Example: Creating Sample Data

Here's an example of creating and processing sample data using the common libraries.

In [None]:
# Seed NumPy's global RNG for reproducibility: use
# config['common_settings']['random_seed'] when present, otherwise 42
random_seed = config.get('common_settings', {}).get('random_seed', 42)
np.random.seed(random_seed)

# Synthetic frame of n_rows rows; the columns are drawn top to bottom, which
# fixes the order in which the seeded random stream is consumed
n_rows = 100
sample_data = pd.DataFrame({
    'feature_1': np.random.randn(n_rows),                     # standard normal
    'feature_2': 5 + 2 * np.random.randn(n_rows),             # normal scaled by 2, shifted by 5
    'feature_3': np.random.randint(1, 100, size=n_rows),      # integers in [1, 100)
    'target': np.random.choice(['A', 'B', 'C'], size=n_rows)  # random class label
})

# Inject missing values: blank out feature_1 in the first five rows
first_five = sample_data.index[:5]
sample_data.loc[first_five, 'feature_1'] = np.nan

print("Sample data shape:", sample_data.shape)
sample_data.head()

In [None]:
# Summarise the raw sample with the shared data_processing helper. This runs
# on `sample_data`, i.e. before the missing values are cleaned.
# The bare `stats` on the last line makes the notebook render the result.
# NOTE(review): the return type of get_basic_stats is not visible here — confirm.
stats = data_processing.get_basic_stats(sample_data)
print("Basic statistics:")
stats

In [None]:
# Handle missing values with the shared helper using the 'drop' strategy and
# store the result under a new name; the shape is printed to show the effect.
# NOTE(review): 'drop' presumably removes rows containing NaN — confirm
# against data_processing.clean_missing_values.
cleaned_data = data_processing.clean_missing_values(sample_data, strategy='drop')
print(f"Data shape after cleaning: {cleaned_data.shape}")

## Visualization Examples

In [None]:
# Distribution plot of the cleaned feature_1 column, drawn by the shared
# visualization helper
visualization.plot_distribution(cleaned_data['feature_1'], title='Distribution of Feature 1')

In [None]:
# Scatter plot of the two continuous features from the cleaned data, drawn by
# the shared visualization helper
feature_1_values = cleaned_data['feature_1']
feature_2_values = cleaned_data['feature_2']
visualization.plot_scatter(
    feature_1_values,
    feature_2_values,
    title='Feature 1 vs Feature 2',
    xlabel='Feature 1',
    ylabel='Feature 2',
)

In [None]:
# Keep only the numeric-dtype columns of the cleaned data, then hand them to
# the shared helper that draws the correlation matrix
numeric_data = cleaned_data.select_dtypes(include='number')
visualization.plot_correlation_matrix(numeric_data, title='Feature Correlation Matrix')

## Data Processing Examples

In [None]:
# Record feature_3's range BEFORE calling the helper. Whether normalize_column
# returns a copy or modifies `cleaned_data` in place is not visible from this
# notebook; capturing the values first keeps the "before" report correct
# either way.
feature_3_min_before = cleaned_data['feature_3'].min()
feature_3_max_before = cleaned_data['feature_3'].max()

# Normalize a column with the shared data_processing helper
normalized_data = data_processing.normalize_column(cleaned_data, 'feature_3')

# Report the range before and after so the effect of normalization is visible
print("Feature 3 before normalization:")
print(f"Min: {feature_3_min_before}, Max: {feature_3_max_before}")
print("\nFeature 3 after normalization:")
print(f"Min: {normalized_data['feature_3'].min()}, Max: {normalized_data['feature_3'].max()}")

## Save Results

You can save your processed data back to the dataset folder.

In [None]:
# Example: Save processed data (uncomment the three lines below to write the CSV)
# output_path = dataset_path / 'processed_data.csv'
# normalized_data.to_csv(output_path, index=False)
# print(f"Processed data saved to: {output_path}")