## Pandas Testing using pytest

[Pandas Testing Tutorial Link](https://machinelearningtutorials.org/pandas-testing-tutorial-with-examples/)  
[Pytest Advanced Tutorial](https://www.softwaretestinghelp.com/pytest-tutorial/)

1. Testing DataFrames and Series
    - Checking Data Types
    - Comparing DataFrames
    - Handling Missing Values

2. Testing Data Transformation Operations
    - Filtering and Sorting
    - Aggregation and Grouping
    - Merging

In [70]:
import pandas as pd

In [71]:
# Shared sample frame: one row per stock entry, products may repeat.
df = pd.DataFrame.from_dict(
    {
        'product': ['A', 'B', 'C', 'A', 'B'],
        'quantity': [12, 15, 8, 5, 20],
    },
    orient='columns',
)

Checking Data Types

In [72]:
def test_dataframe_dtypes():
    """Check that ``df`` holds string products and integer quantities.

    The checks go through ``pd.api.types`` predicates instead of comparing
    the dtype with the builtins ``object`` / ``int``:

    * ``dtype == int`` is only true when the column width equals the
      platform's default NumPy integer, so an ``int64`` column fails the
      comparison wherever that default is 32-bit (e.g. NumPy releases
      before 2.0 on Windows).
    * ``dtype == object`` fails when pandas stores text in a dedicated
      string dtype rather than ``object``.

    Raises:
        AssertionError: If either column has an unexpected dtype.
    """
    assert pd.api.types.is_string_dtype(df['product']), \
        'product col must be string type'
    assert pd.api.types.is_integer_dtype(df['quantity']), \
        'quantity col must be int type'

In [73]:
# Run the check right away; the call returns silently when every assertion holds.
test_dataframe_dtypes()

Comparing DataFrames

In [74]:
def test_dataframe_transformation():
    """Verify that doubling ``col1`` produces the expected DataFrame.

    A small frame is built locally, ``col1`` is multiplied by 2 on a copy,
    and the result is compared with a hand-written expected frame.

    Raises:
        AssertionError: If the transformed frame differs from the expected
            one; the message describes the first mismatch found.
    """
    source_df = pd.DataFrame({
        'col1': [1, 2, 3],
        'col2': ['A', 'B', 'C']
    })

    # Work on a copy so the source frame stays untouched.
    transformed_df = source_df.copy()
    transformed_df['col1'] = source_df['col1'] * 2

    expected_df = pd.DataFrame({
        'col1': [2, 4, 6],
        'col2': ['A', 'B', 'C']
    })

    # assert_frame_equal reports what differs (values, dtypes, index or
    # columns) instead of a bare True/False; check_exact=True keeps the
    # comparison as strict as DataFrame.equals for numeric data.
    pd.testing.assert_frame_equal(transformed_df, expected_df, check_exact=True)

In [75]:
# Run the check right away; a failed comparison raises AssertionError.
test_dataframe_transformation()

Handling Missing Values

In [76]:
def test_missing_values():
    """Assert that neither column of the module-level ``df`` contains nulls."""
    # product: the count of null flags must be zero.
    product_null_flags = df['product'].isnull()
    assert product_null_flags.sum() == 0

    # quantity: every entry must be flagged as present.
    quantity_present_flags = df['quantity'].notnull()
    assert quantity_present_flags.all()

In [77]:
# Run the check right away; the call returns silently when no nulls are found.
test_missing_values()

### Testing Data Transformation Operations

In [78]:
# Product/quantity sample used by the transformation tests that follow.
df = pd.DataFrame.from_dict(
    {
        'product': ['A', 'B', 'C', 'A', 'B'],
        'quantity': [12, 15, 8, 5, 20],
    },
    orient='columns',
)

Filtering and Sorting

In [79]:
def test_filtering():
    """Keep rows of ``df`` with quantity >= 10 and expect three of them."""
    # Boolean mask first, then row selection, so each step is visible.
    at_least_ten = df['quantity'] >= 10
    kept_rows = df.loc[at_least_ten]

    assert kept_rows.shape[0] == 3
    
    
def test_sorting():
    """Sort ``df`` by quantity, largest first, and check the resulting order."""
    by_quantity_desc = df.sort_values('quantity', ascending=False)
    quantities = by_quantity_desc['quantity']

    # A descending sort must leave the column monotonically non-increasing.
    assert quantities.is_monotonic_decreasing
    
       
# Run both checks right away; each call returns silently when its assertion holds.
test_filtering()
test_sorting()

Aggregation and Grouping

In [80]:
def test_aggregation():
    """Sum the quantity column of ``df`` and compare it with the known total."""
    quantities = df['quantity']

    # 12 + 15 + 8 + 5 + 20 in the sample data.
    assert quantities.sum() == 60
    
def test_grouping():
    """Group ``df`` by product and check the mean quantity of each group."""
    mean_by_product = df.groupby('product')['quantity'].mean()

    # Expected per-product means, checked in the order A, B, C.
    expected_means = {'A': 8.5, 'B': 17.5, 'C': 8}
    for product, expected_mean in expected_means.items():
        assert mean_by_product[product] == expected_mean
    
# Run both checks right away; each call returns silently when its assertions hold.
test_aggregation()
test_grouping()

Merging

In [81]:
# Customers: one row per customer id.
customers = pd.DataFrame.from_dict(
    {
        'customer_id': [1, 2, 3],
        'name': ['Alice', 'Bob', 'Charlie'],
    },
    orient='columns',
)

# Orders: customer 1 appears twice, so ids repeat on this side.
orders = pd.DataFrame.from_dict(
    {
        'customer_id': [1, 2, 1, 3],
        'order_amount': [100, 150, 80, 200],
    },
    orient='columns',
)

def test_merging():
    """Join ``customers`` with ``orders`` on customer_id and check the result."""
    merged_data = customers.merge(orders, on='customer_id')

    # One output row is expected per order.
    row_count = merged_data.shape[0]
    assert row_count == 4

    # The merge must neither drop nor duplicate any order amount.
    amount_total = merged_data['order_amount'].sum()
    assert amount_total == 530
    

# Run the check right away; the call returns silently when both assertions hold.
test_merging()

### Parametrization

Parametrization combines multiple test cases into a single test function. With `pytest.mark.parametrize`, the same test runs once for each set of arguments, so a function or class can be checked against many different inputs without duplicating the test code.

In [89]:
import pytest
import pandas as pd


# Three independent product/quantity frames used as parametrized inputs.
df1 = pd.DataFrame.from_dict(
    {
        'product': ['Apple', 'Banana', 'Orange', 'Apple', 'Banana'],
        'quantity': [25, 18, 12, 30, 22],
    },
    orient='columns',
)

df2 = pd.DataFrame.from_dict(
    {
        'product': ['Book', 'Pen', 'Notebook', 'Book', 'Pen'],
        'quantity': [100, 75, 50, 120, 90],
    },
    orient='columns',
)

df3 = pd.DataFrame.from_dict(
    {
        'product': ['Shirt', 'Pants', 'Jacket', 'Shirt', 'Pants'],
        'quantity': [35, 42, 28, 40, 38],
    },
    orient='columns',
)

def get_quantity_sum(df, column='quantity'):
    """Return the sum of one column of ``df``.

    Args:
        df: DataFrame that contains ``column``.
        column: Label of the column to total. Defaults to ``'quantity'``,
            so existing single-argument calls behave as before.

    Returns:
        The result of ``Series.sum()`` on the selected column.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    return df[column].sum()

@pytest.mark.parametrize(
    "test_df, expected_quantity_sum",
    [
        (df1, 107),
        (df2, 435),
        (df3, 183),
    ],
)
def test_quantity_sum(test_df, expected_quantity_sum):
    """Check ``get_quantity_sum`` against the known total of each sample frame."""
    assert get_quantity_sum(test_df) == expected_quantity_sum