# Assignment 1:
### The automated_stat_analyzer Function
- Scenario: A retail company needs a utility to quickly summarize sales data. Students must create a function that identifies the 
"Central Tendency" and "Dispersion" of any numerical column.
### Requirements:

* Accept a Pandas DataFrame and a column name.

* Calculate the Mean, Median, and Standard Deviation.

* Identify if the data is "Skewed" by comparing the Mean and Median.


* Bonus: If the column is categorical, return the Mode instead.

### Your Data

In [16]:
import pandas as pd
import numpy as np

def automated_stat_analyzer(df, column_name: str) -> dict:
    """
    Summarize a single column of a Pandas DataFrame.

    Numeric columns are described by their mean, median and sample standard
    deviation (NaN values are ignored), plus a skewness label obtained by
    comparing the mean with the median. The comparison uses a small
    tolerance so that floating-point rounding noise (e.g. the mean of
    ``[0.1, 0.2, 0.3]`` being ``0.20000000000000004``) is not reported as
    skew.

    Non-numeric columns are treated as categorical and summarized by their
    mode (the first one when several values tie).

    :param df: The input Pandas DataFrame.
    :type df: pandas.DataFrame
    :param column_name: The name of the column to analyze.
    :type column_name: str
    :return: For numeric columns, a dictionary with the keys ``"type"``,
        ``"mean"``, ``"median"``, ``"std"`` and ``"skewness"`` (one of
        ``"right_skewed"``, ``"left_skewed"`` or ``"symmetric"``). For
        categorical columns, a dictionary with the keys ``"type"`` and
        ``"mode"`` (``None`` when the column has no non-null values).
    :rtype: dict
    :raises ValueError: If the column does not exist, or if a numeric column
        contains only NaN values.
    """
    import math  # local import keeps this function self-contained in its cell

    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist.")

    column = df[column_name]

    # Anything that is not numeric is summarized by its most frequent value.
    if not pd.api.types.is_numeric_dtype(column):
        modes = column.mode()
        return {
            "type": "categorical",
            "mode": modes.iloc[0] if not modes.empty else None,
        }

    column_data = column.dropna()  # statistics are computed on observed values only
    if column_data.empty:
        raise ValueError("Column contains no valid numeric data.")

    mean = column_data.mean()
    median = column_data.median()
    std = column_data.std()

    # Exact float equality would label symmetric data as skewed whenever the
    # mean carries rounding error, so "equal" is checked with a tolerance first.
    if math.isclose(mean, median, rel_tol=1e-9, abs_tol=1e-12):
        skewness = "symmetric"
    elif mean > median:
        skewness = "right_skewed"  # mean pulled above the median by large values
    elif mean < median:
        skewness = "left_skewed"  # mean pulled below the median by small values
    else:
        skewness = "symmetric"  # incomparable values (e.g. NaN) fall back as before

    return {
        "type": "numeric",
        "mean": mean,
        "median": median,
        "std": std,
        "skewness": skewness,
    }


# Sample retail transactions: ten rows, one unusually large sale (1000)
# and two missing customer ages.
data = dict(
    Transaction_ID=range(1, 11),
    Product_Category=[
        "Electronics", "Home", "Electronics", "Sports", "Home",
        "Electronics", "Home", "Sports", "Electronics", "Electronics",
    ],
    Sales_Amount=[150, 200, 155, 300, 210, 180, 205, 1000, 190, 160],
    Customer_Age=[25, 34, np.nan, 45, 23, 31, 29, np.nan, 38, 40],
    Rating=[5, 4, 3, 5, 2, 4, 5, 2, 4, 3],
)

df_test = pd.DataFrame(data)

# Show the summary of a numeric column, a blank gap, then a categorical column.
print(automated_stat_analyzer(df_test, column_name="Sales_Amount"))
print("\n")
print(automated_stat_analyzer(df_test, column_name="Product_Category"))


{'type': 'numeric', 'mean': np.float64(275.0), 'median': np.float64(195.0), 'std': np.float64(258.30645021412494), 'skewness': 'right_skewed'}


{'type': 'categorical', 'mode': 'Electronics'}


In [6]:
# Preview the first five rows of the example dataset.
df_test.head(n=5)

Unnamed: 0,Transaction_ID,Product_Category,Sales_Amount,Customer_Age,Rating
0,1,Electronics,150,25.0,5
1,2,Home,200,34.0,4
2,3,Electronics,155,,3
3,4,Sports,300,45.0,5
4,5,Home,210,23.0,2


In [None]:
import pandas as pd

def automated_stat_analyzer(df, column_name: str):
    """
    Summarizes a specified column in a Pandas DataFrame.

    For numerical columns, it calculates the mean, median, and standard deviation
    to describe central tendency and dispersion. It also determines whether the
    data is skewed by comparing the mean and median.

    For categorical columns, it returns the mode (most frequent value).

    :param df: The input Pandas DataFrame.
    :type df: pandas.DataFrame
    :param column_name: The name of the column to analyze.
    :type column_name: str
    :return: A dictionary containing the calculated statistics or mode.
    :rtype: dict
    """
    
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist.")  # Check if column exists in DataFrame

    column = df[column_name]  # Select the specified column
    result = {}  # Initialize dictionary to store results

    if pd.api.types.is_numeric_dtype(column):  # Check if the column is numeric
        column_data = column.dropna()  # Remove NaN values before calculations

        if column_data.empty:
            raise ValueError("Column contains no valid numeric data.") 

        mean = column_data.mean()  
        median = column_data.median() 
        std = column_data.std() 
        result.update({
            "type": "numeric",
            "mean": mean,  
            "median": median,  
            "std": std  
        })
    else:
        mode_value = column.mode()  # Calculate mode for categorical columns
        result.update({
            "type": "categorical",  
            "mode": mode_value.iloc[0] if not mode_value.empty else None  
        })

    return result  


# Example usage: summarize one numeric and one categorical column of df_test.
result_numeric = automated_stat_analyzer(df_test, column_name="Sales_Amount")
result_categorical = automated_stat_analyzer(df_test, column_name="Product_Category")

print(result_numeric)
print(result_categorical)


{'type': 'numeric', 'mean': np.float64(275.0), 'median': np.float64(195.0), 'std': np.float64(258.30645021412494)}
{'type': 'categorical', 'mode': 'Electronics'}


## Assignment 2: 
  ### The null_handling_strategy Function


#### Scenario: Incoming user data often has missing values. Students must implement a flexible strategy to handle these "Null Values" to prepare data for Machine Learning.
### Requirements:

* Check for null values in the DataFrame.

* Apply a strategy based on parameters: "drop_rows", "fill_mean", or "fill_median".

* Ensure the function only fills numerical columns when using mean or median.

In [15]:
import pandas as pd
import numpy as np

def null_handling_strategy(df, strategy="fill_mean"):
    """
    Clean a DataFrame by resolving missing (NaN) values.

    The input is never modified; a copy is always returned. With
    ``"drop_rows"`` every row containing at least one null is removed. With
    ``"fill_mean"`` or ``"fill_median"`` only numeric columns are filled
    (each with its own mean or median); nulls in non-numeric columns are
    left untouched.

    :param df: Input Pandas DataFrame
    :param strategy: Strategy to handle nulls: "drop_rows", "fill_mean", or "fill_median"
    :return: Cleaned DataFrame
    :raises ValueError: If ``strategy`` is not one of the supported values.
    """
    # Validate up front so a misspelled strategy is reported even when the
    # frame happens to contain no nulls, instead of passing silently.
    if strategy not in ("drop_rows", "fill_mean", "fill_median"):
        raise ValueError("Invalid strategy! Use 'drop_rows', 'fill_mean', or 'fill_median'.")

    df_cleaned = df.copy()  # work on a copy so the caller's DataFrame is untouched

    # Nothing to clean: hand back the copy as-is.
    if not df_cleaned.isnull().values.any():
        return df_cleaned

    if strategy == "drop_rows":
        return df_cleaned.dropna()

    num_cols = df_cleaned.select_dtypes(include="number").columns
    if len(num_cols) == 0:
        return df_cleaned  # no numeric columns, so there is nothing to fill

    numeric_part = df_cleaned[num_cols]
    if strategy == "fill_mean":
        fill_values = numeric_part.mean()
    else:
        fill_values = numeric_part.median()

    # fillna with a Series fills each column with the value labelled by its name.
    df_cleaned[num_cols] = numeric_part.fillna(fill_values)
    return df_cleaned


# Small example frame with one missing value in each column.
data = dict(
    Customer_Age=[25, 34, np.nan, 45, 23],
    Sales_Amount=[150, 200, 155, np.nan, 210],
    Product_Category=["Electronics", "Home", "Electronics", "Sports", None],
)
df = pd.DataFrame(data)

# Fill the numeric gaps with each column's mean and show the result.
cleaned_df = null_handling_strategy(df, "fill_mean")
print(cleaned_df)


   Customer_Age  Sales_Amount Product_Category
0         25.00        150.00      Electronics
1         34.00        200.00             Home
2         31.75        155.00      Electronics
3         45.00        178.75           Sports
4         23.00        210.00              NaN
