In [1]:
# Import Libraries
import logging
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from functions import BasicDataDHandelFunctions

# Display settings
# max_columns=None removes the column limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)
plt.style.use('ggplot')

# Configure logging
# INFO level so the logging.info progress messages emitted by the helper
# functions in the cells below are visible.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# For handling warnings
# NOTE(review): this blanket filter hides every warning for the rest of the
# session, including pandas deprecation notices — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# Seeds NumPy's global random state only.
SEED = 42
np.random.seed(SEED)


In [None]:
# Relative path to the input CSV.
file_path = '../data/sample.csv'
# NOTE(review): the cells below all guard on `df is not None`, which suggests
# load_dataset returns None when loading fails — confirm in the functions module.
df = BasicDataDHandelFunctions.load_dataset(file_path=file_path)

In [None]:
# Basic Dataset Information
def data_overview(df):
    """Print a structural summary and descriptive statistics for a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        ``df.info()`` prints its own report; the ``describe()`` table is
        rendered through ``display``.
    """
    print("Dataset Info:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in display() only added a stray "None" to the cell output.
    df.info()
    print("\nBasic Statistics:")
    display(df.describe())

# Run the overview only when df is bound to an actual object (not None).
if df is not None:
    data_overview(df)


In [None]:
# Missing Values Analysis
def analyze_missing_values(df):
    """Report per-column null counts and draw a heatmap of where nulls occur.

    Only columns that contain at least one missing value are listed; the
    heatmap is drawn from the full boolean null mask of ``df``.
    """
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]
    print("Missing Values per Column:")
    display(columns_with_nulls)

    # Heatmap of the boolean null mask: one cell per (row, column).
    plt.figure(figsize=(10, 6))
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
    plt.title("Missing Values Heatmap")
    plt.show()

# Run the missing-value report only when df is not None.
if df is not None:
    analyze_missing_values(df)


In [None]:
# Data Type and Unique Value Analysis
def analyze_data_types(df):
    """Show every column's dtype, then the distinct values of each object-dtype column."""
    print("Data Types:")
    display(df.dtypes)

    # Object-dtype columns are the ones treated as categorical here.
    object_frame = df.select_dtypes(include=['object'])
    for name in object_frame.columns:
        print(f"\nUnique values in '{name}':")
        display(df[name].unique())

# Run the dtype / unique-value report only when df is not None.
if df is not None:
    analyze_data_types(df)


In [None]:
# Data Distribution Visualization
def plot_data_distributions(df):
    """Plot one 30-bin histogram per numeric column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are plotted.

    Returns
    -------
    None
        When ``df`` has no numeric columns a warning is logged and nothing
        is plotted.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    # DataFrame.hist raises when there is nothing numeric to plot, so bail
    # out early instead of failing the cell.
    if len(numeric_cols) == 0:
        logging.warning("No numeric columns found; skipping distribution plots.")
        return
    df[numeric_cols].hist(figsize=(15, 10), bins=30)
    plt.tight_layout()
    plt.show()

# Plot the histograms only when df is not None.
if df is not None:
    plot_data_distributions(df)


In [None]:
# Correlation Matrix Visualization
def plot_correlation_matrix(df):
    """Plot an annotated heatmap of the correlations between numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse; non-numeric columns are ignored.

    Returns
    -------
    None
        When ``df`` has no numeric columns a warning is logged and nothing
        is plotted.
    """
    # Restrict to numeric columns explicitly: calling corr() on a frame that
    # still holds string columns raises on pandas >= 2.0.
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] == 0:
        logging.warning("No numeric columns found; skipping correlation matrix.")
        return

    plt.figure(figsize=(12, 10))
    sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Correlation Matrix")
    plt.show()

# Plot the correlation matrix only when df is not None.
if df is not None:
    plot_correlation_matrix(df)


In [None]:
# Handling Missing Values
def handle_missing_values(df, strategy='drop', fill_value=None):
    """
    Handles missing values based on the selected strategy.

    Options:
    - 'drop': drops every row that contains a missing value.
    - 'median': fills numeric columns with their column median; non-numeric
      columns are left as they are.
    - 'mode': fills each column with its most frequent value.
    - 'fill': fills every missing value with ``fill_value`` (must not be None).

    Returns a new DataFrame for the strategies above. For any other strategy
    (or 'fill' without a ``fill_value``) a warning is logged and the original
    ``df`` object is returned unchanged.
    """
    if strategy == 'drop':
        df_cleaned = df.dropna()
        logging.info("Dropped rows with missing values.")
    elif strategy == 'median':
        # numeric_only=True: a median over string columns raises on
        # pandas >= 2.0, and only numeric columns have a median anyway.
        df_cleaned = df.fillna(df.median(numeric_only=True))
        logging.info("Filled missing values with the median.")
    elif strategy == 'mode':
        modes = df.mode()
        # mode() has no rows when df has none; there is nothing to fill then.
        df_cleaned = df.fillna(modes.iloc[0]) if len(modes) else df.copy()
        logging.info("Filled missing values with the mode.")
    elif strategy == 'fill' and fill_value is not None:
        df_cleaned = df.fillna(fill_value)
        logging.info(f"Filled missing values with '{fill_value}'.")
    else:
        logging.warning("Invalid strategy selected. Returning original DataFrame.")
        return df

    return df_cleaned

# Median-impute into a separately named frame; df itself is not rebound here.
# NOTE(review): df_cleaned is only assigned inside this guard, while the next
# cell reads it unconditionally.
if df is not None:
    df_cleaned = handle_missing_values(df, strategy='median')
    display(df_cleaned.head())


In [None]:
# Encoding Categorical Variables
from sklearn.preprocessing import LabelEncoder

def encode_categorical(df):
    """Encodes categorical variables using Label Encoding.

    Every object-dtype column is replaced by the integer codes produced by
    ``LabelEncoder.fit_transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the object-dtype columns encoded.
    """
    # Work on a copy: encoding in place would silently rewrite the frame the
    # caller still holds under its pre-encoding name.
    df_encoded = df.copy()
    categorical_cols = df_encoded.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        # A fresh encoder per column keeps the fitted state of one column
        # from leaking into the next.
        df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])
    logging.info("Categorical columns encoded successfully.")
    return df_encoded

# Encode the imputed frame from the previous cell and preview the first rows.
if df_cleaned is not None:
    df_encoded = encode_categorical(df_cleaned)
    display(df_encoded.head())


In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

def scale_features(df):
    """Scales numeric features using StandardScaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose numeric columns were passed through
        ``StandardScaler.fit_transform``. If there are no numeric columns
        the copy is returned unscaled and a warning is logged.
    """
    # Work on a copy so the caller's pre-scaling frame keeps its values.
    df_scaled = df.copy()
    numeric_cols = df_scaled.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        # Nothing to fit the scaler on; skip rather than fail.
        logging.warning("No numeric columns found; returning an unscaled copy.")
        return df_scaled

    scaler = StandardScaler()
    df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])
    logging.info("Numeric features scaled successfully.")
    return df_scaled

# Scale the encoded frame from the previous cell and preview the first rows.
if df_encoded is not None:
    df_scaled = scale_features(df_encoded)
    display(df_scaled.head())


In [None]:
# Remove Duplicates
def remove_duplicates(df):
    """Return ``df`` without repeated rows and log how many rows were dropped."""
    deduplicated = df.drop_duplicates()
    rows_removed = df.shape[0] - deduplicated.shape[0]
    logging.info(f"Removed {rows_removed} duplicate rows.")
    return deduplicated

# Rebinds df itself to the de-duplicated frame.
# NOTE(review): this starts again from df, not from df_scaled produced above —
# confirm the two pipelines are meant to be independent.
if df is not None:
    df = remove_duplicates(df)
    display(df.head())


In [None]:
# String Cleaning Function
def clean_strings(df, columns):
    """
    Cleans strings in specified columns by:
    - Stripping whitespace
    - Converting to lowercase
    - Removing special characters

    Non-missing values are converted with ``str()`` before cleaning; missing
    values are left missing. Columns named in ``columns`` that do not exist
    in ``df`` are skipped with a logged warning.

    The frame is modified in place and also returned.
    """
    def _clean(value):
        # Strip and lowercase first, then drop everything that is neither a
        # word character nor whitespace.
        return re.sub(r'[^\w\s]', '', str(value).strip().lower())

    for col in columns:
        if col in df.columns:
            # na_action='ignore' keeps NaN/None as missing; casting the whole
            # column to str first would turn them into the text 'nan'/'None'
            # and hide them from later missing-value handling.
            df[col] = df[col].map(_clean, na_action='ignore')
            logging.info(f"Cleaned strings in column '{col}'.")
        else:
            logging.warning(f"Column '{col}' not found in the DataFrame.")
    return df

# Specify the columns to clean
string_columns = ['name', 'category']  # Replace with your actual columns

# Names listed above that are not columns of df are only logged as a warning
# by clean_strings; they do not stop the cell.
if df is not None:
    df = clean_strings(df, string_columns)
    display(df.head())


In [None]:
# Handling Missing Values
def handle_missing_values(df, strategy='drop', fill_value=None):
    """
    Handles missing values in the dataset.

    NOTE: this redefines ``handle_missing_values`` from an earlier cell; once
    this cell has run, this is the version in effect.

    Options:
    - 'drop': Drops rows with missing values.
    - 'fill': Fills missing values with ``fill_value`` (must not be None).
    - 'median': Fills numeric columns with the median; non-numeric columns
      are left as they are.
    - 'mode': Fills each column with its most frequent value.

    Returns a new DataFrame for the strategies above. For any other strategy
    (or 'fill' without a ``fill_value``) a warning is logged and the original
    ``df`` object is returned unchanged.
    """
    if strategy == 'drop':
        df_cleaned = df.dropna()
        logging.info("Dropped rows with missing values.")
    elif strategy == 'fill' and fill_value is not None:
        df_cleaned = df.fillna(fill_value)
        logging.info(f"Filled missing values with '{fill_value}'.")
    elif strategy == 'median':
        # numeric_only=True: a median over string columns raises on
        # pandas >= 2.0, and only numeric columns have a median anyway.
        df_cleaned = df.fillna(df.median(numeric_only=True))
        logging.info("Filled missing values with the median.")
    elif strategy == 'mode':
        modes = df.mode()
        # mode() has no rows when df has none; there is nothing to fill then.
        df_cleaned = df.fillna(modes.iloc[0]) if len(modes) else df.copy()
        logging.info("Filled missing values with the mode.")
    else:
        logging.warning("Invalid strategy selected. Returning original DataFrame.")
        return df

    return df_cleaned

# Rebinds df to the median-imputed result and previews the first rows.
if df is not None:
    df = handle_missing_values(df, strategy='median')
    display(df.head())


In [None]:
# Detect and Handle Outliers
def detect_and_handle_outliers(df, columns, method='IQR'):
    """
    Detects and handles outliers in specified numeric columns using the IQR method.
    - Replaces outliers with the median of the column.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is modified in place and also returned.
    columns : iterable of str
        Columns to check. Names that are missing from ``df`` or refer to a
        non-numeric (or boolean) column are skipped with a logged warning.
    method : str, default 'IQR'
        Only 'IQR' is implemented; any other value logs a warning and the
        IQR rule is applied regardless.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    if str(method).upper() != 'IQR':
        logging.warning(f"Outlier method '{method}' is not implemented; using IQR instead.")

    for col in columns:
        # Accept every numeric dtype (int32, float32, nullable Int64, ...),
        # not just float64/int64. Booleans are excluded because quartiles
        # are not meaningful for them.
        is_numeric = (
            col in df.columns
            and pd.api.types.is_numeric_dtype(df[col])
            and not pd.api.types.is_bool_dtype(df[col])
        )
        if not is_numeric:
            logging.warning(f"Column '{col}' is not numeric or not found.")
            continue

        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Replace outliers with the median
        median = df[col].median()
        outlier_condition = (df[col] < lower_bound) | (df[col] > upper_bound)
        # Comparisons on nullable dtypes yield <NA> for missing entries;
        # a missing value is not an outlier.
        outlier_condition = outlier_condition.fillna(False).astype(bool)
        outlier_count = int(outlier_condition.sum())

        if outlier_count:
            if pd.api.types.is_integer_dtype(df[col]):
                if float(median).is_integer():
                    # Whole-number median: keep the integer dtype.
                    median = int(median)
                else:
                    # A fractional median cannot be stored in an integer
                    # column, so widen the column explicitly first.
                    df[col] = df[col].astype('float64')
            df.loc[outlier_condition, col] = median
        logging.info(f"Handled {outlier_count} outliers in column '{col}'.")

    return df

# Specify numeric columns to check for outliers
numeric_columns = ['price', 'quantity']  # Replace with your actual numeric columns

# Uses the function's default method ('IQR'); df is rebound to the returned frame.
if df is not None:
    df = detect_and_handle_outliers(df, numeric_columns)
    display(df.head())


In [None]:
# Text Normalization (for NLP)
def normalize_text(df, column):
    """
    Normalizes text in the specified column:
    - Converts to lowercase
    - Removes punctuation
    - Removes multiple spaces

    Non-missing values are converted with ``str()`` before normalizing;
    missing values are left missing. If ``column`` is not in ``df`` a warning
    is logged and the frame is left untouched.

    The frame is modified in place and also returned, so both
    ``normalize_text(df, col)`` and ``df = normalize_text(df, col)`` work.
    """
    def _normalize(value):
        without_punct = re.sub(r'[^\w\s]', '', str(value).lower())
        # Collapse whitespace runs, then trim the ends.
        return re.sub(r'\s+', ' ', without_punct).strip()

    if column in df.columns:
        # na_action='ignore' keeps NaN/None as missing instead of turning
        # them into the text 'nan'/'None'.
        df[column] = df[column].map(_normalize, na_action='ignore')
        logging.info(f"Normalized text in column '{column}'.")
    else:
        logging.warning(f"Column '{column}' not found.")

    # Returning the frame matters: callers that assign the result would
    # otherwise replace their DataFrame with None.
    return df

# Specify the text column to normalize
text_column = 'description'  # Replace with your actual text column

# df is rebound to whatever normalize_text returns, so that function must
# return the frame for the preview below (and later cells) to work.
if df is not None:
    df = normalize_text(df, text_column)
    display(df.head())


In [None]:
# Hand the final df to the project's CSV save helper.
# NOTE(review): no output path is passed here — presumably save_dataset_csv
# has a default destination; confirm in the functions module.
if df is not None:
    BasicDataDHandelFunctions.save_dataset_csv(df)