In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import logging
from notebooks.functions import BasicDataDHandelFunctions
# Configure root logging: INFO and above, formatted as "<time> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and dtype warnings
# raised by the libraries used below; consider narrowing it once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs.
SEED = 42
np.random.seed(SEED)


In [None]:
# Load the raw dataset through the project helper. Later cells guard on
# `df is not None`, so the helper is presumably expected to return None on
# failure — TODO confirm against notebooks.functions.
file_path = '../data/sample.csv'
df = BasicDataDHandelFunctions.load_dataset(file_path)

In [None]:
# Feature Creation - Derived Features
def week_of_month(date):
    """Return the 1-based week of the month that ``date`` falls in.

    Weeks run Monday through Sunday. Week 1 is the (possibly partial) week
    containing the 1st of the month, so results range from 1 to 6.

    Parameters:
    - date: A date-like object exposing ``day``, ``replace(day=...)`` and
      ``weekday()`` (Monday == 0), e.g. ``datetime.date`` or ``pd.Timestamp``.

    Returns:
    - Week index within the month, starting at 1.
    """
    # Days of the first calendar row that still belong to the previous month.
    leading_days = date.replace(day=1).weekday()
    # ``date.day`` is 1-based; shift it to 0-based before dividing, otherwise
    # every Sunday (including the 1st of a month that starts on a Sunday)
    # is pushed into the following week.
    return (date.day - 1 + leading_days) // 7 + 1

def create_derived_features(df):
    """Add sales and calendar features derived from existing columns.

    The DataFrame is modified in place and also returned.

    Parameters:
    - df: DataFrame to extend. 'price' and 'quantity' are both required for
      'total_sales'; 'date' must be parseable by ``pd.to_datetime``.

    Returns:
    - The same DataFrame with, where the inputs exist:
      'total_sales' (price * quantity), 'year', 'month',
      'week_number' (ISO week of the year), 'day_of_week_year'
      (day of the week, Monday == 0) and 'day_of_week_month'
      (week of the month as computed by ``week_of_month``).
    """
    if 'price' in df.columns and 'quantity' in df.columns:
        df['total_sales'] = df['price'] * df['quantity']
        logging.info("Created 'total_sales' feature as price * quantity.")
    else:
        logging.warning("Columns 'price' or 'quantity' not found for derived feature creation.")

    if 'date' in df.columns:
        # Parse the column once and reuse it for every calendar feature.
        dates = pd.to_datetime(df['date'])
        df['year'] = dates.dt.year
        df['month'] = dates.dt.month
        df['week_number'] = dates.dt.isocalendar().week
        df['day_of_week_year'] = dates.dt.dayofweek
        df['day_of_week_month'] = dates.apply(week_of_month)
        # Name the columns that were actually written so the log can be
        # matched against the resulting frame.
        logging.info(
            "Extracted 'year', 'month', 'week_number', 'day_of_week_year', "
            "and 'day_of_week_month' from 'date'."
        )
    else:
        logging.warning("Column 'date' not found for date-based feature extraction.")

    return df

# Skipped when no dataset is loaded (df is None); otherwise `df` is rebound to
# the frame carrying the derived columns and a preview is shown.
if df is not None:
    df = create_derived_features(df)
    display(df.head())


In [None]:
# Feature Transformation - Scaling Numeric Features
def scale_numeric_features(df, columns, method='standard'):
    """Scale numeric columns with StandardScaler or MinMaxScaler.

    Each column is fitted and transformed on its own. The DataFrame is
    modified in place and also returned.

    Parameters:
    - df: DataFrame holding the columns to scale.
    - columns: Iterable of column names; names missing from ``df`` are
      skipped with a warning.
    - method: 'standard' for StandardScaler, 'minmax' for MinMaxScaler. Any
      other value still falls back to MinMaxScaler, but the fallback is
      reported with a warning.

    Returns:
    - The same DataFrame with the requested columns scaled.
    """
    if df.empty:
        # The scalers raise on zero samples; there is nothing to fit anyway.
        logging.warning("DataFrame is empty; skipping scaling.")
        return df

    if method == 'standard':
        scaler, applied = StandardScaler(), 'standard'
    else:
        if method != 'minmax':
            # A typo such as 'Standard' would otherwise switch scalers unnoticed.
            logging.warning(f"Unknown scaling method '{method}'; falling back to MinMaxScaler.")
        scaler, applied = MinMaxScaler(), 'minmax'

    for col in columns:
        if col not in df.columns:
            logging.warning(f"Column '{col}' not found for scaling.")
            continue
        df[col] = scaler.fit_transform(df[[col]])
        # Report the scaler that actually ran, not the label that was requested.
        logging.info(f"Scaled '{col}' using {applied} scaling.")

    return df

# Columns to standardize. 'total_sales' exists only if create_derived_features
# found both 'price' and 'quantity'; names absent from df are skipped with a warning.
numeric_columns = ['price', 'quantity', 'total_sales']

if df is not None:
    df = scale_numeric_features(df, numeric_columns, method='standard')
    display(df.head())


In [None]:
# Feature Encoding - Categorical Variables
def encode_categorical_features(df, columns):
    """Label-encode the requested columns, replacing each with integer codes.

    Values are cast to ``str`` before encoding, so every value (including a
    missing one, via its string form) receives a code. One encoder instance
    is refit per column. The DataFrame is modified in place and also returned.

    Parameters:
    - df: DataFrame holding the columns to encode.
    - columns: Iterable of column names; names missing from ``df`` are
      skipped with a warning.

    Returns:
    - The same DataFrame with the requested columns encoded.
    """
    encoder = LabelEncoder()
    for col in columns:
        if col not in df.columns:
            logging.warning(f"Column '{col}' not found for encoding.")
            continue
        as_text = df[col].astype(str)
        df[col] = encoder.fit_transform(as_text)
        logging.info(f"Encoded categorical column '{col}'.")
    return df

# Columns to label-encode.
# NOTE(review): create_derived_features in this notebook writes the weekday to
# 'day_of_week_year', not 'day_of_week'; unless the raw data already has a
# 'day_of_week' column, that entry is skipped with a warning — confirm intent.
categorical_columns = ['category', 'day_of_week']

if df is not None:
    df = encode_categorical_features(df, categorical_columns)
    display(df.head())


In [None]:
# Text Feature Extraction - TF-IDF Vectorization
def extract_text_features(df, text_column, max_features=100):
    """Replace a free-text column with its TF-IDF term weights.

    Parameters:
    - df: DataFrame containing the text column.
    - text_column: Name of the column to vectorize.
    - max_features: Vocabulary size limit passed to TfidfVectorizer.

    Returns:
    - A new DataFrame with ``text_column`` dropped and one column per TF-IDF
      term appended, keeping the row index of ``df``. If ``text_column`` is
      missing, ``df`` itself is returned unchanged.
    """
    if text_column not in df.columns:
        logging.warning(f"Column '{text_column}' not found for text feature extraction.")
        return df

    # Missing entries become empty documents; other non-string values are
    # stringified because the vectorizer only accepts text.
    documents = df[text_column].fillna('').astype(str)
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(documents)
    # Reuse df's index. concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (NaN-padded extras) whenever df has been
    # filtered, sorted or otherwise carries a non-default index.
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,
    )
    logging.info(f"Extracted TF-IDF features from '{text_column}'.")
    # NOTE(review): a term equal to an existing column name (e.g. 'price')
    # produces duplicate column labels in the result.
    return pd.concat([df.drop(columns=[text_column]), tfidf_df], axis=1)

# Free-text column to vectorize; extract_text_features drops it and appends
# the TF-IDF term columns in its place.
text_column = 'description'

if df is not None:
    df = extract_text_features(df, text_column)
    display(df.head())


In [None]:
# Automated Feature Selection
def select_best_features(df, target_column, num_features=5):
    """
    Selects the best features using ANOVA F-statistic and returns the transformed DataFrame.

    Only numeric and boolean columns are scored; other columns (raw strings,
    datetimes, ...) are left out with a warning because the F-test cannot
    consume them. If more features are requested than are available, all
    available features are kept.

    Parameters:
    - df: DataFrame to process
    - target_column: The target column for supervised feature selection
    - num_features: Number of top features to select

    Returns:
    - Transformed DataFrame with selected features and target column, on a
      fresh 0..n-1 index. ``df`` is returned unchanged when the target column
      is missing or no scorable feature column exists.
    """
    if target_column not in df.columns:
        logging.error(f"Target column '{target_column}' not found in DataFrame.")
        return df

    # Separate features and target
    candidates = df.drop(columns=[target_column])
    X = candidates.select_dtypes(include=['number', 'bool'])
    y = df[target_column]

    skipped = [c for c in candidates.columns if c not in X.columns]
    if skipped:
        logging.warning(f"Ignoring non-numeric columns for feature selection: {skipped}")

    n_available = X.shape[1]
    if n_available == 0:
        logging.error("No numeric feature columns available for feature selection.")
        return df

    # SelectKBest rejects k larger than the number of features; cap it.
    # Non-integer values such as 'all' are passed through untouched.
    k = num_features
    if isinstance(num_features, int) and num_features > n_available:
        logging.warning(
            f"Requested {num_features} features but only {n_available} available; keeping all."
        )
        k = n_available

    # Perform feature selection
    selector = SelectKBest(score_func=f_classif, k=k)
    X_new = selector.fit_transform(X, y)

    # Get selected feature names
    selected_columns = X.columns[selector.get_support()]
    logging.info(f"Selected top {len(selected_columns)} features: {list(selected_columns)}")

    # Create a new DataFrame with the selected features and the target column.
    # X_new is a plain array, so the target is re-indexed to line up row by row.
    df_selected = pd.DataFrame(X_new, columns=selected_columns)
    df_selected[target_column] = y.reset_index(drop=True)

    return df_selected

# Supervised selection needs a label column. The guard below skips this cell
# without any message when `target_column` is absent from df, in which case
# `df_selected` is never defined.
target_column = 'target'  # Replace with your actual target column

if df is not None and target_column in df.columns:
    df_selected = select_best_features(df, target_column, num_features=5)
    display(df_selected.head())


In [None]:
# Hand the engineered frame to the project helper for saving.
# NOTE(review): this passes `df`, not `df_selected` from the feature-selection
# cell, and no output path is given — presumably the helper chooses the
# destination; confirm both are intended.
if df is not None:
    BasicDataDHandelFunctions.save_dataset_csv(df)