In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load the raw lead-scoring CSV, keep only its first five rows in `df`,
# and write that small sample next to the notebook.
df = pd.read_csv("../data/raw/Lead Scoring.csv").head(5)
df.to_csv('head_of_Lead_Scoring.csv')

In [7]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

# Enable pandas output from transformers
set_config(transform_output="pandas")

# Define global binary columns
binary_cols = [
    'donotemail', 'donotcall', 'search', 'magazine', 'newspaperarticle',
    'xeducationforums', 'newspaper', 'digitaladvertisement',
    'throughrecommendations', 'receivemoreupdatesaboutourcourses',
    'updatemeonsupplychaincontent', 'getupdatesondmcontent',
    'iagreetopaytheamountthroughcheque', 'afreecopyofmasteringtheinterview'
]

# Helper functions (top-level, named)
def safe_lower_strip(x):
    """Return a trimmed, lower-cased string for a single cell value.

    Null values (per ``pd.isna``) and values whose trimmed, lower-cased text
    is ``'nan'``, ``'select'`` or empty are returned as ``'missing'``.
    """
    if pd.isna(x):
        return 'missing'
    cleaned = str(x).strip().lower()
    if cleaned in ('nan', 'select', ''):
        return 'missing'
    return cleaned

def clean_categorical_values_func(df):
    """Apply ``safe_lower_strip`` to every cell of ``df`` and return the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are normalised element-wise.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape and labels as ``df``.
    """
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. The method is looked up on the class (not the instance)
    # so that a column literally named "map" cannot shadow it.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    return elementwise(df, safe_lower_strip)

def clean_ordinal_text(v):
    """Normalise one ordinal label.

    Null values become ``'missing'``. Anything else is converted to a string,
    trimmed, lower-cased, and has the substrings ``'01.'``, ``'02.'`` and
    ``'03.'`` removed.
    """
    if not pd.notnull(v):
        return 'missing'
    text = str(v).strip().lower()
    for prefix in ('01.', '02.', '03.'):
        text = text.replace(prefix, '')
    return text

def clean_ordinal_values_func(x):
    """Apply ``clean_ordinal_text`` to every cell of the DataFrame ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame holding the ordinal columns.

    Returns
    -------
    pandas.DataFrame
        New frame with each cell replaced by its cleaned label.
    """
    # Prefer DataFrame.map (pandas >= 2.1); fall back to the deprecated
    # DataFrame.applymap on older versions. Resolved on the class so a column
    # called "map" cannot be picked up by attribute access.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    return elementwise(x, clean_ordinal_text)

def fix_common_bad_values_func(x):
    """Replace cells that are exactly ``'nan'`` or ``'select'`` with ``'missing'``.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Data whose string cells are inspected.

    Returns
    -------
    Same type as ``x``
        A new object; ``x`` itself is not modified.
    """
    # The pattern is anchored to the whole cell. An unanchored regex replace
    # rewrites substrings too, e.g. 'finance' -> 'fimissingce'.
    return x.replace({r'^(?:nan|select)$': 'missing'}, regex=True)

def convert_to_dataframe_with_binary_cols_func(x):
    """Build a DataFrame from ``x`` using the module-level ``binary_cols`` as its columns."""
    labelled = pd.DataFrame(data=x, columns=binary_cols)
    return labelled

def normalize_column_names_func(df):
    """Return a copy of ``df`` with lower-cased, space-free column names.

    Each label is converted to ``str`` first, so non-string labels (e.g.
    integer column names) are normalised instead of raising AttributeError.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` keeps its original labels.
    """
    def _normalise(label):
        # Remove inner spaces first, then strip any remaining edge whitespace.
        return str(label).lower().replace(' ', '').strip()

    return df.rename(columns=_normalise)

# Lowercases each column's values and maps values not seen during fit (including unseen missing markers) to that column's training mode
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """Label encoding for binary columns with casing and unseen value handling.

    For every column seen in ``fit`` a ``LabelEncoder`` is trained on the
    lower-cased, stripped string form of the values, and the most frequent
    normalised value (the mode) is remembered. In ``transform`` any value not
    among the encoder's classes is replaced by that mode before encoding.
    """

    def __init__(self):
        self.le_dict = {}      # column name -> fitted LabelEncoder
        self.mode_dict = {}    # column name -> most frequent normalised value
        self.columns = None    # column names seen in fit (list) or None

    @staticmethod
    def _normalize(series):
        """Return ``series`` as lower-cased, stripped strings."""
        return series.astype(str).str.lower().str.strip()

    def fit(self, X, y=None):
        """Learn one encoder and one fallback mode per column of ``X``.

        Raises
        ------
        ValueError
            If ``X`` has no ``columns`` attribute.
        """
        if not hasattr(X, 'columns'):
            raise ValueError("CustomLabelEncoder requires DataFrame input with column names")

        self.columns = X.columns.tolist()
        # Start from empty dicts so a re-fit on different columns does not
        # carry encoders or modes from an earlier fit.
        self.le_dict = {}
        self.mode_dict = {}
        for col in self.columns:
            normalized = self._normalize(X[col])
            self.le_dict[col] = LabelEncoder().fit(normalized)
            self.mode_dict[col] = normalized.mode()[0]  # Save mode for fallback
        return self

    def transform(self, X):
        """Encode the fitted columns of ``X``; other columns pass through in the copy.

        Raises
        ------
        ValueError
            If ``X`` has no ``columns`` attribute and no column names were
            stored by ``fit``.
        """
        if not hasattr(X, 'columns'):
            if self.columns is None:
                raise ValueError("Column names not available for numpy array input")
            X = pd.DataFrame(X, columns=self.columns)

        X_encoded = X.copy()
        for col in self.columns:
            le = self.le_dict[col]
            normalized = self._normalize(X_encoded[col])
            # Vectorised unseen-value handling: keep known labels, replace the
            # rest with the mode learned in fit.
            known = normalized.isin(le.classes_)
            X_encoded[col] = le.transform(normalized.where(known, self.mode_dict[col]))
        return X_encoded

    def get_feature_names_out(self, input_features=None):
        """Return the column names stored by ``fit`` as a NumPy array."""
        return np.array(self.columns)
    
def get_feature_names(pipeline, numeric_cols, ordinal_cols, binary_cols, categorical_cols):
    """Return output feature names in the order num, ord, bin, one-hot.

    The one-hot names come from the ``'onehot'`` step of the ``'cat'``
    transformer inside the pipeline's ``'transform'`` step; the other three
    groups are returned unchanged.
    """
    categorical_branch = pipeline.named_steps['transform'].named_transformers_['cat']
    encoder = categorical_branch.named_steps['onehot']
    expanded = list(encoder.get_feature_names_out(categorical_cols))
    passthrough_names = numeric_cols + ordinal_cols + binary_cols
    return passthrough_names + expanded

def get_full_preprocessing_pipeline(df):
    """Build the (unfitted) preprocessing pipeline and return it with its column groups.

    ``df`` is only inspected, never modified or copied: its normalised column
    names are used to report which ID/target columns are present. Those
    columns belong to no transformer group, so the ColumnTransformer below
    discards them through ``remainder='drop'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    tuple
        ``(full_pipeline, numeric_cols, ordinal_cols, binary_cols, categorical_cols)``
        where ``binary_cols`` is the module-level list.
    """
    numeric_cols = [
        'totalvisits', 'totaltimespentonwebsite', 'pageviewspervisit',
        'asymmetriqueactivityscore', 'asymmetriqueprofilescore'
    ]

    ordinal_cols = ['asymmetriqueactivityindex', 'asymmetriqueprofileindex']
    # One independent category list per ordinal column.
    ordinal_categories = [['missing', 'low', 'medium', 'high'] for _ in ordinal_cols]

    # Work on the column index only; copying the whole frame just to build
    # the message below is unnecessary.
    normalized_names = df.columns.str.lower().str.strip().str.replace(' ', '')
    id_columns = ['prospectid', 'leadnumber', 'converted']
    existing_ids = [col for col in id_columns if col in normalized_names]
    if existing_ids:
        print(f"Dropping ID columns: {existing_ids}")

    categorical_cols = [
        'country', 'tags', 'leadsource', 'specialization', 'lastactivity',
        'lastnotableactivity', 'howdidyouhearaboutxeducation', 'city',
        'leadprofile', 'whatisyourcurrentoccupation', 'leadquality',
        'leadorigin', 'whatmattersmosttoyouinchoosingacourse'
    ]

    # Numeric: median impute -> log1p -> scale to [0, 1].
    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('log', FunctionTransformer(np.log1p, validate=False)),
        ('scaler', MinMaxScaler())
    ])

    # Ordinal: clean labels -> map bad values to 'missing' -> impute -> encode.
    ordinal_pipeline = Pipeline([
        ('clean_text', FunctionTransformer(clean_ordinal_values_func, validate=False)),
        ('fix_common_bad_values', FunctionTransformer(fix_common_bad_values_func, validate=False)),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories=ordinal_categories))
    ])

    # Binary: impute -> relabel as DataFrame -> per-column label encoding.
    binary_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('to_df', FunctionTransformer(convert_to_dataframe_with_binary_cols_func, validate=False)),
        ('label_encoder', CustomLabelEncoder())
    ])

    # Categorical: clean text -> constant impute -> one-hot (unknowns ignored).
    categorical_pipeline = Pipeline([
        ('clean_values', FunctionTransformer(clean_categorical_values_func, validate=False)),
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
    ])

    column_transformer = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, numeric_cols),
            ('ord', ordinal_pipeline, ordinal_cols),
            ('bin', binary_pipeline, binary_cols),
            ('cat', categorical_pipeline, categorical_cols)
        ],
        remainder='drop',
        verbose_feature_names_out=False
    )

    full_pipeline = Pipeline([
        ('clean_column_names', FunctionTransformer(normalize_column_names_func, validate=False)),
        ('transform', column_transformer)
    ])

    return full_pipeline, numeric_cols, ordinal_cols, binary_cols, categorical_cols


from sklearn.model_selection import train_test_split

# Split features from the 'Converted' target. `df` must already be loaded in the kernel.
X=df.drop(columns=['Converted'])
y=df['Converted']
# Hold out 20% of the rows; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
# 1. Create pipeline and get column groupings
pipeline, num_cols, ord_cols, bin_cols, cat_cols = get_full_preprocessing_pipeline(X_train)

# 2. Fit pipeline externally
pipeline.fit(X_train)

# 3. Transform the data
# NOTE(review): set_config(transform_output="pandas") is enabled at the top of this cell,
# so this is expected to be a pandas DataFrame rather than a NumPy array - confirm.
Xtrain_transformed = pipeline.transform(X_train)  # transformed training features

# 4. Get feature names
feature_names = get_feature_names(pipeline, num_cols, ord_cols, bin_cols, cat_cols)
print(feature_names, len(feature_names))

Dropping ID columns: ['prospectid', 'leadnumber']
['totalvisits', 'totaltimespentonwebsite', 'pageviewspervisit', 'asymmetriqueactivityscore', 'asymmetriqueprofilescore', 'asymmetriqueactivityindex', 'asymmetriqueprofileindex', 'donotemail', 'donotcall', 'search', 'magazine', 'newspaperarticle', 'xeducationforums', 'newspaper', 'digitaladvertisement', 'throughrecommendations', 'receivemoreupdatesaboutourcourses', 'updatemeonsupplychaincontent', 'getupdatesondmcontent', 'iagreetopaytheamountthroughcheque', 'afreecopyofmasteringtheinterview', 'country_asia/pacific region', 'country_australia', 'country_bahrain', 'country_bangladesh', 'country_belgium', 'country_canada', 'country_china', 'country_denmark', 'country_france', 'country_germany', 'country_ghana', 'country_hong kong', 'country_india', 'country_italy', 'country_kenya', 'country_kuwait', 'country_liberia', 'country_malaysia', 'country_missing', 'country_netherlands', 'country_nigeria', 'country_oman', 'country_philippines', 'cou

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

# Enable pandas output from transformers
set_config(transform_output="pandas")

### 1. Custom Transformers with Feature Names Support ###

class SafeFunctionTransformer(FunctionTransformer):
    """FunctionTransformer subclass that overrides ``get_feature_names_out``.

    When ``input_features`` is given, the names are returned unchanged as an
    object array. Otherwise ``func_<i>`` names are generated from
    ``self.n_output_features_``.

    NOTE(review): ``n_output_features_`` is not assigned anywhere in this
    class; confirm the base class provides it, otherwise the no-argument
    call raises AttributeError.
    """
    def get_feature_names_out(self, input_features=None):
        if input_features is not None:
            return np.asarray(input_features, dtype=object)
        generated = [f"func_{i}" for i in range(self.n_output_features_)]
        return np.array(generated)

class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """Label encoding with feature names support.

    ``fit`` trains one ``LabelEncoder`` per column on values normalised with
    ``safe_lower_strip`` and remembers each column's most frequent normalised
    value. ``transform`` replaces values unknown to the encoder with that
    mode before encoding.
    """
    def __init__(self):
        self.le_dict = {}      # column name -> fitted LabelEncoder
        self.mode_dict = {}    # column name -> most frequent normalised value
        self.columns = None    # column names seen in fit

    def fit(self, X, y=None):
        """Learn encoders and fallback modes for every column of ``X``.

        Raises
        ------
        ValueError
            If ``X`` has no ``columns`` attribute.
        """
        if not hasattr(X, 'columns'):
            raise ValueError("CustomLabelEncoder requires DataFrame input with column names")
        self.columns = X.columns.tolist()
        # Reset per-column state so a second fit does not keep entries from
        # columns seen in a previous fit.
        self.le_dict, self.mode_dict = {}, {}
        for col in self.columns:
            normalized = X[col].apply(safe_lower_strip)
            self.le_dict[col] = LabelEncoder().fit(normalized)
            self.mode_dict[col] = normalized.mode()[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns label-encoded."""
        X_encoded = X.copy()
        for col in self.columns:
            le = self.le_dict[col]
            normalized = X_encoded[col].apply(safe_lower_strip)
            # Vectorised replacement of labels the encoder has never seen.
            seen = normalized.isin(le.classes_)
            X_encoded[col] = le.transform(normalized.where(seen, self.mode_dict[col]))
        return X_encoded

    def get_feature_names_out(self, input_features=None):
        """Return the column names stored by ``fit`` as a NumPy array."""
        return np.array(self.columns)

### 2. Utility Functions ###

def safe_lower_strip(x):
    """Trim and lower-case one value, mapping empty-like values to 'missing'.

    'missing' is returned for nulls (``pd.isna``) and for values whose
    trimmed, lower-cased text is 'nan', 'select' or the empty string.
    """
    placeholders = {'nan', 'select', ''}
    text = None if pd.isna(x) else str(x).strip().lower()
    return 'missing' if text is None or text in placeholders else text

def convert_to_numeric(x):
    """Convert ``x`` with ``pd.to_numeric``; return NaN if that raises ValueError or TypeError."""
    try:
        parsed = pd.to_numeric(x)
    except (ValueError, TypeError):
        parsed = np.nan
    return parsed

def clean_ordinal_text(x):
    """Normalise ``x`` with ``safe_lower_strip`` and remove the substrings '01.', '02.' and '03.'."""
    text = safe_lower_strip(x)
    for prefix in ('01.', '02.', '03.'):
        text = text.replace(prefix, '')
    return text

def normalize_column_names(df):
    """Return ``df`` renamed so every column label is a lower-cased string without spaces."""
    def _tidy(label):
        lowered = str(label).lower()
        return lowered.replace(' ', '').strip()

    return df.rename(columns=_tidy)

def clean_categorical_values(df):
    """Normalise every cell with ``safe_lower_strip`` and map leftover bad values to 'missing'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns.

    Returns
    -------
    pandas.DataFrame
        New frame of cleaned values.
    """
    # DataFrame.map replaces the deprecated DataFrame.applymap (pandas >= 2.1);
    # resolved on the class so a column named "map" cannot shadow it.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    cleaned = elementwise(df, safe_lower_strip)
    # Anchor the pattern to the whole cell: an unanchored regex replace
    # corrupts legitimate categories that merely contain 'nan' or 'select'
    # (e.g. 'finance management' -> 'fimissingce management').
    return cleaned.replace({r'^(?:nan|select)$': 'missing'}, regex=True)

def clean_ordinal_values(df):
    """Return a new DataFrame with ``clean_ordinal_text`` applied to every cell of ``df``."""
    # Use DataFrame.map where it exists (pandas >= 2.1) and fall back to the
    # deprecated DataFrame.applymap otherwise.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    return elementwise(df, clean_ordinal_text)

def convert_numeric_values(df):
    """Return a new DataFrame with ``convert_to_numeric`` applied to every cell of ``df``."""
    # DataFrame.applymap emits a FutureWarning from pandas 2.1 onwards;
    # DataFrame.map is its replacement. Older pandas keeps using applymap.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    return elementwise(df, convert_to_numeric)

def clean_binary_values(df):
    """Return a new DataFrame with ``safe_lower_strip`` applied to every cell of ``df``."""
    # Element-wise mapping via DataFrame.map (pandas >= 2.1), falling back to
    # the deprecated DataFrame.applymap on older installations.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    return elementwise(df, safe_lower_strip)

def remove_id_columns(df):
    """Return ``df`` without the 'prospectid', 'leadnumber' and 'converted' columns.

    Columns from that list that are absent are simply skipped.
    """
    id_columns = ['prospectid', 'leadnumber', 'converted']
    return df.drop(columns=id_columns, errors='ignore')

### 3. Pipeline Construction ###

def get_full_preprocessing_pipeline():
    """Assemble and return the unfitted preprocessing pipeline.

    Steps, in order: normalise column names, remove ID/target columns, then a
    ColumnTransformer with four branches ('num', 'ord', 'bin', 'cat'). Columns
    outside the four groups are dropped and output names carry no branch
    prefix.
    """
    def wrap(func):
        # Every custom function is wrapped identically.
        return SafeFunctionTransformer(func, validate=False)

    # --- Column groups -----------------------------------------------------
    numeric_cols = [
        'totalvisits', 'totaltimespentonwebsite', 'pageviewspervisit',
        'asymmetriqueactivityscore', 'asymmetriqueprofilescore'
    ]
    ordinal_cols = ['asymmetriqueactivityindex', 'asymmetriqueprofileindex']
    ordinal_categories = [['missing', 'low', 'medium', 'high']] * 2
    binary_cols = [
        'donotemail', 'donotcall', 'search', 'magazine', 'newspaperarticle',
        'xeducationforums', 'newspaper', 'digitaladvertisement',
        'throughrecommendations', 'receivemoreupdatesaboutourcourses',
        'updatemeonsupplychaincontent', 'getupdatesondmcontent',
        'iagreetopaytheamountthroughcheque', 'afreecopyofmasteringtheinterview'
    ]
    categorical_cols = [
        'country', 'tags', 'leadsource', 'specialization', 'lastactivity',
        'lastnotableactivity', 'howdidyouhearaboutxeducation', 'city',
        'leadprofile', 'whatisyourcurrentoccupation', 'leadquality',
        'leadorigin', 'whatmattersmosttoyouinchoosingacourse'
    ]

    # --- Per-group branches ------------------------------------------------
    num_steps = [
        ('type_convert', wrap(convert_numeric_values)),
        ('imputer', SimpleImputer(strategy='median')),
        ('log', wrap(np.log1p)),
        ('scaler', MinMaxScaler()),
    ]
    ord_steps = [
        ('clean', wrap(clean_ordinal_values)),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories=ordinal_categories)),
    ]
    bin_steps = [
        ('clean', wrap(clean_binary_values)),
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', CustomLabelEncoder()),
    ]
    cat_steps = [
        ('clean', wrap(clean_categorical_values)),
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ]

    # --- Route each group to its branch --------------------------------------
    branches = [
        ('num', Pipeline(num_steps), numeric_cols),
        ('ord', Pipeline(ord_steps), ordinal_cols),
        ('bin', Pipeline(bin_steps), binary_cols),
        ('cat', Pipeline(cat_steps), categorical_cols),
    ]
    column_transformer = ColumnTransformer(
        transformers=branches,
        remainder='drop',
        verbose_feature_names_out=False,
    )

    # --- Full pipeline -------------------------------------------------------
    return Pipeline([
        ('clean_names', wrap(normalize_column_names)),
        ('remove_ids', wrap(remove_id_columns)),
        ('transform', column_transformer),
    ])

### 4. Feature Names Extraction ###

def get_feature_names(pipeline):
    """Extract output feature names from the fitted pipeline.

    Walks the fitted entries of the ColumnTransformer stored under the
    pipeline's ``'transform'`` step and concatenates the names each one
    produces.

    Parameters
    ----------
    pipeline : fitted pipeline exposing ``named_steps['transform']`` with a
        ``transformers_`` list of ``(name, transformer, columns)`` tuples.

    Returns
    -------
    list
        Feature names in transformer order.
    """
    col_transformer = pipeline.named_steps['transform']

    feature_sets = []

    for name, trans, cols in col_transformer.transformers_:
        if isinstance(trans, str) and trans == 'drop':
            # Dropped columns (e.g. a ('remainder', 'drop', cols) entry)
            # produce no output features and must not be listed.
            continue
        if name == 'bin':
            # Binary branch: ask the custom encoder step directly.
            encoder = trans.named_steps['encoder']
            feature_sets.extend(encoder.get_feature_names_out(cols))
        elif hasattr(trans, 'get_feature_names_out'):
            # Standard transformers
            feature_sets.extend(trans.get_feature_names_out(cols))
        else:
            # Fallback for transformers without feature names
            feature_sets.extend(cols)

    return feature_sets

### 5. Example Usage ###

if __name__ == "__main__":
    # Sample data (replace with your actual data)
    # NOTE(review): the load below is commented out, so `df` must already exist
    # in the kernel namespace from an earlier cell for this block to run.
    # df = pd.read_csv('your_data.csv')
    
    # Create pipeline
    pipeline = get_full_preprocessing_pipeline()
    
    # Fit pipeline
    # The target column is removed before fitting; the fit uses every row of `df`.
    X = df.drop(columns=['Converted'])
    pipeline.fit(X)
    
    # Get feature names
    feature_names = get_feature_names(pipeline)
    print("Generated features:", feature_names)
    
    # Transform data
    transformed_data = pipeline.transform(X)
    print("Transformed data shape:", transformed_data.shape)
    

Generated features: ['totalvisits', 'totaltimespentonwebsite', 'pageviewspervisit', 'asymmetriqueactivityscore', 'asymmetriqueprofilescore', 'asymmetriqueactivityindex', 'asymmetriqueprofileindex', 'donotemail', 'donotcall', 'search', 'magazine', 'newspaperarticle', 'xeducationforums', 'newspaper', 'digitaladvertisement', 'throughrecommendations', 'receivemoreupdatesaboutourcourses', 'updatemeonsupplychaincontent', 'getupdatesondmcontent', 'iagreetopaytheamountthroughcheque', 'afreecopyofmasteringtheinterview', 'country_asia/pacific region', 'country_australia', 'country_bahrain', 'country_bangladesh', 'country_belgium', 'country_canada', 'country_china', 'country_denmark', 'country_france', 'country_germany', 'country_ghana', 'country_hong kong', 'country_india', 'country_indonesia', 'country_italy', 'country_kenya', 'country_kuwait', 'country_liberia', 'country_malaysia', 'country_missing', 'country_netherlands', 'country_nigeria', 'country_oman', 'country_philippines', 'country_qata

In [3]:
import pickle
# 2. Save the pipeline to current folder
# NOTE: pickle stores classes and functions by reference (module + qualified name),
# not by value. Putting the classes in 'custom_classes' therefore does not make the
# file self-contained: whoever loads it must already have the same names defined
# (here in __main__) before calling pickle.load.
with open('preprocessing_pipeline.pkl', 'wb') as f:
    pickle.dump({
            'pipeline': pipeline,
            'custom_classes': {
                'SafeFunctionTransformer': SafeFunctionTransformer,
                'CustomLabelEncoder': CustomLabelEncoder
            }
        }, f)

print("Pipeline saved as 'preprocessing_pipeline.pkl' in current directory")

Pipeline saved as 'preprocessing_pipeline.pkl' in current directory


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Load your dataset
df = pd.read_csv("../data/raw/Lead Scoring.csv")
# Not the last expression of the cell, so this preview is computed but not displayed.
df.head()


# Then when loading:
def load_pipeline(filename, custom_objects=None):
    """Load the pickled ``{'pipeline': ..., 'custom_classes': ...}`` bundle and return the pipeline.

    pickle resolves classes and functions by name *while* loading, so they
    must exist before ``pickle.load`` runs. Injecting ``custom_classes`` from
    the loaded payload afterwards is too late for a fresh kernel. Pass the
    required definitions through ``custom_objects`` to register them first.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file.
    custom_objects : dict, optional
        Mapping ``name -> object`` set as attributes on the ``__main__``
        module before unpickling (classes and helper functions the pipeline
        references).

    Returns
    -------
    object
        The value stored under the ``'pipeline'`` key.

    Raises
    ------
    AttributeError
        If the pickle references a name that cannot be resolved.
    """
    if custom_objects:
        import sys
        main_module = sys.modules['__main__']
        for name, obj in custom_objects.items():
            setattr(main_module, name, obj)

    with open(filename, 'rb') as f:
        try:
            # SECURITY: pickle.load can execute arbitrary code; only load
            # files from a trusted source.
            data = pickle.load(f)
        except AttributeError as exc:
            raise AttributeError(
                f"{exc}. Define or import the pipeline's custom classes and helper "
                "functions before loading, or pass them via custom_objects."
            ) from exc

    # Kept for backward compatibility: expose the bundled classes to this module.
    globals().update(data.get('custom_classes', {}))
    return data['pipeline']
# In the recorded run of this cell the load failed with the AttributeError shown in
# its output: SafeFunctionTransformer was not defined in __main__ when unpickling.
loaded_pipelines=load_pipeline("preprocessing_pipeline.pkl")

# NOTE(review): `data` only exists inside load_pipeline; the loaded object here is
# `loaded_pipelines`.
#data.fit(df)

AttributeError: Can't get attribute 'SafeFunctionTransformer' on <module '__main__'>

In [1]:
import os
from collections import defaultdict

def build_tree(path):
    """Walk ``path`` and return its contents as a nested mapping.

    Directories map to dicts of their own contents and files map to ``None``.
    The top level is a ``defaultdict(dict)``; nested levels are plain dicts.
    """
    tree = defaultdict(dict)

    for folder, subdirs, filenames in os.walk(path):
        # Descend from the top to the node that represents `folder`.
        node = tree
        for segment in os.path.relpath(folder, path).split(os.sep):
            if segment == '.':
                continue
            node = node[segment]
        node.update({name: {} for name in subdirs})
        node.update(dict.fromkeys(filenames))  # None marks a file
    return tree

def print_tree(tree, indent=0):
    """Print a nested mapping, one entry per line, indented four spaces per level.

    Dict values are printed with a folder icon and recursed into; every other
    value is printed with a file icon.
    """
    pad = "    " * indent
    for key, value in tree.items():
        if isinstance(value, dict):
            print(pad + f"📁 {key}")
            print_tree(value, indent + 1)
        else:
            print(pad + f"📄 {key}")

# 📌 Example usage
# NOTE(review): project_root is an absolute path on one specific machine; the cell
# only runs where that directory exists.
if __name__ == "__main__":
    project_root =  "C:/Users/Minfy/Documents/Minfy 2nd Month Data+AI/SalesConversions"  # 👈 Replace with actual path
    # Build the nested mapping first, then print it under a header line.
    project_tree = build_tree(project_root)
    print("📦 Project Structure:")
    print_tree(project_tree)


📦 Project Structure:
📁 config
    📄 airflow_dag_config.json
    📄 aws_config.yaml
    📄 config.yaml
📁 dags
    📁 __pycache__
        📄 full_pipeline.cpython-38.pyc
        📄 monitor_pipeline.cpython-38.pyc
        📄 retrain_pipeline.cpython-38.pyc
    📄 full_pipeline.py
    📄 monitor_pipeline.py
    📄 retrain_pipeline.py
    📄 train_pipeline.py
📁 data
    📁 artifacts
        📄 fitted_pipeline.pkl
    📁 external
    📁 processed
        📄 eda_cleaned.csv
    📁 raw
        📄 Lead Scoring.csv
        📄 LeadScoring.csv
    📄 fitted_pipeline.pkl
    📄 preprocessing_pipeline.pkl
📁 data_ingestion
    📄 from_postgres.py
📁 deployment
    📁 docker
    📁 flask_app
        📁 templates
            📄 form.html
        📄 app.py
    📄 sagemaker_deploy.py
📁 logs
    📁 dag_processor_manager
        📄 dag_processor_manager.log
    📁 scheduler
        📁 2025-07-18
            📄 full_pipeline.py.log
            📄 monitor_pipeline.py.log
        📄 latest
📁 ml
    📁 __pycache__
        📄 bestmodel_to_producti