In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline


In [3]:
# Raw inputs. Judging by the file names these are session-level and hit-level
# Google Analytics exports -- TODO confirm against the data source.
SESSIONS_CSV = './ga_sessions.csv'
HITS_CSV = './ga_hits.csv'

# Both files are read with the same options; low_memory=False turns off
# chunk-wise parsing so column dtypes are inferred from the whole file.
df1, df2 = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (SESSIONS_CSV, HITS_CSV)
)

In [5]:
def df_merge(df1, df2):
    """Join the rows of ``df2`` onto ``df1`` by ``session_id``.

    ``df2`` is first narrowed to the rows whose ``session_id`` also occurs in
    ``df1``; the two frames are then combined with pandas' default (inner)
    merge on ``session_id``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Both must contain a ``session_id`` column.

    Returns
    -------
    pandas.DataFrame
        One row per matching (``df1`` row, ``df2`` row) pair.
    """
    in_df1 = df2['session_id'].isin(df1['session_id'])
    return df1.merge(df2.loc[in_df1], on='session_id')

In [6]:
class TargetProcessor(BaseEstimator, TransformerMixin):
    """Add a binary ``target`` column derived from ``event_action``.

    Parameters
    ----------
    target_type : str
        ``'churn'`` labels every row individually; any other value labels
        whole sessions.
    target_events : list-like
        ``event_action`` values that count as the target event.
    """

    def __init__(self, target_type, target_events):
        self.target_events = target_events
        self.target_type = target_type

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with an integer 0/1 ``target`` column.

        Raises
        ------
        ValueError
            For non-churn targets, when no row of ``X`` carries one of the
            target events.
        """
        labelled = X.copy()
        row_is_target = labelled['event_action'].isin(self.target_events)

        if self.target_type == 'churn':
            # Row-level label: 1 exactly on the rows that are a target event.
            labelled['target'] = row_is_target.astype(int)
            return labelled

        if not row_is_target.any():
            raise ValueError(f"Нет целевых событий {self.target_events} в данных!")

        # Session-level label: every row of a session that contains at least
        # one target event gets 1.
        sessions_with_target = set(labelled.loc[row_is_target, 'session_id'].unique())
        labelled['target'] = labelled['session_id'].isin(sessions_with_target).astype(int)
        return labelled

In [7]:
def df_nancleaner(df, max_missing_share=0.2, placeholders=('(not set)', '(none)')):
    """Turn placeholder strings into NaN, then drop sparse columns and incomplete rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    max_missing_share : float, default 0.2
        Columns whose share of missing values (after placeholder replacement)
        is strictly greater than this are dropped.
    placeholders : iterable of str, default ('(not set)', '(none)')
        Cell values that should be treated as missing.

    Returns
    -------
    pandas.DataFrame
        A new frame without the sparse columns and without any row that still
        contains a missing value. The surviving rows keep their index labels.
    """
    placeholders = list(placeholders)
    # One replace pass covers every placeholder; with nothing to replace, a
    # plain copy keeps the "input is never modified" guarantee.
    cleaned = df.replace(placeholders, np.nan) if placeholders else df.copy()

    missing_share = cleaned.isna().mean()
    sparse_columns = missing_share[missing_share > max_missing_share].index

    return cleaned.drop(columns=sparse_columns).dropna()

In [8]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds time, per-session and per-client features.

    Added columns: ``visit_datetime``, ``hour``, ``day_of_week``,
    ``event_count``, ``max_hit_number``, ``unique_actions``,
    ``user_sessions`` and ``avg_events``.

    NOTE(review): all statistics are computed from whatever frame is passed to
    ``transform``, so the values depend on which rows are present in that
    frame -- confirm this is intended for train/test use.
    """

    def fit(self, df, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, df):
        """Return a new frame with the engineered columns appended.

        The input frame is left untouched and the result carries the same
        index (same labels, same row order) as the input, so it stays aligned
        with any target vector that shares the input's index.

        Expects the columns ``visit_date``, ``visit_time``, ``session_id``,
        ``hit_number``, ``event_action`` and ``client_id``.
        """
        original_index = df.index
        # Work on a copy: assigning columns on the caller's frame would leak
        # the new columns back into e.g. the train/test splits.
        out = df.copy()

        out['visit_datetime'] = pd.to_datetime(out['visit_date'] + ' ' + out['visit_time'])
        out['hour'] = out['visit_datetime'].dt.hour
        out['day_of_week'] = out['visit_datetime'].dt.dayofweek

        # Per-session statistics, one row per session_id.
        session_stats = (
            out.groupby('session_id')
            .agg(
                event_count=('hit_number', 'count'),
                max_hit_number=('hit_number', 'max'),
                unique_actions=('event_action', 'nunique'),
            )
            .reset_index()
        )
        out = out.merge(session_stats, on='session_id', how='left')

        # Per-client statistics; avg_events is the mean of event_count over
        # the client's rows (not over the client's sessions).
        user_stats = (
            out.groupby('client_id')
            .agg(
                user_sessions=('session_id', 'nunique'),
                avg_events=('event_count', 'mean'),
            )
            .reset_index()
        )
        out = out.merge(user_stats, on='client_id', how='left')

        # merge() returns a fresh RangeIndex. Both right-hand frames have
        # unique keys and the joins are left joins, so row count and row order
        # are unchanged and the original labels can be put back.
        out.index = original_index
        return out

In [9]:
class RareCategoryProcessor(BaseEstimator, TransformerMixin):
    """Replace infrequent values of every column with the string ``'other'``.

    Parameters
    ----------
    threshold : float, default 0.001
        A value is "rare" in a column when its share among the column's
        non-missing values (``value_counts(normalize=True)``) is strictly
        below this threshold.

    Attributes
    ----------
    rare_categories : dict
        Maps column name -> pandas Index of rare values; rebuilt on each fit.
    """

    def __init__(self, threshold=0.001):
        self.threshold = threshold
        self.rare_categories = {}

    def fit(self, df, y=None):
        """Learn the rare values of every column in ``df``; returns ``self``."""
        # Build into a fresh dict so a refit never keeps columns (or rare
        # values) learned from a previously fitted frame.
        learned = {}
        for col in df.columns:
            shares = df[col].value_counts(normalize=True)
            learned[col] = shares[shares < self.threshold].index
        self.rare_categories = learned
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the learned rare values set to ``'other'``.

        Columns that were not seen during ``fit`` are passed through unchanged.
        """
        df_copy = df.copy()
        for col in df_copy.columns:
            rare_values = self.rare_categories.get(col)
            # Nothing learned for this column, or nothing rare in it.
            if rare_values is None or len(rare_values) == 0:
                continue
            df_copy[col] = df_copy[col].replace(list(rare_values), 'other')
        return df_copy

In [10]:


class HitPagePathTransformer(BaseEstimator, TransformerMixin):
    """Turn a page path into the number of ``/``-separated segments.

    The query string (everything from the first ``?``) is ignored. The count
    is ``len(path.split('/'))``, so a leading slash contributes an empty
    first segment: ``'/a/b?x=1'`` -> 3.

    The result always has shape ``(n_samples, 1)`` so it can be fed to
    scikit-learn estimators that require 2-D input (e.g. ``StandardScaler``).
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    @staticmethod
    def _segment_count(path):
        # Drop the query string first, then count the '/'-separated pieces.
        return len(path.split('?')[0].split('/'))

    def transform(self, X):
        """Return the segment counts as a single-column 2-D result.

        Parameters
        ----------
        X : pandas.DataFrame or array-like of str
            A DataFrame must contain a ``hit_page_path`` column. Any other
            input is flattened, so both ``(n,)`` and ``(n, 1)`` shapes work.

        Returns
        -------
        pandas.DataFrame or numpy.ndarray
            A one-column DataFrame (index preserved) for DataFrame input,
            otherwise an array of shape ``(n, 1)``.
        """
        if isinstance(X, pd.DataFrame):
            return X['hit_page_path'].map(self._segment_count).to_frame()

        paths = np.asarray(X, dtype=object).ravel()
        return np.array([self._segment_count(p) for p in paths]).reshape(-1, 1)


class SafeTargetEncoder(BaseEstimator, TransformerMixin):
    """DataFrame-in / DataFrame-out wrapper around ``category_encoders.TargetEncoder``.

    Parameters
    ----------
    cols : list of str or None, default None
        Columns to encode. With ``None`` the whole frame is handed to
        ``TargetEncoder`` and its own column selection applies.
    smoothing : float, default 1.0
        Passed through to ``TargetEncoder``.

    Attributes
    ----------
    encoder : TargetEncoder or None
        The fitted encoder; ``None`` until ``fit`` has been called.
    """

    def __init__(self, cols=None, smoothing=1.0):
        self.cols = cols
        self.smoothing = smoothing
        self.encoder = None

    def fit(self, X, y):
        """Fit the wrapped encoder on ``X`` (a DataFrame) and target ``y``.

        ``y`` is matched to ``X`` by position, the convention scikit-learn
        pipelines use. It is re-indexed onto ``X.index`` because upstream
        steps may have changed the index of ``X``.
        NOTE(review): recent category_encoders releases reject pandas inputs
        whose indexes differ -- confirm against the installed version.
        """
        features = X if self.cols is None else X[self.cols]
        y_by_position = pd.Series(np.asarray(y).ravel(), index=X.index)

        self.encoder = TargetEncoder(cols=self.cols, smoothing=self.smoothing)
        self.encoder.fit(features, y_by_position)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns target-encoded."""
        if self.cols is None:
            # The encoder was fitted on the full frame, so transform it whole.
            return self.encoder.transform(X)

        X = X.copy()
        X[self.cols] = self.encoder.transform(X[self.cols])
        return X



In [11]:
# For every supported model type: the event names that TargetProcessor looks
# for in the `event_action` column when it builds the target.
_TARGET_EVENTS_BY_MODEL = (
    ('lead_conversion', ['phone_entered', 'start_chat', 'sub_submit_success']),
    ('engagement', ['go_to_car_card', 'photos_all', 'search_form_search_btn']),
    ('upsell', ['click_pos_credit', 'click_insurance']),
    ('churn', ['search_form_clear']),
)

# Shape: {model_type: {'target_events': [event_name, ...]}}
MODEL_CONFIGS = {
    model_name: {'target_events': list(event_names)}
    for model_name, event_names in _TARGET_EVENTS_BY_MODEL
}

In [14]:
def build_pipeline(model_type):
    """Assemble the unfitted training pipeline.

    Steps, in order: ``FeatureEngineer`` -> ``ColumnTransformer``
    preprocessing -> ``RandomUnderSampler`` -> ``StackingClassifier``
    (gradient boosting, XGBoost and CatBoost base learners combined by a
    logistic regression).

    Parameters
    ----------
    model_type : str
        Not read anywhere in this function; every model type currently gets
        the same pipeline.

    Returns
    -------
    imblearn.pipeline.Pipeline
        The pipeline, ready for ``fit``.
    """
    # ---- column groups; columns not listed here are dropped ----
    scaled_columns = [
        'visit_number', 'hit_number', 'hour', 'day_of_week',
        'event_count', 'max_hit_number', 'unique_actions',
        'user_sessions', 'avg_events',
    ]
    target_encoded_columns = [
        'utm_source', 'utm_campaign', 'utm_adcontent',
        'device_browser', 'geo_city',
    ]
    one_hot_columns = ['device_category']

    # ---- per-group transformers ----
    # `Pipeline` is whichever class the imports cell bound last to that name.
    path_depth = Pipeline([
        ('path_processor', HitPagePathTransformer()),
        ('scaler', StandardScaler()),
    ])
    target_encoder = SafeTargetEncoder(cols=target_encoded_columns, smoothing=1.0)
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', max_categories=100)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), scaled_columns),
            ('hit_path', path_depth, ['hit_page_path']),
            ('cat_target', target_encoder, target_encoded_columns),
            ('cat_onehot', one_hot_encoder, one_hot_columns),
        ],
        remainder='drop',
    )

    # ---- stacked ensemble ----
    gbdt_model = GradientBoostingClassifier(
        random_state=42, n_estimators=100, max_depth=3,
    )
    xgb_model = XGBClassifier(
        random_state=42,
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        eval_metric='logloss',
        use_label_encoder=False,
    )
    catboost_model = CatBoostClassifier(
        random_state=42, iterations=100, depth=3, silent=True, thread_count=1,
    )
    meta_model = LogisticRegression(
        penalty='l2', C=0.1, random_state=42, max_iter=1000,
    )
    stacking = StackingClassifier(
        estimators=[
            ('gbdt', gbdt_model),
            ('xgb', xgb_model),
            ('catboost', catboost_model),
        ],
        final_estimator=meta_model,
        stack_method='auto',
        passthrough=False,
        n_jobs=-1,
    )

    # The imbalanced-learn pipeline is required because of the sampler step.
    return ImbPipeline([
        ('feature_engineer', FeatureEngineer()),
        ('preprocessor', preprocessor),
        ('undersampler', RandomUnderSampler(random_state=42)),
        ('stacking', stacking),
    ])

In [12]:
def train_model(df1, df2, model_type):
    """Build the labelled dataset, split it, and fit the pipeline.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames joined on ``session_id`` by ``df_merge``.
    model_type : str
        Key of ``MODEL_CONFIGS``; selects the target events and is passed to
        ``TargetProcessor`` and ``build_pipeline``.

    Returns
    -------
    tuple
        ``(fitted_pipeline, X_test, y_test)`` where the test split is 20% of
        the cleaned rows, stratified by the target.

    Raises
    ------
    KeyError
        If ``model_type`` is not a key of ``MODEL_CONFIGS``.
    ValueError
        If the cleaned data contains fewer than two target classes.
    """
    merged = df_merge(df1, df2)

    target_processor = TargetProcessor(
        target_type=model_type,
        target_events=MODEL_CONFIGS[model_type]['target_events'],
    )
    # TargetProcessor.transform copies its input itself, so handing it an
    # extra copy would only duplicate the merged frame in memory.
    labelled = target_processor.transform(merged)

    cleaned = df_nancleaner(labelled)

    y = cleaned['target']
    features = cleaned.drop(columns=['target'])

    # Row dropping in the cleaning step can remove every positive (or every
    # negative) example; fail here with a clear message instead of deep
    # inside the estimators.
    if y.nunique() < 2:
        raise ValueError(
            f"Target for model_type={model_type!r} has fewer than two classes after cleaning"
        )

    # NOTE(review): the split is per row. If one session contributes several
    # rows, rows of the same session can land in both train and test --
    # confirm whether a split grouped by session_id is wanted.
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, random_state=42, stratify=y
    )

    pipeline = build_pipeline(model_type)
    pipeline.fit(X_train, y_train)

    return pipeline, X_test, y_test
    
   
 

In [None]:
# Fit the pipeline for the 'lead_conversion' target; X_test / y_test are the
# held-out split returned by train_model and are used by the evaluation cells.
pipeline, X_test, y_test = train_model(df1,df2, model_type='lead_conversion')

In [None]:
# y_proba: column 1 of predict_proba for each held-out row.
# y_pred: the hard class labels for the same rows.
y_proba = pipeline.predict_proba(X_test)[:, 1]
y_pred = pipeline.predict(X_test)

In [None]:
# Hold-out evaluation. precision_recall_curve and auc have to be imported
# here: no other cell brings them into scope, and without them this cell
# raises NameError on a fresh kernel.
from sklearn.metrics import (
    auc,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_auc_score,
)

print(f"Test ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")

# PR-AUC: area under the precision-recall curve, integrated over recall.
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
print(f"PR-AUC: {pr_auc:.3f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:") 
print(confusion_matrix(y_test, y_pred))