In [23]:
from sklearn.pipeline import Pipeline
import pandas as pd
import seaborn as sns
import sys
sys.path.append("../scripts")


from clean_data import DataCleaner
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator,TransformerMixin

In [8]:
# File name of the raw CSV; ReadData looks it up under ../data/.
data_name = "AdSmartABdata.csv"

# Shared cleaning helper from scripts/clean_data.py; CleanData calls its
# drop_unresponsive method.
cleaner = DataCleaner()

In [35]:
class ReadData(BaseEstimator, TransformerMixin):
    """Pipeline entry step that loads the raw CSV into a DataFrame.

    Parameters
    ----------
    file_name : str or None, default None
        Name of the CSV file inside ``data_dir``. When ``None`` the
        notebook-level ``data_name`` variable is used, which is the
        behaviour of a plain ``ReadData()``.
    data_dir : str, default "../data"
        Directory that contains the CSV file.
    """

    def __init__(self, file_name=None, data_dir="../data"):
        # scikit-learn convention: keep constructor arguments unmodified
        # under attributes of the same name so get_params/set_params work.
        self.file_name = file_name
        self.data_dir = data_dir

    def fit(self, X=None, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X=None):
        """Read the configured CSV file; the incoming ``X`` is ignored.

        Returns
        -------
        pandas.DataFrame
            The frame produced by ``pd.read_csv``.
        """
        # Resolve the global lazily so that a later change to ``data_name``
        # is still honoured when no explicit file name was given.
        file_name = data_name if self.file_name is None else self.file_name
        return pd.read_csv(f"{self.data_dir}/{file_name}")

In [36]:
class CleanData():
    """Pipeline step that filters the raw frame and leaves one target column.

    Depends on the notebook-level ``cleaner`` object.
    """

    def fit(self, df, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, df: pd.DataFrame):
        """Filter the rows, drop ``no`` and rename ``yes`` to ``response``.

        The row filtering is delegated to ``cleaner.drop_unresponsive``
        (presumably it removes users who gave no answer - confirm in
        clean_data.py).

        Returns
        -------
        pandas.DataFrame
            A new frame without the ``no`` column, with ``yes`` renamed to
            ``response``.
        """
        responsive_df = cleaner.drop_unresponsive(df)
        without_no = responsive_df.drop(columns=['no'])
        return without_no.rename(columns={'yes': 'response'})

In [37]:
# Step 1 of feature preparation: split the cleaned data into a browser frame and a platform_os frame.

class DataSplit():
    """Pipeline step that derives two frames from the cleaned data.

    One frame keeps ``browser`` (and drops ``platform_os``), the other keeps
    ``platform_os`` (and drops ``browser``).
    """

    def fit(self, df, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, df: pd.DataFrame):
        """Return ``(browser_df, platform_df)`` as independent copies.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame that may contain ``platform_os`` and ``browser`` columns;
            a column that is absent is simply not removed.

        Returns
        -------
        tuple of pandas.DataFrame
            ``browser_df`` holds every column except ``platform_os``;
            ``platform_df`` holds every column except ``browser``.
        """
        # A boolean .loc selection is flagged by pandas as a possible copy of
        # ``df``; assigning columns on it later raises SettingWithCopyWarning.
        # Explicit copies make the two frames safe to modify downstream.
        browser_df = df.loc[:, df.columns != 'platform_os'].copy()
        platform_df = df.loc[:, df.columns != 'browser'].copy()

        return browser_df, platform_df

In [43]:
# Step 2 of feature preparation: label-encode the categorical columns.
class Feature_Encodder():
    """Pipeline step that label-encodes selected columns of both frames."""

    def fit(self, df, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, df: tuple):
        """Label-encode the browser and platform frames.

        Parameters
        ----------
        df : tuple
            ``(browser_df, platform_df)``. The browser frame must contain
            ``date``, ``device_make``, ``browser``, ``experiment`` and
            ``response``; the platform frame the same columns except
            ``browser``.

        Returns
        -------
        tuple of pandas.DataFrame
            Encoded copies ``(browser_df, platform_df)``. The frames passed
            in are left untouched.
        """
        browser_df, platform_df = df

        browser_encoded = self._encode_columns(
            browser_df,
            ['date', 'device_make', 'browser', 'experiment', 'response'],
        )
        platform_encoded = self._encode_columns(
            platform_df,
            ['date', 'device_make', 'experiment', 'response'],
        )

        return browser_encoded, platform_encoded

    @staticmethod
    def _encode_columns(frame, columns):
        """Return a copy of ``frame`` with each listed column label-encoded."""
        # Work on a copy: writing into the caller's frame would silently
        # change upstream data and can raise SettingWithCopyWarning.
        encoded = frame.copy()
        for column in columns:
            # Each column is fitted independently, so the integer codes of
            # one column are unrelated to those of any other column.
            encoded[column] = LabelEncoder().fit_transform(encoded[column])
        return encoded

In [47]:

class Feature_Selection():
    """Pipeline step that removes the ``auction_id`` column from both frames."""

    def fit(self, df, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, df: tuple):
        """Drop ``auction_id`` from the browser and platform frames.

        Parameters
        ----------
        df : tuple
            ``(browser_df, platform_df)``; both must contain ``auction_id``.

        Returns
        -------
        tuple of pandas.DataFrame
            New frames ``(browser_df, platform_df)`` without ``auction_id``.
        """
        first_frame, second_frame = df
        trimmed = [
            frame.drop(columns="auction_id")
            for frame in (first_frame, second_frame)
        ]
        return tuple(trimmed)

In [None]:
class TestSplit():
    """Pipeline step that separates each frame into features and target.

    The last column of a frame is treated as the target; every column before
    it is a feature. No train/test partitioning is performed here.
    """

    def fit(self, df, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, df: tuple):
        """Split both frames positionally into ``(X, y)`` pairs.

        Parameters
        ----------
        df : tuple
            ``(browser_df, platform_df)``.

        Returns
        -------
        tuple
            ``((X_browser, y_browser), (X_platform, y_platform))``. Each
            ``y`` is a one-column DataFrame, not a Series.
        """
        browser_df, platform_df = df

        # Slice-based iloc keeps y two-dimensional (a DataFrame).
        return tuple(
            (frame.iloc[:, :-1], frame.iloc[:, -1:])
            for frame in (browser_df, platform_df)
        )

In [48]:
# Chain the custom steps in execution order: load the CSV, clean it, split it
# into browser / platform_os frames, label-encode, drop auction_id, and
# finally separate features from the target column.
pipline = Pipeline(
    steps=[
        ("loader", ReadData()),
        ("cleaner", CleanData()),
        ("spliter", DataSplit()),
        ("feature_encodder", Feature_Encodder()),
        ("feature_selector", Feature_Selection()),
        ("train_test", TestSplit()),
    ]
)

# ReadData ignores its input, so the pipeline is started with X=None.
# The result is ((X_browser, y_browser), (X_platform, y_platform)); despite
# the variable name, no stratified train/test partition is produced here.
strat_train_set = pipline.fit_transform(X=None)
