In [76]:
import locale
import pandas as pd
import urllib.request
import zipfile
from pathlib import Path

In [103]:
from feature_engine import encoding, imputation
from sklearn import base, pipeline

In [77]:
# Prefer Spanish regional conventions, but do not abort the whole notebook on
# machines where that locale is not installed: keep the current locale instead.
try:
    locale.setlocale(locale.LC_ALL, "es_ES.UTF-8")
except locale.Error:
    print("Locale 'es_ES.UTF-8' is not available; keeping the current locale.")

# Show every column of wide frames and render floats like 1,234.6.
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:,.1f}".format)

### Set Variables and Paths

*Variables*

In [92]:
# Name of the zip archive and of the CSV member to read from it.
folder_name: str = "kaggle-survey-2018.zip"
member_name: str = "multipleChoiceResponses.csv"

# Download address of the archive (GitHub, mattharrison/datasets).
url: str = "https://github.com/mattharrison/datasets/raw/master/data/" + folder_name

*Paths*

In [93]:
# `Path.parents` is an indexable sequence of ancestor directories, not a single
# `Path` (HOME[0] is the parent of the working directory, HOME[1] its
# grandparent), so it is left unannotated rather than mislabelled as `Path`.
HOME = Path.cwd().parents
# Destination of the downloaded archive: <grandparent of cwd>/data/raw/<folder_name>.
# Despite the variable name, this is the path of the zip file, not of a folder.
data_folder: Path = HOME[1] / "data" / "raw" / folder_name

### Helper Functions

In [82]:
def extract_zip(src: str, dst: str, member_name: str) -> pd.DataFrame:
    """Download a zip archive and load one of its CSV members as a DataFrame.

    The archive is fetched from ``src`` and written to ``dst`` (overwriting
    any existing file); ``member_name`` is then parsed with
    ``pandas.read_csv``. The first parsed row is dropped from the result --
    the previous implementation labelled it ``kag_questions``, so it is
    presumably the row holding the survey question text (confirm against
    the data).

    Args:
        src: str
            URL of the zip file to be downloaded.
        dst: Path, str
            Local file path where the zip file will be written. Missing
            parent directories are created.
        member_name: str
            Name of the member file inside the zip file
            to be read into a DataFrame.

    Returns:
        pandas.DataFrame: every row of the member file except the first
        one, keeping the original index labels (the index starts at 1).

    Raises:
        urllib.error.URLError: if ``src`` cannot be opened.
        KeyError: if ``member_name`` is not present in the archive.
    """
    target = Path(dst)
    # A fresh checkout may not contain the destination directory yet.
    target.parent.mkdir(parents=True, exist_ok=True)

    # The context manager closes the response even if reading fails.
    with urllib.request.urlopen(src) as response:
        payload = response.read()
    target.write_bytes(payload)

    with zipfile.ZipFile(target) as archive, archive.open(member_name) as member:
        kag = pd.read_csv(member, low_memory=False)

    # Skip the first data row (see docstring) and return only the remaining rows.
    return kag.iloc[1:]

In [97]:
def tweak_kag(df_: pd.DataFrame) -> pd.DataFrame:
    """
    Tweak the Kaggle survey data and return a new DataFrame.

    This function takes a pandas DataFrame containing Kaggle survey data
    and returns a new DataFrame with derived columns, normalised column
    names (spaces replaced by underscores) and a fixed subset of columns.

    Derived columns:
        age: first two characters of ``Q2`` as an integer.
        education: ``Q4`` mapped to a number per answer (12-20);
            "I prefer not to answer" becomes missing.
        major: ``Q5`` reduced to its 3 most frequent values via ``top_n``
            (everything else becomes ``top_n``'s default), with the three
            known majors abbreviated to ``cs`` / ``eng`` / ``stat``.
        years_exp: lower bound of the ``Q8`` range as a float.
        compensation: lower bound of the ``Q9`` range multiplied by 1,000;
            missing or undisclosed compensation becomes 0.
        python, r, sql: ``Q16_Part_1`` / ``_2`` / ``_3`` with missing
            values set to 0 and the language name set to 1.

    Args:
        df_: pd.DataFrame
            The input DataFrame containing Kaggle survey data. Must provide
            the columns Q1, Q2, Q3, Q4, Q5, Q8, Q9 and Q16_Part_1..3.

    Returns:
        pd.DataFrame
            A new DataFrame with the columns Q1, Q3, age, education, major,
            years_exp, compensation, python, r and sql.
    """
    education_years = {
        "Master's degree": 18,
        "Bachelor's degree": 16,
        "Doctoral Degree": 20,
        # NOTE(review): the export may spell this with a lower-case "degree";
        # both spellings are accepted -- confirm against the data.
        "Doctoral degree": 20,
        "Some college/university study without earning a bachelor's degree": 13,
        "Professional degree": 19,
        "I prefer not to answer": None,
        "No formal education past high school": 12,
    }
    major_codes = {
        "Computer science (software engineering, etc.)": "cs",
        "Engineering (non-computer focused)": "eng",
        "Mathematics or statistics": "stat",
    }
    selected = [
        "Q1", "Q3", "age", "education", "major",
        "years_exp", "compensation", "python", "r", "sql",
    ]
    return (
        df_
        .assign(
            age=df_.Q2.str.slice(0, 2).astype(int),
            # NOTE(review): the answers may contain a typographic apostrophe
            # (U+2019); normalise it so the straight-quote keys match -- confirm.
            education=(
                df_.Q4
                .str.replace("\u2019", "'", regex=False)
                .replace(education_years)
            ),
            major=df_.Q5.pipe(top_n, n=3).replace(major_codes),
            years_exp=(
                df_.Q8.str.replace("+", "", regex=False)
                .str.split("-", expand=True)
                .iloc[:, 0]  # lower bound of the range
                .astype(float)
            ),
            compensation=(
                df_.Q9.str.replace("+", "", regex=False)
                .str.replace(",", "", regex=False)
                # Every other bucket keeps thousands only in its upper bound;
                # bring the open-ended top bucket to the same scale.
                .str.replace("500000", "500", regex=False)
                .str.replace(
                    "I do not wish to disclose my approximate yearly compensation",
                    "0",
                    regex=False,
                )
                .str.split("-", expand=True)
                .iloc[:, 0]  # lower bound of the range
                .fillna(0)
                .astype(int)
                .mul(1_000)
            ),
            python=df_.Q16_Part_1.fillna(0).replace("Python", 1),
            # Accept the upper-case language name as well as the lower-case
            # literal the previous code looked for.
            r=df_.Q16_Part_2.fillna(0).replace({"R": 1, "r": 1}),
            sql=df_.Q16_Part_3.fillna(0).replace("SQL", 1),
        )
        .rename(columns=lambda col: col.replace(" ", "_"))
        .loc[:, selected]
    )


In [99]:
def top_n(ser: pd.Series, n: int = 5, default: str = "other") -> pd.Series:
    """
    Replace all values in a pandas Series that are not among
    the top `n` most frequent values with a default value.

    The top `n` most frequent values are determined with the
    `value_counts` method of the input Series. Missing values are not
    counted by `value_counts`, so they are replaced by `default` too.

    Args:
        ser: pd.Series
            The input Series.
        n: int, optional, default `5`
            The number of most frequent values to keep.
        default: str, optional, default `other`
            The value used for entries that are not among
            the top `n` most frequent values.

    Returns:
        pd.Series
            A new Series with the infrequent values replaced.
    """
    counts = ser.value_counts()
    # `default` is the replacement argument of `where`, not an argument of `isin`.
    return ser.where(ser.isin(counts.index[:n]), default)

In [109]:
def get_rawX_y(
    df: pd.DataFrame,
    y_col: str,
    countries=("United States of America", "China", "India"),
    roles=("Data Scientist", "Software Engineer"),
) -> tuple[pd.DataFrame, pd.Series]:
    """
    Filter the survey rows and split them into features and target.

    Only rows whose `Q3` value is in `countries` and whose `Q6` value is
    in `roles` are kept.

    Args:
        df: pd.DataFrame
            Survey data containing the columns `Q3`, `Q6` and `y_col`.
        y_col: str
            Name of the column to return as the target.
        countries: iterable of str, optional
            Accepted `Q3` values.
        roles: iterable of str, optional
            Accepted `Q6` values.

    Returns:
        tuple of (pd.DataFrame, pd.Series)
            The filtered frame without `y_col`, and the filtered `y_col`.
    """
    # Boolean masks instead of a `query` string: nothing to mis-quote and no
    # dependency on the expression engine.
    keep = df["Q3"].isin(list(countries)) & df["Q6"].isin(list(roles))
    raw = df.loc[keep]
    return raw.drop(columns=[y_col]), raw[y_col]

### Classes

In [111]:
class TweakKagTransformer(base.BaseEstimator, base.TransformerMixin):
    """
    Scikit-learn style wrapper around ``tweak_kag``.

    Lets the survey clean-up step be used as a stage of a pipeline:
    ``fit`` learns nothing and ``transform`` returns ``tweak_kag(X)``.

    Args:
        ycol: str, optional
            Name of the target column. It is only stored on the instance;
            neither ``fit`` nor ``transform`` reads it in this class.

    Attributes:
        ycol: str
            The value passed to the constructor.
    """

    def __init__(self, ycol=None):
        self.ycol = ycol

    def fit(self, X, y=None):
        """Stateless fit: ignore ``X`` and ``y`` and return the transformer."""
        return self

    def transform(self, X):
        """Return the tweaked version of ``X`` produced by ``tweak_kag``."""
        tweaked = tweak_kag(X)
        return tweaked

#### Datasets

In [90]:
# Download the archive to `data_folder` and load the selected member.
raw = extract_zip(src=url, dst=data_folder, member_name=member_name)

#### Create a pipeline

In [112]:
kag_pl = pipeline.Pipeline(
    steps=[
        # 1. Reshape the raw survey frame.
        ("tweak", TweakKagTransformer()),
        # 2. One-hot encode the categorical columns.
        #    NOTE(review): how `top_categories` and `drop_last` interact is
        #    defined by feature_engine -- confirm in its documentation.
        (
            "cat",
            encoding.OneHotEncoder(
                variables=["Q1", "Q3", "major"],
                top_categories=5,
                drop_last=True,
            ),
        ),
        # 3. Impute the numeric columns using the "median" method.
        (
            "num_impute",
            imputation.MeanMedianImputer(
                variables=["education", "years_exp"],
                imputation_method="median",
            ),
        ),
    ]
)