In [378]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
pd.options.mode.chained_assignment = None

ModuleNotFoundError: No module named 'category_encoders'

In [6]:
def drop_null_cols(df_, columns_to_check=None, threshold=0):
    '''
    Drop columns whose number of null values exceeds a threshold.

    Note: when {df_} is already a DataFrame it is modified in place and the
    same object is returned, so callers may ignore the return value.

    Parameters:
        df_ : DataFrame or 2-dimensional array
            Non-DataFrame input is wrapped in a new DataFrame first.
        columns_to_check : str or list or array-like, optional
            Column(s) to check for null values. A single column name may be
            passed as a plain string. Default (None or empty) is every column.
        threshold : int or float, optional
            Max number of null values allowed per column when an integer
            (Python or numpy) is given, or max fraction of rows when a float
            in [0.0, 1.0] is given. Note that 1 means "one null" while 1.0
            means "100% of rows". Default is 0.
    Returns:
        df_ : DataFrame
    Raises:
        TypeError : if {threshold} is negative, a float outside [0.0, 1.0],
            or not a number.
    '''
    if not isinstance(df_, pd.DataFrame):
        df_ = pd.DataFrame(df_)

    # A bare string must be treated as ONE column name; list("Age") would
    # otherwise explode it into ['A', 'g', 'e'].
    if columns_to_check is None:
        columns_to_check = []
    elif isinstance(columns_to_check, str):
        columns_to_check = [columns_to_check]
    else:
        columns_to_check = list(columns_to_check)
    if not columns_to_check:
        columns_to_check = list(df_.columns)

    rows_size = df_.shape[0]
    # Translate the threshold into an absolute "max nulls allowed" count.
    # numpy integers are accepted alongside Python ints.
    if isinstance(threshold, (int, np.integer)):
        if threshold < 0:
            raise TypeError(f'{threshold} of wrong type or out of range')
        max_nulls = int(threshold)
    elif threshold <= 1.0 and threshold >= 0.0:
        max_nulls = int(rows_size * threshold)
    else:
        raise TypeError(f'{threshold} of wrong type or out of range')

    # A column is dropped when it holds more nulls than allowed.
    null_counts = df_[columns_to_check].isnull().sum()
    too_sparse = null_counts[null_counts > max_nulls].index
    df_.drop(columns=too_sparse, inplace=True)
    return df_

In [7]:
def drop_null_rows(df_, columns_to_check=None, threshold=0):
    '''
    Drop rows whose number of null values exceeds a threshold.

    Note: when {df_} is already a DataFrame it is modified in place and the
    same object is returned, so callers may ignore the return value.

    Parameters:
        df_ : DataFrame or 2-dimensional array
            Non-DataFrame input is wrapped in a new DataFrame first.
        columns_to_check : str or list or array-like, optional
            Column(s) to check for null values. A single column name may be
            passed as a plain string. Default (None or empty) is every column.
        threshold : int or float, optional
            Max number of null values allowed in {columns_to_check} of every
            row when an integer (Python or numpy) is given, or max fraction
            of the checked columns when a float in [0.0, 1.0] is given. Note
            that 1 means "one null" while 1.0 means "100% of the columns".
            Default is 0.
    Returns:
        df_ : DataFrame
    Raises:
        TypeError : if {threshold} is negative, a float outside [0.0, 1.0],
            or not a number.
    '''
    if not isinstance(df_, pd.DataFrame):
        df_ = pd.DataFrame(df_)

    # A bare string must be treated as ONE column name; list("Age") would
    # otherwise explode it into ['A', 'g', 'e'].
    if columns_to_check is None:
        columns_to_check = []
    elif isinstance(columns_to_check, str):
        columns_to_check = [columns_to_check]
    else:
        columns_to_check = list(columns_to_check)
    if not columns_to_check:
        columns_to_check = list(df_.columns)

    cols_size = len(columns_to_check)
    # Translate the threshold into an absolute "max nulls allowed" count.
    # numpy integers are accepted alongside Python ints.
    if isinstance(threshold, (int, np.integer)):
        if threshold < 0:
            raise TypeError(f'{threshold} of wrong type or out of range')
        max_nulls = int(threshold)
    elif threshold <= 1.0 and threshold >= 0.0:
        max_nulls = int(cols_size * threshold)
    else:
        raise TypeError(f'{threshold} of wrong type or out of range')

    # dropna's `thresh` is the minimum number of NON-null values a row must
    # have in `subset` to be kept, hence the subtraction.
    df_.dropna(axis=0, thresh=cols_size - max_nulls,
               subset=columns_to_check, inplace=True)
    return df_
    

In [374]:
def fill_null(col_, const_=None, col_replace_=None, lin_predict=None, oper_="", mode_=False,
                        range_=False, std_=False, col_math_=None):
    '''
    Fill null values in {col_} with a constant, supplied values, linear
    regression predictions, or a statistic of {col_math_}.

    Exactly one strategy is applied, chosen in this order of precedence:
    {const_}, {col_replace_}, {lin_predict}, {oper_}. If none of them (nor
    {col_math_}) is given, nulls are filled with 0.

    Note: when {col_} is already a Series it is modified in place and the
    same object is returned.

    Parameters:
        col_ : Series or list or array-like
            Column with null values to replace.
        const_ : scalar, optional
            Value to replace null in {col_} with. Falsy values such as 0 are
            honoured.
        col_replace_ : list or array-like or Series, optional
            Values assigned, in order, to the null positions of {col_}; its
            length must match the number of nulls.
        lin_predict : DataFrame, optional
            Frame containing {col_} (looked up by {col_}.name) plus predictor
            columns. A linear regression is trained on the rows with no
            nulls, and nulls in {col_} are replaced by its predictions.
            Rows whose predictors contain nulls cannot be predicted and are
            left null.
        oper_ : str, optional
            Statistic of {col_math_} used as the fill value. One of "mean",
            "median", "mode", "std", "min", "max", "range", "25", "75"
            (case-insensitive).
        mode_, range_, std_ : bool, optional
            Unused; kept so existing calls keep working.
        col_math_ : Series or list or array-like, optional
            Values the {oper_} statistic is computed from. Default is {col_}.
    Returns:
        col_ : Series
    Raises:
        ValueError : if {col_replace_} does not match the number of nulls, or
            {oper_} is not one of the supported operations.
    '''
    if not isinstance(col_, pd.Series):
        col_ = pd.Series(col_)

    # If no other parameters are passed, nulls will be set to 0.
    if (const_ is None and col_replace_ is None and lin_predict is None and
            oper_ == "" and col_math_ is None):
        col_.fillna(0, inplace=True)
        return col_

    # Nulls will be set to const_. Compare against None (not truthiness) so
    # that const_=0 is not silently ignored.
    if const_ is not None:
        col_.fillna(const_, inplace=True)
        return col_

    # Nulls will be set, position by position, to the supplied values.
    if col_replace_ is not None:
        replacements = list(col_replace_)
        null_mask = col_.isnull()
        try:
            col_.loc[null_mask] = replacements
        except ValueError as err:
            raise ValueError(f"Can't set length of {len(replacements)} to index of length {int(null_mask.sum())}") from err
        return col_

    # Nulls will be predicted using linear regression.
    if lin_predict is not None:
        target = col_.name
        predictors = lin_predict.drop(columns=[target])
        complete_rows = lin_predict.dropna()
        linreg = LinearRegression()
        linreg.fit(complete_rows.drop(columns=[target]), complete_rows[target])
        # Only predict rows that need a value and whose predictors are all
        # present; the estimator rejects NaN inputs.
        predictable = lin_predict[target].isnull() & predictors.notnull().all(axis=1)
        to_predict = predictors.loc[predictable]
        if not to_predict.empty:
            # Keep the original row labels so fillna aligns correctly even
            # when the index is not 0..n-1.
            predicted = pd.Series(linreg.predict(to_predict), index=to_predict.index)
            col_.fillna(predicted, inplace=True)
        return col_

    # Nulls will be set to a statistic of col_math_ (col_ itself by default).
    source = col_ if col_math_ is None else col_math_
    if not isinstance(source, pd.Series):
        source = pd.Series(source)

    def _first_mode(values):
        # Series.mode ignores nulls and returns modes sorted ascending; it is
        # empty when every value is null.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    operations = {
        "mean": lambda values: values.mean(),
        "median": lambda values: values.median(),
        "mode": _first_mode,
        "std": lambda values: values.std(),
        "min": lambda values: values.min(),
        "max": lambda values: values.max(),
        "range": lambda values: values.max() - values.min(),
        "25": lambda values: values.quantile(0.25),
        "75": lambda values: values.quantile(0.75),
    }
    operation = operations.get(oper_.lower())
    if operation is None:
        # Fail loudly instead of returning None, which would wipe the column
        # when the result is assigned back.
        raise ValueError(f"Unsupported oper_ {oper_!r}; expected one of {sorted(operations)}")
    col_.fillna(operation(source), inplace=True)
    return col_
    

In [None]:
def category_encode():
    '''
    Placeholder for a categorical-encoding helper; not implemented yet.

    NOTE(review): presumably meant to build on the encoders imported at the
    top of the notebook -- confirm the intended behaviour before filling in.

    Raises:
        NotImplementedError : always, until the function is written.
    '''
    raise NotImplementedError("category_encode is not implemented yet")

In [375]:
# Load the CSV at ../train.csv (path is relative to the notebook's working
# directory) into total_df.
total_df = pd.read_csv('../train.csv')
# Disabled ad-hoc check: summary of the null flags in the "Embarked" column.
#total_df.isnull()["Embarked"].describe()
# Bare last expression so the frame is rendered as the cell output.
total_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [376]:
# Disabled alternatives: drop sparse columns / rows instead of imputing.
#drop_null_cols(total_df, ["Age", "Embarked", "Sex"], threshold=2)
#drop_null_rows(total_df, ["Age", "Embarked", "Sex"], threshold=0)
# Impute missing "Age" values via fill_null's linear-regression option. The
# frame passed as lin_predict holds "Age" itself plus the columns used as
# predictors. The result overwrites total_df["Age"].
total_df["Age"] = fill_null(total_df["Age"], lin_predict=total_df[["Survived", "Pclass", "Age",
                                                                 "SibSp", "Parch", "Fare"]])
# Bare last expression so the updated frame is rendered as the cell output.
total_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,23.301264,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C
