# Imports
***

In [1]:
# Basic libraries
import collections
import numpy as np
import pandas as pd
import time

# Sklearn
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

#LGBM
from lightgbm import LGBMClassifier

# System
from os.path import join

  from numpy.core.umath_tests import inner1d


# Preprocessing the data
***

## Function definitions

In [2]:
def load_data(path, subsampling=None, seed=17):
    if subsampling:
        data = pd.read_csv(path).sample(n=subsampling, random_state=seed)

    else:
        data = raw_data = pd.read_csv(path)

    return data

In [3]:
def binarize(X:pd.DataFrame, columns):
    """Takes the X dataframe and binarizes given columns (transforms n given, not necessarily binary columns into m 
    binary columns, where m is not smaller than n)
    """

    output = X.copy()
    for col in columns:
        binned_values = pd.get_dummies(output[col], drop_first=True)
        binned_names = [col + '_' + str(x) for x in binned_values.columns]
        binned_values.columns = binned_names
        output = pd.concat([output, binned_values], axis=1)
        output.drop(col, inplace=True, axis=1)

    return output

In [4]:
def encode(X):
    """Takes the X dataframe and transforms values inside it into numeric type
    """
    encoder = LabelEncoder()
    output = X.copy()
    for col in output.columns:
        encoder.fit(output[col])
        output[col] = encoder.transform(output[col])

    return output

In [5]:
def imput(df, by_cols, to_col, model=None):
    """Takes the df dataframe and imputs column given in to_col, based on values from columns given in by_cols. Optionally
    the classifier to be used might be specified, if not KNN with k=100 is used.
    """
    def imput_snippet(X, y, model):
        no_nan_indeces = y.dropna().index
        nan_indeces = y[y.isna()].index

        model.fit(X.iloc[no_nan_indeces],
                y.iloc[no_nan_indeces])

        inputed_values = model.predict(X.iloc[nan_indeces])

        return inputed_values

    output = df.copy().reindex()

    useful_idx = output[by_cols].dropna().index
    to_input = output[to_col].iloc[useful_idx]
    if model is not None:
        clf = model
    else:
        clf = KNeighborsRegressor(n_neighbors=100)

    nan_indeces = output.iloc[useful_idx][output[to_col].iloc[useful_idx].isnull()].index

    output[to_col].iloc[nan_indeces] = imput_snippet(output[by_cols].iloc[useful_idx],
                                        to_input,
                                        clf)

    return output

In [6]:
def age_to_bins(age):
    """Degranulates the age columns into multiplications of 10
    """
    return age%10

In [20]:
def fix_target(data):
    """Fixes the wrongly assigned Target values. Target value (poverty level) should be the same for all habitants in a
    household (denoted by column idhogar), so any row where the Target value is different from household's 
    head (denoted by column parentesco1==1) should be changed to be the same as the household's head's Target value
    """
    def has_head(row):
        same_household = data['idhogar'] == row['idhogar']
        is_head = data['parentesco1'] == 1
        return (same_household & is_head).any()

    def get_true_target(row):
        if pd.isnull(row['Target']):
            return np.nan
        else:
            same_household = data['idhogar'] == row['idhogar']
            is_head = data['parentesco1'] == 1
            head = data[same_household & is_head]
            return int(head['Target'])

    df = data.copy()
    df = df[df.apply(has_head, axis=1)]

    not_head = df['parentesco1'] != 1
    df.set_value(index=not_head[not_head].index,
                 col='Target',
                 value=df[not_head].apply(get_true_target, axis=1))
    return df

In [8]:
def numerate_yes_nos(row):
    """Transform string values 'yes' into 1's (int) and string values 'no' into 0's (int). If other value is given returns 
    float conversion of it.
    """
    if row == 'yes':
        return 1
    elif row == 'no':
        return 0
    else:
        return float(row)

In [9]:
def is_iterable(x):
    """Returns True if the given value is an iterable, but not a string
    """
    if isinstance(x, str):
        return False
    elif isinstance(x, collections.Iterable):
        return True
    else:
        return False

In [10]:
def select_features(X, y, n=None, verbose=0):
    """From the X dataframe returns n columns (features) that give best results in predicting values in y. RFE algorithm is 
    used for feature selection. Verbose paramter controls amount of response given by the algorithm.
    """
    selector = RFE(RandomForestClassifier(n_estimators=150), n_features_to_select=n, verbose=verbose, step=2)
    selector.fit(X, y)

    return X.columns[selector.support_]

In [11]:
def preprocess(directory: str=None, filenames: str=None, to_binarize=True, to_numerate=True, to_imput=True,
               to_select_feats = True):
    """Takes directory and filename of the training data. Return preprocessed data separated into X and y pd.Dataframes
    
    :str directory:
    :str filename:
    :bool to_binarize:
    :bool to_numerate:
    :bool to_imput:
    :bool to_select_feats:
    :return X:pd.Dataframe, y:pd.Dataframe:
    """
    if directory is None:
        directory = ''
    if not is_iterable(filenames):
        names = [filenames]
    else:
        names = filenames

    # Read the csv file
    if is_iterable(filenames):
        data = pd.DataFrame([])
        for name in names:
            tmp = pd.read_csv(name)
            if 'Target' not in tmp.columns:
                tmp['Target'] = np.nan
            data = pd.concat([tmp, data])
    else:
        data = pd.read_csv(join(directory, filenames))
        if 'Target' not in data.columns:
            data['Target'] = np.nan


    # Shuffle the data and reset the index
    data = fix_target(data)
    data = data.sample(frac=1, random_state=17).reset_index()


    # Split the data into dependent and independent vars and ids
    X: pd.DataFrame = data.drop(['Target', 'Id'], axis=1).copy()
    y: pd.Series = data['Target'].copy()
    ids: pd.Series = data['Id']


    # Convert data to numeric, where possible
    if to_numerate is True:
        le = LabelEncoder()
        le.fit(X['idhogar'])

        X.loc[:, 'idhogar'] = le.transform(X['idhogar'])
        X.loc[:, 'edjefa'] = X['edjefa'].apply(numerate_yes_nos)
        X.loc[:, 'edjefe'] = X['edjefe'].apply(numerate_yes_nos)
        X.loc[:, 'dependency'] = X['dependency'].apply(numerate_yes_nos)
        X.loc[:, 'age'] = X['age'].apply(age_to_bins)

        print("Number of columns in X: ", len(X.columns))


    # Imput nan values
    if to_imput is True:
        X.update(imput(X, ['SQBescolari'], 'meaneduc'), overwrite=True)
        X.update(imput(X, ['meaneduc'], 'SQBmeaned'), overwrite=True)
        X.update(imput(X, ['rooms', 'meaneduc','SQBmeaned', 'SQBedjefe'], 'v2a1'), overwrite=True)
        X.update(imput(X, ['agesq', 'SQBage','age'], 'rez_esc', model=KNeighborsClassifier(n_neighbors=40)), overwrite=True)
        X.loc[X.v18q1.isnull(), 'v18q1'] = 0

        print("Number of columns in X: ", len(X.columns))


    # Drop nan columns
    X = X.dropna(axis=1)


    # Drop non numeric columns
    acc_dtypes = [np.int64, np.float64]
    is_numeric = X.dtypes.apply(lambda x: x in acc_dtypes)
    X = X[is_numeric.index[is_numeric]]



    # Binarize the classes
    if to_binarize is True:
        cols_to_binarize = [x for x in X.columns if len(X[x].value_counts()) < 10]
        X = binarize(X, cols_to_binarize)

        print("Number of columns in X: ", len(X.columns))



    # Feature selection
    if to_select_feats is True:
        selected_columns = select_features(X[y.notnull()], y[y.notnull()], n=200)
        X = X[selected_columns]

        print("Number of columns in X: ", len(X.columns))


    return X, y, ids

# Model creation and evaluation
***

## Function definition

In [12]:
def get_model():
    model = LGBMClassifier(n_estimators=800)
    return model

In [18]:
def cv_validate():
    filename = 'train.csv'
    X, y, ids = preprocess(filenames=[filename], to_binarize=True, to_select_feats=False)
    print("Data loaded")
    model = get_model()
    cross_f1 = cross_val_score(model, X, y, scoring='f1_macro', cv=5)
    return cross_f1

In [14]:
def predict():
    train_filename = 'train.csv'
    test_filename = 'test.csv'
    filenames = train_filename
    print("Preprocessing the data")
    # X, y, ids = preprocess(filenames=filenames)
    # test_idx = y[y.isnull()].index
    # train_idx = y[y.notnull()].index
    # X_train = X.iloc[train_idx]
    # y_train = y.iloc[train_idx].astype(int)
    # X_test = X.iloc[test_idx]
    # test_ids = ids.iloc[test_idx]
    X_train, y_train, ids_train = preprocess(filenames=train_filename, to_binarize=False, to_select_feats=False)
    X_test, y_test, ids_test = preprocess(filenames=test_filename, to_binarize=False, to_select_feats=False)
    X_test = X_test[X_train.columns.values]

    head_idx = X_train['parentesco1'] == 1
    reference = y_train[head_idx].copy()
    reference.index = X_train[head_idx]['idhogar'].copy()
    y_train = pd.concat([y_train, X_train], axis=1).apply(lambda x: fix_target(x, reference), axis=1)

    selected_features = select_features(X_train, y_train, 100, verbose=1)
    X_train = X_train[selected_features]
    X_test = X_test[selected_features]

    model = get_model()
    print("Training the model")
    model.fit(X_train, y_train)
    print("Predicting values")
    preds_arr = model.predict(X_test)
    preds = pd.DataFrame({'Id' : ids_test,
                       'Target' : preds_arr})
    print("Prediction done")
    
    return preds

In [15]:
def export_preds(filename=None):
    preds:pd.DataFrame = predict()
    if filename is None:
        filename = 'predictions.csv'
    preds.to_csv(filename, index=False, header=True)
    print("File saved")

In [21]:
if __name__ == '__main__':
    print("Script started on: ", time.asctime())
    score = cv_validate()

Script started on:  Tue Aug 21 15:00:56 2018




Number of columns in X:  142


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Number of columns in X:  142
Number of columns in X:  196


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [24]:
print("Cross validated, macro averaged f1 score of the prediction is {:.2f}%".format(100*np.mean(score)))

Cross validated, macro averaged f1 score of the prediction is 95.88%
