In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from lightgbm import LGBMRegressor
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
#from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.manifold import TSNE
import gc
from optuna.integration.lightgbm import LightGBMTunerCV
#from sklearn.model_selection import KFold

In [2]:
def imputation(df):
    """Fill missing values in every column of ``df`` and return ``df``.

    Numeric and datetime columns are filled with the column mean; every other
    column (object, string, categorical, ...) is filled with its most frequent
    value. Columns that contain no usable value at all are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col in df:
        series = df[col]
        if pd.api.types.is_numeric_dtype(series) or pd.api.types.is_datetime64_any_dtype(series):
            fill_value = series.mean()
        else:
            modes = series.mode()
            # An all-missing column has no mode; indexing it would raise IndexError.
            fill_value = modes.iloc[0] if not modes.empty else np.nan
        if pd.isna(fill_value):
            # Nothing meaningful to fill with (column is entirely missing).
            continue
        # Assign the filled column back rather than calling an in-place method on
        # ``df[col]``: that selection may be a temporary object, in which case an
        # in-place fill never reaches ``df``.
        df[col] = series.fillna(fill_value)
    print("Imputaion Done")
    return df

def label_encoder(df):
    """Integer-encode every object-dtype column of ``df`` in place.

    Each object column is cast to ``str`` and then replaced by the codes a
    ``LabelEncoder`` assigns to its values. Nothing is returned; the number of
    encoded columns is printed.

    Based on https://www.geeksforgeeks.org/ml-one-hot-encoding-of-datasets-in-python/?ref=rp

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object columns are overwritten with integer codes.
    """
    encoder = LabelEncoder()
    object_columns = [name for name in df if df[name].dtype == 'object']
    # To restrict encoding to (near-)binary columns, additionally filter on
    # len(df[name].unique()) <= 2 here.
    for name in object_columns:
        as_text = df[name].astype(str)
        df[name] = encoder.fit(as_text).transform(as_text)
    print('%d columns were label encoded.' % len(object_columns))

def Remove_anomaly(name,lst,typ):
    """Replace sign anomalies in the listed columns of ``name`` with NaN, in place.

    Parameters
    ----------
    name : pandas.DataFrame
        Frame to clean; the listed columns are overwritten.
    lst : iterable
        Column labels to inspect. Every listed column is processed.
    typ : int
        ``1`` treats negative values as anomalies; any other value treats
        positive values as anomalies.

    Returns
    -------
    None
        For each column, ``Total anomalies =<count>`` is printed.
    """
    for var in lst:
        column = name[var]
        anomalous = column < 0 if typ == 1 else column > 0
        count = int(anomalous.sum())
        # A boolean mask addresses rows by their actual index labels, so this is
        # correct for any index (a running position counter is only valid for a
        # default 0..n-1 index). mask() also upcasts integer columns so they can
        # hold NaN.
        name[var] = column.mask(anomalous)
        print('Total anomalies ='+str(count))
        #Plot_hist(temp_data,var)

def clean_features(d_file,no_unique= 0,percentage= 100):
    """Drop columns that are dominated by a handful of values.

    For each column, the share (in percent of non-null rows) covered by its
    ``no_unique`` most frequent values is computed. The column is kept only if
    that share is strictly below ``percentage``.

    Parameters
    ----------
    d_file : pandas.DataFrame
        Input frame; it is not modified.
    no_unique : int, default 0
        How many of the most frequent values to add up. With 0 the share is 0,
        so every column is kept whenever ``percentage`` is positive.
    percentage : float, default 100
        Maximum combined share (exclusive) the top values may have.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the columns that passed the test, in their
        original order. The number of dropped columns is printed.
    """
    n_before = len(d_file.columns)

    kept = []
    for col in d_file:
        # value_counts() is sorted by frequency, so head() yields the top values.
        shares = (d_file[col].value_counts()/d_file[col].count())*100
        if shares.head(no_unique).sum() < percentage:
            kept.append(col)

    # Work from the frame that was passed in (not from a notebook-level global)
    # and name the axis via ``columns=``: pandas >= 2.0 rejects a positional axis.
    dropped = [col for col in d_file.columns if col not in kept]
    d_file = d_file.drop(columns=dropped)

    print(n_before-len(d_file.columns),"Features dropped")
    return d_file
    

def treat_iqr(d_file):
    """Cap extreme values of every numeric column except ``'TARGET'``, in place.

    Values below ``Q1 - 3*IQR`` are raised to that fence and values above
    ``Q3 + 3*IQR`` are lowered to that fence, where Q1/Q3 are the 25th/75th
    percentiles and ``IQR = Q3 - Q1``. Non-numeric and boolean columns are
    skipped. Missing values stay missing.

    Parameters
    ----------
    d_file : pandas.DataFrame
        Frame to treat; affected columns are overwritten.

    Returns
    -------
    None
        The skew before/after is printed per treated column, followed by the
        number of columns in which at least one value was capped.
    """
    count = 0
    for col in d_file:
        if col == 'TARGET':
            continue
        values = d_file[col]
        if not pd.api.types.is_numeric_dtype(values) or pd.api.types.is_bool_dtype(values):
            # Quantiles and skew are undefined for these columns.
            continue

        skew_prev = values.skew()

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3-q1

        # The lower fence hangs off the 25th percentile and the upper fence off
        # the 75th, so that only values far outside the bulk are capped.
        lower = q1-3*iqr
        upper = q3+3*iqr

        if ((values < lower) | (values > upper)).any():
            count += 1
            d_file[col] = values.clip(lower=lower, upper=upper)

        skew_after = d_file[col].skew()

        print(col,"Skew_prev:",skew_prev,"Skew_after:",skew_after)

    print("Outliers changed in:",count,"columns")

def reduce_memory(df):
    """Shrink ``df`` by casting each non-object column to a narrower dtype.

    Columns whose dtype name starts with ``int`` are cast to the first of
    int8/int16/int32/int64 whose range strictly contains the column's min and
    max. All other non-object columns are cast to float16 or float32 when the
    values fit, and to float64 otherwise. The frame is modified in place and
    returned; memory usage before and after is printed.

    NOTE(review): float16 carries very little precision, so values stored in
    such columns get rounded — confirm this is acceptable for the features.
    """
    bytes_per_mb = 1024 ** 2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Initial df memory usage is {:.2f} MB for {} columns'
          .format(start_mem, len(df.columns)))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for column in df.columns:
        dtype = df[column].dtypes
        if dtype != object:
            lowest = df[column].min()
            highest = df[column].max()
            if str(dtype).startswith('int'):
                # Unsigned types could be used here as well.
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if lowest > limits.min and highest < limits.max:
                        df[column] = df[column].astype(candidate)
                        break
            else:
                for candidate in float_candidates:
                    limits = np.finfo(candidate)
                    if lowest > limits.min and highest < limits.max:
                        df[column] = df[column].astype(candidate)
                        break
                else:
                    # Neither narrow float type can hold the range.
                    df[column] = df[column].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    memory_reduction = 100 * (start_mem - end_mem) / start_mem
    print('Final memory usage is: {:.2f} MB - decreased by {:.1f}%'.format(end_mem, memory_reduction))
    return df

def one_hot_encoder(df, categorical_columns=None, nan_as_category=True):
    """One-hot encode categorical columns with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    categorical_columns : list, optional
        Columns to encode. When empty or None, every object-dtype column whose
        name does not contain ``"SK_ID"`` is encoded.
    nan_as_category : bool, default True
        Forwarded to ``get_dummies`` as ``dummy_na``.

    Returns
    -------
    tuple
        ``(encoded_frame, new_dummy_columns, untouched_original_columns)``.
    """
    columns_before = list(df.columns)
    if not categorical_columns:
        categorical_columns = []
        for name in df.columns:
            if df[name].dtype == 'object' and "SK_ID" not in name:
                categorical_columns.append(name)
    untouched = [name for name in columns_before if name not in categorical_columns]
    encoded = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    # Anything that was not there before (and kept) must be a freshly made dummy.
    dummy_columns = [name for name in encoded.columns if name not in untouched]
    return encoded, dummy_columns, untouched

In [3]:
# Load the feature table used for modelling. kfold_lightgbm later treats rows
# with a non-null TARGET as training data and rows with a null TARGET as test data.
df=pd.read_csv('../input/projectdf/DF_FE.csv',sep=',',low_memory=False)

In [4]:
# reduce_memory assigns the downcast columns back into the frame it receives and
# returns that same frame, so df is already reduced without capturing the result;
# as the last expression of the cell, the returned frame is also displayed.
reduce_memory(df)

Initial df memory usage is 1299.75 MB for 554 columns
Final memory usage is: 306.17 MB - decreased by 76.4%


Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,AMT_GOODS_PRICE_x,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan
0,0.0,0,360000.0,1125000.0,33025.5,1125000.0,0.022797,51.875000,20.171875,6.441406,...,0,0,0,0,0,0,1,0,0,1
1,0.0,0,112500.0,251280.0,13630.5,180000.0,0.022629,38.031250,3.814453,15.601562,...,0,0,0,0,0,0,1,0,0,1
2,0.0,0,225000.0,544491.0,15916.5,454500.0,0.035797,54.281250,5.679688,5.808594,...,0,0,0,1,0,0,0,1,0,0
3,0.0,2,211500.0,900000.0,26316.0,900000.0,0.006207,35.687500,6.035156,3.265625,...,0,0,0,0,0,0,1,0,0,1
4,1.0,0,90000.0,113760.0,8406.0,90000.0,0.020706,50.343750,4.636719,22.671875,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,,1,202500.0,835380.0,40320.0,675000.0,0.035797,35.812500,5.082031,18.875000,...,0,0,0,0,0,0,1,0,0,1
307507,,0,450000.0,1800000.0,56520.0,1800000.0,0.002506,40.000000,20.093750,23.671875,...,0,0,0,0,0,0,0,1,0,0
307508,,1,112500.0,301095.0,23773.5,279000.0,0.019104,42.750000,5.933594,0.325684,...,0,0,0,0,0,0,1,0,0,1
307509,,0,94500.0,180000.0,9000.0,180000.0,0.011703,22.453125,1.686523,22.437500,...,0,0,0,0,1,0,0,1,0,0


In [5]:
def kfold_lightgbm(df, num_folds, stratified = False, debug= False):
    """Fit a LightGBM model on each cross-validation fold and return the last one.

    Rows of ``df`` with a non-null ``TARGET`` form the training set; rows with a
    null ``TARGET`` are only counted for the shape message. For every fold a
    fresh model is fitted with early stopping on the held-out part and its
    validation AUC is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Combined labelled/unlabelled feature table containing a ``TARGET`` column.
    num_folds : int
        Number of cross-validation splits.
    stratified : bool, default False
        Use ``StratifiedKFold`` instead of ``KFold``.
    debug : bool, default False
        Accepted for compatibility with existing calls; not used.

    Returns
    -------
    LGBMRegressor
        The model fitted on the LAST fold only. It has seen ``num_folds - 1``
        parts of the labelled rows, so scoring it on those rows is optimistic.
    """
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del test_df

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)

    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    # Slice features/labels once instead of re-selecting the columns for every fold.
    features, target = train_df[feats], train_df['TARGET']
    del train_df
    gc.collect()

    seed = 0
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(features, target)):
        train_x, train_y = features.iloc[train_idx], target.iloc[train_idx]
        valid_x, valid_y = features.iloc[valid_idx], target.iloc[valid_idx]

        clf=LGBMRegressor(objective = 'binary',
                            metric = 'binary_logloss',
                            verbosity = -1,
                            boosting_type = 'gbdt',
                            feature_pre_filter = False,
                            lambda_l1 = 7.497814244329271,
                            lambda_l2 = 0.22692154687765595,
                            num_leaves = 5,
                            feature_fraction = 0.4,
                            bagging_fraction = 1.0,
                            bagging_freq = 0,
                            min_child_samples = 5,
                            learning_rate=0.01,
                            seed = seed,
                            n_estimators = 100000
                          )

        # NOTE(review): ``verbose`` / ``early_stopping_rounds`` are fit() keywords of
        # older LightGBM releases; newer releases expect callbacks instead — confirm
        # against the installed version before upgrading.
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric= 'auc', verbose= 200, early_stopping_rounds= 1000)

        # With the binary objective the regressor's predict() is expected to return
        # probabilities (it has no predict_proba), so it can be scored with AUC directly.
        fold_auc = roc_auc_score(valid_y, clf.predict(valid_x))
        print('Fold %2d Seed %i AUC : %.6f' % (n_fold + 1, seed, fold_auc))

        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return clf

In [6]:
import re
# Strip every character that is not a letter, digit or underscore from the column names.
# NOTE(review): presumably required because LightGBM rejects special characters in
# feature names — confirm; also verify no two columns collapse to the same name.
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [7]:
# Despite its name, feat_importance receives the fitted model that kfold_lightgbm
# returns; the cells below call .predict() on it.
feat_importance = kfold_lightgbm(df, num_folds= 5, stratified= False, debug= False)

Starting LightGBM. Train shape: (199882, 554), test shape: (107629, 554)
Training until validation scores don't improve for 1000 rounds
[200]	training's auc: 0.744018	training's binary_logloss: 0.258037	valid_1's auc: 0.737067	valid_1's binary_logloss: 0.259596
[400]	training's auc: 0.755391	training's binary_logloss: 0.250992	valid_1's auc: 0.749417	valid_1's binary_logloss: 0.252727
[600]	training's auc: 0.764224	training's binary_logloss: 0.247344	valid_1's auc: 0.757814	valid_1's binary_logloss: 0.249332
[800]	training's auc: 0.77045	training's binary_logloss: 0.244948	valid_1's auc: 0.763067	valid_1's binary_logloss: 0.247306
[1000]	training's auc: 0.775008	training's binary_logloss: 0.243171	valid_1's auc: 0.766616	valid_1's binary_logloss: 0.245912
[1200]	training's auc: 0.778516	training's binary_logloss: 0.241798	valid_1's auc: 0.769109	valid_1's binary_logloss: 0.244924
[1400]	training's auc: 0.781448	training's binary_logloss: 0.240681	valid_1's auc: 0.771082	valid_1's binar

In [8]:
# Raw application test file; only its SK_ID_CURR column is used further down,
# to label the rows of the submission.
Test_file = pd.read_csv('../input/iiitb2020-home-credit-default-risk/application_test.csv',sep = ',', low_memory = False)
print("Test_file Imported")

Test_file Imported


In [9]:
# Number of labelled rows, derived from the data instead of a hard-coded count.
# The positional slices below assume the labelled rows come first in df.
ln = int(df['TARGET'].notnull().sum())
sample = int(ln*3/10)
ln1 = ln-sample

# Column 0 is taken as the label, every other column as a feature.
# NOTE(review): all of these rows were available to kfold_lightgbm for fitting,
# so the "test" slice is not an independent hold-out.
X_train = df.iloc[:ln1,1:]
y_train = df.iloc[:ln1,:1]

X_test = df.iloc[ln1:ln1+sample,1:]
y_test = df.iloc[ln1:ln1+sample,:1]

In [10]:
train_pred = feat_importance.predict(X_train)
test_pred = feat_importance.predict(X_test)
# roc_auc_score yields the area under the ROC curve (scaled to percent here),
# not classification accuracy, so the labels say AUC.
print("Train AUC:",roc_auc_score(y_train,train_pred)*100)
print("Test AUC:",roc_auc_score(y_test,test_pred)*100)
gc.collect()

Train Accuracy: 83.4571843320211
Test Accuracy: 83.70225057377922


4

In [11]:
# Number of labelled rows, derived from the data instead of a hard-coded count;
# everything after them is treated as the unlabelled test set.
ln = int(df['TARGET'].notnull().sum())
# Skip column 0 so the label column is not passed to the model
# (assumes TARGET is the first column, as in the train/hold-out slicing).
test_df = df.iloc[ln:,1:]
# Predictions for the submission file.
sub_preds = feat_importance.predict(test_df)

In [12]:
# Display the predictions computed for the unlabelled rows.
sub_preds

array([0.02444   , 0.09798999, 0.09297215, ..., 0.03462825, 0.07172402,
       0.08201496])

In [13]:
# Keep only the id column. Selecting it by name avoids passing ``axis``
# positionally to DataFrame.drop, which pandas >= 2.0 rejects.
ID = Test_file[["SK_ID_CURR"]]

# Ids and predictions are paired purely by row order, so their lengths must agree.
assert len(ID) == len(sub_preds), "number of predictions does not match number of test ids"

submission = pd.DataFrame({'SK_ID_CURR': ID['SK_ID_CURR'], 'TARGET' : sub_preds})
submission.to_csv('submission.csv',index=False)

In [14]:
# Summary statistics of the submission frame, as a quick sanity check.
submission.describe()

Unnamed: 0,TARGET
count,107629.0
mean,0.080929
std,0.09273
min,0.001121
25%,0.024221
50%,0.047665
75%,0.098919
max,0.898557
