In [1]:
import numpy as np
import pandas as pd
from typing import Optional, List
from sklearn.model_selection import train_test_split

import sklearn.base

In [2]:
data = pd.read_csv('./application_train.csv')

In [3]:
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB


In [5]:
data.shape

(307511, 122)

In [6]:
seed = 24
target_column = "TARGET"

drop_columns = ["SK_ID_CURR", "REGION_POPULATION_RELATIVE", "FLAG_MOBIL", 
                "FLAG_EMP_PHONE", "FLAG_WORK_PHONE", "FLAG_CONT_MOBILE",
                "FLAG_PHONE", "FLAG_EMAIL", "REGION_RATING_CLIENT_W_CITY", 
               "WEEKDAY_APPR_PROCESS_START", "HOUR_APPR_PROCESS_START"]
data = data.drop(drop_columns, axis=1)

test_size = 0.2
data_train, data_test, Y_train, Y_test = train_test_split(
    data[data.columns.drop(target_column)],
    np.array(data[target_column]),
    test_size=test_size,
    random_state=seed)

print(f"Train : {data_train.shape} {Y_train.shape}")
print(f"Test : {data_test.shape} {Y_test.shape}")

Train : (246008, 110) (246008,)
Test : (61503, 110) (61503,)


In [7]:
continuous_columns = [key for key in data.keys() if data[key].dtype in ("int64", "float64")]
categorical_columns = [key for key in data.keys() if data[key].dtype == "object"]

continuous_columns.remove(target_column)

print(f"Continuous : {len(continuous_columns)}, Categorical : {len(categorical_columns)}")

Continuous : 95, Categorical : 15


In [8]:
from sklearn.preprocessing import LabelEncoder
def transform(data, categorical):
    encoder = LabelEncoder()
    for col in categorical:
        data[f"{col}_encoded"] = encoder.fit_transform(data[col])
        data = data.drop(col, axis=1)
    data = data.fillna(data.mean())
    return data

In [9]:
data_train = transform(data_train, categorical_columns)

In [10]:
data_train.shape

(246008, 110)

In [11]:
from sklearn.preprocessing import StandardScaler
columns = list(data_train.columns.values)

scaler = StandardScaler()
data_train = scaler.fit_transform(data_train)
data_train = pd.DataFrame(data_train, columns=columns)

In [12]:
data_train.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,...,NAME_INCOME_TYPE_encoded,NAME_EDUCATION_TYPE_encoded,NAME_FAMILY_STATUS_encoded,NAME_HOUSING_TYPE_encoded,OCCUPATION_TYPE_encoded,ORGANIZATION_TYPE_encoded,FONDKAPREMONT_MODE_encoded,HOUSETYPE_MODE_encoded,WALLSMATERIAL_MODE_encoded,EMERGENCYSTATE_MODE_encoded
0,-0.576526,-0.440886,-0.784426,-0.800841,-0.690606,1.681837,-0.452413,-0.967005,1.076425,0.0,...,0.916101,-1.684039,1.30904,3.890761,0.497173,0.221758,0.635064,0.992259,0.824353,1.049254
1,2.190755,-0.302547,-0.472732,-0.80768,-0.54414,-0.509295,-0.502858,-1.380947,0.824726,0.570824,...,0.916101,0.62515,-0.402993,-0.305655,-0.509984,-1.000189,0.635064,0.992259,0.824353,1.049254
2,-0.576526,-0.389009,-1.274316,-1.519871,-1.252061,-1.802658,2.134355,-1.371571,-1.042487,0.0,...,-0.655852,0.62515,1.30904,-0.305655,1.16861,1.345949,0.635064,-1.015712,-0.361753,-0.960368
3,-0.576526,0.0433,-0.370177,-0.42283,-0.239001,0.433396,-0.457667,-0.642272,-1.46309,0.0,...,0.916101,0.62515,-1.25901,-0.305655,-0.509984,-1.195701,0.635064,0.322935,-0.361753,-0.960368
4,-0.576526,-0.337132,-0.01549,0.226877,-0.239001,1.221837,-0.456527,-0.146507,0.055055,0.0,...,-1.441829,0.62515,-0.402993,3.890761,0.497173,0.612781,0.635064,0.992259,0.824353,1.049254


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

clf = LogisticRegression(max_iter=1000)
clf.fit(data_train, Y_train)

In [26]:
# Prediction
data_test = transform(data_test, categorical_columns)
data_test = scaler.transform(data_test)  

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [15]:
from sklearn.metrics import mean_absolute_error
y_pred = clf.predict_proba(data_test)
score = mean_absolute_error([x[1] for x in y_pred], Y_test)
print(score)

In [16]:
data_res = pd.read_csv('./application_test.csv')
data_res.shape

(48744, 121)

In [17]:
data_res.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


In [18]:
ids = data_res['SK_ID_CURR'].tolist()
drop_columns = ["SK_ID_CURR", "REGION_POPULATION_RELATIVE", "FLAG_MOBIL", 
                "FLAG_EMP_PHONE", "FLAG_WORK_PHONE", "FLAG_CONT_MOBILE",
                "FLAG_PHONE", "FLAG_EMAIL", "REGION_RATING_CLIENT_W_CITY", 
               "WEEKDAY_APPR_PROCESS_START", "HOUR_APPR_PROCESS_START"]
data_res = data_res.drop(drop_columns, axis=1)
data_res = transform(data_res, categorical_columns)
data_res = scaler.transform(data_res)
res_pred = clf.predict_proba(data_res)



In [19]:
print(res_pred[:10])

[[1.  0. ]
 [1.  0. ]
 [0.8 0.2]
 [1.  0. ]
 [0.8 0.2]
 [0.8 0.2]
 [1.  0. ]
 [0.8 0.2]
 [1.  0. ]
 [0.8 0.2]]


In [20]:
lists = [ids, [x[1] for x in res_pred]]
result = pd.concat([pd.Series(x) for x in lists], axis=1)


In [21]:
result.head()

Unnamed: 0,0,1
0,100001,0.0
1,100005,0.0
2,100013,0.2
3,100028,0.0
4,100038,0.2


In [22]:
result.to_csv("submission.csv", sep=',', encoding='utf-8', index=False)

In [23]:
# SK_ID_CURR,TARGET