In [1]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import lightgbm as lgb
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

warnings.filterwarnings('ignore')

In [2]:
# Settings
# Columns that must be read as strings (used as the dtype mapping for pd.read_csv).
DICT_DTYPE = {'LOAN_ID': 'str', 'IMAGE_ID': 'str'}
# Project root that the relative paths in this notebook (e.g. 'data/train.csv') resolve against.
# Override it with the KIVA_PROJECT_DIR environment variable (e.g. '/work/') instead of
# editing this cell; the original local path is kept as the fallback.
PROJECT_DIR = os.environ.get(
    'KIVA_PROJECT_DIR',
    '/Users/kinoshitashouhei/Desktop/competitions/05_Prob_Space/Kiva/',
)
os.chdir(PROJECT_DIR)

In [3]:
# 自作関数のインポート
from functions.common import *
from functions import preprocessing

In [4]:
# Load the training and test sets; DICT_DTYPE is passed as the column dtype mapping.
df_train, df_test = (
    pd.read_csv(csv_path, dtype=DICT_DTYPE)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [5]:
# Preview the first five rows of the training data.
df_train.head()

Unnamed: 0,LOAN_ID,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,LOAN_AMOUNT,IMAGE_ID,ACTIVITY_NAME,SECTOR_NAME,LOAN_USE,COUNTRY_CODE,COUNTRY_NAME,TOWN_NAME,CURRENCY_POLICY,CURRENCY_EXCHANGE_COVERAGE_RATE,CURRENCY,TAGS,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL
0,1733169,English,Teodora is a 50-year-old married woman from th...,Teodora is a 50-year-old married woman from th...,100,3115271,Weaving,Arts,"to purchase materials like nipa palm, bamboo ...",PH,Philippines,"Maribojoc, Bohol",shared,0.1,PHP,#Elderly,monthly,field_partner
1,1546998,English,Diego is 32 years old and lives in the municip...,Diego is 32 years old and lives in the municip...,1350,2870403,Barber Shop,Services,"to buy two hair clippers, a new barber chair, ...",CO,Colombia,Apartadó,shared,0.1,COP,"user_favorite, user_favorite",monthly,field_partner
2,1808517,Spanish,"Osman, es un joven de 27 años de edad, soltero...","Osman is a young man, 27 years old, single, an...",225,3215705,Farming,Agriculture,to purchase sacks of fertilizers to care for a...,HN,Honduras,"Nueva Frontera, Santa Barbara.",shared,0.1,HNL,,bullet,field_partner
3,1452940,English,"His name is Nino, 31 years old, married to Che...","His name is Nino, 31 years old, married to Che...",350,2745031,Motorcycle Transport,Transportation,"to pay for fuel, tires and change oil for his ...",PH,Philippines,"Silang, Cavite",shared,0.1,PHP,user_favorite,monthly,field_partner
4,1778420,English,"Pictured above is Teresa, often described as a...","Pictured above is Teresa, often described as a...",625,3083800,Farming,Agriculture,to purchase hybrid seeds and fertilizer to imp...,KE,Kenya,Mumias,shared,0.1,KES,"#Eco-friendly, #Sustainable Ag, #Parent, #Elde...",bullet,field_partner


In [6]:
# Preview the first five rows of the test data.
df_test.head()

Unnamed: 0,LOAN_ID,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,IMAGE_ID,ACTIVITY_NAME,SECTOR_NAME,LOAN_USE,COUNTRY_CODE,COUNTRY_NAME,TOWN_NAME,CURRENCY_POLICY,CURRENCY_EXCHANGE_COVERAGE_RATE,CURRENCY,TAGS,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL
0,2041445,English,Marcela is 69 years old and married with ten c...,Marcela is 69 years old and married with ten c...,4051101,General Store,Retail,to buy items to sell like canned goods and per...,PH,Philippines,"Cauayan, Negros Occidental",standard,,PHP,,monthly,field_partner
1,1944435,English,Roselia is 48 years old and has five children....,Roselia is 48 years old and has five children....,3410523,Pigs,Agriculture,to buy feeds and other supplies to raise her pig,PH,Philippines,"Guihulngan, Negros Oriental",standard,,PHP,"#Animals, #Repeat Borrower, #Schooling, #Woman...",monthly,field_partner
2,2083354,English,"Ma. Marebil is a single woman, 40 years old wi...","Ma. Marebil is a single woman, 40 years old wi...",4146690,Clothing Sales,Clothing,to buy additional stock of clothes and dresses...,PH,Philippines,"Santa Barbara, Iloilo",standard,,PHP,"#Parent, #Single Parent, #Woman-Owned Business",monthly,field_partner
3,1993565,English,"Good day, lenders! Meet one of KBMI’s clients,...","Good day, lenders! Meet one of KBMI’s clients,...",3945982,Food,Food,to buy more foods to grow her business.,ID,Indonesia,Pandeglang,shared,0.1,IDR,"#Woman-Owned Business, #Schooling, #Elderly, #...",monthly,field_partner
4,2064272,English,Rosemarie is a married woman with two children...,Rosemarie is a married woman with two children...,4114040,Food,Food,to buy ingredients for her food production bus...,PH,Philippines,Sogod Cebu,standard,,PHP,,monthly,field_partner


In [7]:
# Check the (rows, columns) shape of both data sets.
print(
    f'df_train size : {df_train.shape}',
    f'df_test size : {df_test.shape}',
    sep='\n',
)

df_train size : (91333, 18)
df_test size : (91822, 17)


In [8]:
# Check the column dtypes of both data sets, separated by a 50-character rule.
print(
    f'df_train dtypes\n{df_train.dtypes}',
    '='*50,
    f'df_test dtypes\n{df_test.dtypes}',
    sep='\n',
)

df_train dtypes
LOAN_ID                             object
ORIGINAL_LANGUAGE                   object
DESCRIPTION                         object
DESCRIPTION_TRANSLATED              object
LOAN_AMOUNT                          int64
IMAGE_ID                            object
ACTIVITY_NAME                       object
SECTOR_NAME                         object
LOAN_USE                            object
COUNTRY_CODE                        object
COUNTRY_NAME                        object
TOWN_NAME                           object
CURRENCY_POLICY                     object
CURRENCY_EXCHANGE_COVERAGE_RATE    float64
CURRENCY                            object
TAGS                                object
REPAYMENT_INTERVAL                  object
DISTRIBUTION_MODEL                  object
dtype: object
df_test dtypes
LOAN_ID                             object
ORIGINAL_LANGUAGE                   object
DESCRIPTION                         object
DESCRIPTION_TRANSLATED              object
IMAGE_ID 

In [29]:
# Count missing values per column, with train and test side by side.
# Columns present in only one frame (LOAN_AMOUNT exists only in train) get NaN on the
# other side, which is why n_null_test is displayed as float.
pd.concat(
    [
        df_train.isnull().sum().to_frame('n_null_train'),
        df_test.isnull().sum().to_frame('n_null_test'),
    ],
    axis=1,
)

Unnamed: 0,n_null_train,n_null_test
LOAN_ID,0,0.0
ORIGINAL_LANGUAGE,0,0.0
DESCRIPTION,0,0.0
DESCRIPTION_TRANSLATED,0,0.0
LOAN_AMOUNT,0,
IMAGE_ID,0,0.0
ACTIVITY_NAME,0,0.0
SECTOR_NAME,0,0.0
LOAN_USE,0,0.0
COUNTRY_CODE,0,0.0


In [7]:
# Fill missing values with the project helper and rebind df_train to its result.
# Only df_train is processed here; df_test is left untouched by this cell.
# NOTE(review): presumably the helper targets the DESCRIPTION_TRANSLATED column — confirm in functions/preprocessing.
df_train = df_train.pipe(preprocessing.fill_na_DESCRIPTION_TRANSLATED)

In [8]:
# min / max / median / mean of the loan amount for each currency-exchange coverage rate.
# groupby drops rows whose key is NaN by default, so loans without a coverage rate are not shown.
(
    df_train
    .groupby(COL_CURRENCY_EXCHANGE_COVERAGE_RATE)
    .agg({COL_LOAN_AMOUNT: ['min', 'max', 'median', 'mean']})
)

Unnamed: 0_level_0,LOAN_AMOUNT,LOAN_AMOUNT,LOAN_AMOUNT,LOAN_AMOUNT
Unnamed: 0_level_1,min,max,median,mean
CURRENCY_EXCHANGE_COVERAGE_RATE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0.0,25,8325,500.0,780.508574
0.1,25,10000,425.0,698.678089


In [9]:
# Apply preprocessing.replace_str to every value of the description column and
# overwrite that column of df_train with the result.
# NOTE(review): re-running this cell feeds already-processed text back into replace_str;
# confirm the helper is idempotent.
df_train[COL_DESCRIPTION] = df_train[COL_DESCRIPTION].apply(preprocessing.replace_str)

In [23]:
# min / max / median / mean of the loan amount for each original language.
(
    df_train
    .groupby(COL_ORIGINAL_LANGUAGE)
    .agg({COL_LOAN_AMOUNT: ['min', 'max', 'median', 'mean']})
)

Unnamed: 0_level_0,LOAN_AMOUNT,LOAN_AMOUNT,LOAN_AMOUNT,LOAN_AMOUNT
Unnamed: 0_level_1,min,max,median,mean
ORIGINAL_LANGUAGE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
English,25,10000,375.0,592.61433
French,50,10000,350.0,1020.749614
Portuguese,100,4425,650.0,1124.827109
Russian,25,3675,675.0,890.424901
Spanish,50,9850,800.0,1116.600759


In [24]:
# Median and mean loan amount for each sector.
(
    df_train
    .groupby(COL_SECTOR_NAME)
    .agg({COL_LOAN_AMOUNT: ['median', 'mean']})
)

Unnamed: 0_level_0,LOAN_AMOUNT,LOAN_AMOUNT
Unnamed: 0_level_1,median,mean
SECTOR_NAME,Unnamed: 1_level_2,Unnamed: 2_level_2
Agriculture,500.0,668.448828
Arts,500.0,1007.637835
Clothing,575.0,1092.978042
Construction,600.0,931.558363
Education,575.0,774.752335
Entertainment,725.0,1728.409091
Food,475.0,855.807176
Health,575.0,788.041569
Housing,200.0,414.722053
Manufacturing,550.0,751.488095


In [17]:
# Number of training rows per country, most frequent first (NaN is excluded by value_counts' default).
df_train[COL_COUNTRY_NAME].value_counts()

Philippines    21265
Kenya          12510
Uganda          5551
Colombia        4758
El Salvador     4100
               ...  
Panama            28
Puerto Rico       16
Lesotho           11
Israel            10
Armenia            1
Name: COUNTRY_NAME, Length: 61, dtype: int64

In [18]:
# Number of training rows per town name, most frequent first (NaN is excluded by value_counts' default).
df_train[COL_TOWN_NAME].value_counts()

Kaduna                    1183
Antananarivo               895
Eldoret, Rift Valley       832
Banga, Aklan               808
Kitale                     799
                          ... 
PALCA, TUAO, CAGAYAN         1
Nabouwalu 2                  1
Rizal Alicia Isabela         1
Alamyudun district           1
Patulul, Suchitepequez       1
Name: TOWN_NAME, Length: 2781, dtype: int64

In [19]:
# Per country: number of non-null town entries ('count') and number of distinct towns ('nunique').
tmp = (
    df_train
    .groupby(COL_COUNTRY_NAME, as_index=False)
    .agg({COL_TOWN_NAME: ['count', 'nunique']})
)

In [20]:
# Distinct towns divided by non-null town entries, per country.
# Despite the column name, the value is a plain ratio (it is not multiplied by 100).
tmp['percent'] = tmp[(COL_TOWN_NAME, 'nunique')] / tmp[(COL_TOWN_NAME, 'count')]

In [21]:
# Display the per-country town statistics table.
tmp

Unnamed: 0_level_0,COUNTRY_NAME,TOWN_NAME,TOWN_NAME,percent
Unnamed: 0_level_1,Unnamed: 1_level_1,count,nunique,Unnamed: 4_level_1
0,Albania,320,8,0.025000
1,Armenia,1,1,1.000000
2,Bolivia,453,24,0.052980
3,Brazil,222,5,0.022523
4,Burkina Faso,606,14,0.023102
...,...,...,...,...
56,Turkey,8,1,0.125000
57,Uganda,5551,83,0.014952
58,United States,419,91,0.217184
59,Vietnam,1774,45,0.025366


In [22]:
# Show every training row whose country is Kosovo or Senegal.
df_train[df_train[COL_COUNTRY_NAME].isin(['Kosovo', 'Senegal'])]

Unnamed: 0,LOAN_ID,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,LOAN_AMOUNT,IMAGE_ID,ACTIVITY_NAME,SECTOR_NAME,LOAN_USE,COUNTRY_CODE,COUNTRY_NAME,TOWN_NAME,CURRENCY_POLICY,CURRENCY_EXCHANGE_COVERAGE_RATE,CURRENCY,TAGS,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL
62,1842028,French,Ce groupe a été créé en Juin 2019.Il est compo...,"Created in June 2019, this group is made up of...",4375,3261269,Livestock,Agriculture,to buy sheep.,SN,Senegal,,shared,0.1,XOF,"user_favorite, user_favorite, #Animals, #Paren...",bullet,field_partner
203,1634323,English,"Mehmet is 28 years old, married and the father...","Mehmet is 28 years old, married and the father...",1150,2983793,Personal Housing Expenses,Housing,to renovate Mehmet's family's bedroom and purc...,XK,Kosovo,,shared,0.1,EUR,"#Repair Renew Replace, user_favorite",monthly,field_partner
356,1634185,French,Le banc villageois dont fait partie ce groupe ...,The bank villageois to which this group of nin...,5275,2982905,Food,Food,to buy vegetables.,SN,Senegal,,shared,0.1,XOF,"#Repeat Borrower, user_favorite, #Schooling, u...",bullet,field_partner
400,1587043,French,Ces 07 femmes habitent dans le même village et...,These seven women live in the same village and...,1800,2908689,Retail,Retail,to buy brooms to resell.,SN,Senegal,,shared,0.1,XOF,"user_favorite, user_favorite",bullet,field_partner
568,1659748,French,Le banc villageois dont fait partie ce groupe ...,The banc villageois that this group of 11 wome...,1275,3013403,Food Market,Food,"to buy sugar, peanut butter, palm oil and powd...",SN,Senegal,,shared,0.1,XOF,"#Schooling, #Repeat Borrower, #Parent, user_fa...",bullet,field_partner
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90737,1711706,French,Cette femme habite dans un village qui abrite ...,This woman lives in a village that has had a b...,675,3085792,Wholesale,Wholesale,to buy 15 tons of walnuts to sell on a wholesa...,SN,Senegal,,shared,0.1,XOF,"user_favorite, user_favorite, user_favorite, u...",bullet,field_partner
90753,1720393,French,Ce groupe est composé de 14 femmes qui partage...,This group is made up of 14 women who share st...,2425,3097967,Livestock,Agriculture,to buy chicks and feed.,SN,Senegal,,shared,0.1,XOF,"#Parent, #Animals, user_favorite, user_favorit...",irregular,field_partner
90896,1670046,French,Ces 07 femmes habitent dans le même quartier e...,These 7 women live in the same neighborhood an...,2175,3028170,Food Market,Food,"to buy palm oil, lemon juice, bouillon cubes a...",SN,Senegal,,shared,0.1,XOF,"user_favorite, #Elderly",monthly,field_partner
90964,1804527,English,Agron wants to sincerely thank all of the KIVA...,Agron wants to sincerely thank all of the KIVA...,1700,3210235,Auto Repair,Services,to purchase additional tools for his business ...,XK,Kosovo,,shared,0.1,EUR,"#Biz Durable Asset, #Supporting Family, user_f...",monthly,field_partner


In [23]:
# Number of training rows per currency policy.
df_train[COL_CURRENCY_POLICY].value_counts()

shared      82061
standard     9272
Name: CURRENCY_POLICY, dtype: int64

In [24]:
# Median and mean loan amount for each currency policy.
(
    df_train
    .groupby(COL_CURRENCY_POLICY)
    .agg({COL_LOAN_AMOUNT: ['median', 'mean']})
)

Unnamed: 0_level_0,LOAN_AMOUNT,LOAN_AMOUNT
Unnamed: 0_level_1,median,mean
CURRENCY_POLICY,Unnamed: 1_level_2,Unnamed: 2_level_2
shared,425.0,702.92313
standard,600.0,1049.921808


In [25]:
# Number of training rows per currency-exchange coverage rate.
# value_counts skips NaN by default, so rows without a rate are not counted here.
df_train[COL_CURRENCY_EXCHANGE_COVERAGE_RATE].value_counts()

0.1    77804
0.0     4257
Name: CURRENCY_EXCHANGE_COVERAGE_RATE, dtype: int64

In [26]:
# Median and mean loan amount for each currency-exchange coverage rate
# (rows with a NaN rate are dropped by groupby's default).
(
    df_train
    .groupby(COL_CURRENCY_EXCHANGE_COVERAGE_RATE)
    .agg({COL_LOAN_AMOUNT: ['median', 'mean']})
)

Unnamed: 0_level_0,LOAN_AMOUNT,LOAN_AMOUNT
Unnamed: 0_level_1,median,mean
CURRENCY_EXCHANGE_COVERAGE_RATE,Unnamed: 1_level_2,Unnamed: 2_level_2
0.0,500.0,780.508574
0.1,425.0,698.678089


In [27]:
# Number of training rows per currency code, most frequent first.
df_train[COL_CURRENCY].value_counts()

PHP    21265
KES    12510
USD    11331
UGX     5551
COP     4758
TJS     3648
KHR     2768
XOF     2548
INR     2123
VND     1774
PEN     1691
PYG     1668
LRD     1642
MGA     1583
PKR     1547
WST     1352
HNL     1239
NGN     1183
KGS      980
RWF      855
JOD      784
GHS      778
GTQ      742
IDR      699
NIO      645
MZN      510
EGP      501
HTG      453
BOB      436
MXN      433
GEL      375
ALL      330
SBD      324
FJD      270
TOP      260
CRC      259
BRL      221
MDL      201
DOP      162
TRY      152
XAF      139
EUR      132
MWK      130
SLL      122
ZMW       98
NPR       62
PGK       43
THB       33
LSL       11
ILS       11
AMD        1
Name: CURRENCY, dtype: int64

In [28]:
# Number of training rows per repayment interval.
df_train[COL_REPAYMENT_INTERVAL].value_counts()

monthly      82375
bullet        6527
irregular     2431
Name: REPAYMENT_INTERVAL, dtype: int64

In [29]:
# Median and mean loan amount for each repayment interval.
(
    df_train
    .groupby(COL_REPAYMENT_INTERVAL)
    .agg({COL_LOAN_AMOUNT: ['median', 'mean']})
)

Unnamed: 0_level_0,LOAN_AMOUNT,LOAN_AMOUNT
Unnamed: 0_level_1,median,mean
REPAYMENT_INTERVAL,Unnamed: 1_level_2,Unnamed: 2_level_2
bullet,500.0,827.390072
irregular,800.0,1028.784451
monthly,425.0,722.501973


In [30]:
# Number of training rows per distribution model.
df_train[COL_DISTRIBUTION_MODEL].value_counts()

field_partner    90909
direct             424
Name: DISTRIBUTION_MODEL, dtype: int64

In [31]:
# Median and mean loan amount for each distribution model.
(
    df_train
    .groupby(COL_DISTRIBUTION_MODEL)
    .agg({COL_LOAN_AMOUNT: ['median', 'mean']})
)

Unnamed: 0_level_0,LOAN_AMOUNT,LOAN_AMOUNT
Unnamed: 0_level_1,median,mean
DISTRIBUTION_MODEL,Unnamed: 1_level_2,Unnamed: 2_level_2
direct,5000.0,5673.349057
field_partner,450.0,715.132165


In [33]:
# Re-inspect the first five rows of df_train after the preprocessing cells above.
df_train.head()

Unnamed: 0,LOAN_ID,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,LOAN_AMOUNT,IMAGE_ID,ACTIVITY_NAME,SECTOR_NAME,LOAN_USE,COUNTRY_CODE,COUNTRY_NAME,TOWN_NAME,CURRENCY_POLICY,CURRENCY_EXCHANGE_COVERAGE_RATE,CURRENCY,TAGS,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL
0,1733169,English,Teodora is a 50-year-old married woman from th...,Teodora is a 50-year-old married woman from th...,100,3115271,Weaving,Arts,"to purchase materials like nipa palm, bamboo ...",PH,Philippines,"Maribojoc, Bohol",shared,0.1,PHP,#Elderly,monthly,field_partner
1,1546998,English,Diego is 32 years old and lives in the municip...,Diego is 32 years old and lives in the municip...,1350,2870403,Barber Shop,Services,"to buy two hair clippers, a new barber chair, ...",CO,Colombia,Apartadó,shared,0.1,COP,"user_favorite, user_favorite",monthly,field_partner
2,1808517,Spanish,"Osman, es un joven de 27 años de edad, soltero...","Osman is a young man, 27 years old, single, an...",225,3215705,Farming,Agriculture,to purchase sacks of fertilizers to care for a...,HN,Honduras,"Nueva Frontera, Santa Barbara.",shared,0.1,HNL,,bullet,field_partner
3,1452940,English,"His name is Nino, 31 years old, married to Che...","His name is Nino, 31 years old, married to Che...",350,2745031,Motorcycle Transport,Transportation,"to pay for fuel, tires and change oil for his ...",PH,Philippines,"Silang, Cavite",shared,0.1,PHP,user_favorite,monthly,field_partner
4,1778420,English,"Pictured above is Teresa, often described as a...","Pictured above is Teresa, often described as a...",625,3083800,Farming,Agriculture,to purchase hybrid seeds and fertilizer to imp...,KE,Kenya,Mumias,shared,0.1,KES,"#Eco-friendly, #Sustainable Ag, #Parent, #Elde...",bullet,field_partner


In [45]:
# Number of columns of X. Reading it from .shape avoids allocating the full dense
# array that X.toarray() would build just to measure the length of one row.
# NOTE(review): X is not defined in any visible cell — presumably a sparse
# document-term matrix produced by a vectorizer cell that was removed; restore that
# cell so the notebook survives Restart & Run All.
X.shape[1]

55030

In [51]:
# Two-stage text pipeline: TF-IDF vectorisation followed by a 50-component
# Latent Dirichlet Allocation; random_state is fixed so the LDA fit is reproducible.
tfidf_lda = Pipeline([
    ("TfidfVectorizer", TfidfVectorizer()),
    ("LDA", LatentDirichletAllocation(random_state=42, n_components=50)),
])

In [52]:
# Fit the TF-IDF + LDA pipeline on the translated descriptions of the training set
# and keep the transformed output (one row per input document).
features_lda = tfidf_lda.fit_transform(df_train[COL_DESCRIPTION_TRANSLATED])