### 必要なライブラリのインポート

In [63]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix


### CSVファイルの読み込み

In [64]:
# CSVファイルの読み込み
df = pd.read_csv('ks-projects-201801.csv')
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [65]:
# 統計量の算出
df.describe()

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real
count,378661.0,378661.0,378661.0,378661.0,374864.0,378661.0,378661.0
mean,1074731000.0,49080.79,9682.979,105.617476,7036.729,9058.924,45454.4
std,619086200.0,1183391.0,95636.01,907.185035,78639.75,90973.34,1152950.0
min,5971.0,0.01,0.0,0.0,0.0,0.0,0.01
25%,538263500.0,2000.0,30.0,2.0,16.98,31.0,2000.0
50%,1075276000.0,5200.0,620.0,12.0,394.72,624.33,5500.0
75%,1610149000.0,16000.0,4076.0,56.0,3034.09,4050.0,15500.0
max,2147476000.0,100000000.0,20338990.0,219382.0,20338990.0,20338990.0,166361400.0


### 欠損値の処理

In [66]:
# 欠損値の確認
df.isnull().sum()

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

In [67]:
# 暫定で０埋めにする
df['usd pledged'] = df['usd pledged'].fillna('0')
df.isnull().sum()

ID                  0
name                4
category            0
main_category       0
currency            0
deadline            0
goal                0
launched            0
pledged             0
state               0
backers             0
country             0
usd pledged         0
usd_pledged_real    0
usd_goal_real       0
dtype: int64

### ダミー変数の作成

### 日付データの処理

In [68]:
# 日付型に変換
df['deadline'] = pd.to_datetime(df['deadline'])
df['launched'] = pd.to_datetime(pd.to_datetime(df['launched']).dt.strftime('%Y-%m-%d'))

In [69]:
# launched と deadline の差をとって、日数を計算
df['datenum'] = (df['deadline']-df['launched']).astype('timedelta64[D]')
df

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,datenum
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11,0.00,failed,0,GB,0,0.00,1533.95,59.0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02,2421.00,failed,15,US,100,2421.00,30000.00,60.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12,220.00,failed,3,US,220,220.00,45000.00,45.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17,1.00,failed,1,US,1,1.00,5000.00,30.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04,1283.00,canceled,14,US,1283,1283.00,19500.00,56.0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26,52375.00,successful,224,US,52375,52375.00,50000.00,35.0
6,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,USD,2014-12-21,1000.0,2014-12-01,1205.00,successful,16,US,1205,1205.00,1000.00,20.0
7,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01,453.00,failed,40,US,453,453.00,25000.00,45.0
8,1000034518,SPIN - Premium Retractable In-Ear Headphones w...,Product Design,Design,USD,2014-05-29,125000.0,2014-04-24,8233.00,canceled,58,US,8233,8233.00,125000.00,35.0
9,100004195,STUDIO IN THE SKY - A Documentary Feature Film...,Documentary,Film & Video,USD,2014-08-10,65000.0,2014-07-11,6240.57,canceled,43,US,6240.57,6240.57,65000.00,30.0


### 目的変数の処理

In [70]:
df.loc[df['state'] == 'successful', 'state_num'] = 1
df.loc[df['state'] != 'successful', 'state_num'] = 0
df

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,datenum,state_num
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11,0.00,failed,0,GB,0,0.00,1533.95,59.0,0.0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02,2421.00,failed,15,US,100,2421.00,30000.00,60.0,0.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12,220.00,failed,3,US,220,220.00,45000.00,45.0,0.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17,1.00,failed,1,US,1,1.00,5000.00,30.0,0.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04,1283.00,canceled,14,US,1283,1283.00,19500.00,56.0,0.0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26,52375.00,successful,224,US,52375,52375.00,50000.00,35.0,1.0
6,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,USD,2014-12-21,1000.0,2014-12-01,1205.00,successful,16,US,1205,1205.00,1000.00,20.0,1.0
7,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01,453.00,failed,40,US,453,453.00,25000.00,45.0,0.0
8,1000034518,SPIN - Premium Retractable In-Ear Headphones w...,Product Design,Design,USD,2014-05-29,125000.0,2014-04-24,8233.00,canceled,58,US,8233,8233.00,125000.00,35.0,0.0
9,100004195,STUDIO IN THE SKY - A Documentary Feature Film...,Documentary,Film & Video,USD,2014-08-10,65000.0,2014-07-11,6240.57,canceled,43,US,6240.57,6240.57,65000.00,30.0,0.0


### カテゴリ値の処理

In [85]:
# OneHotEncoding版
df_OneHot = pd.get_dummies(df, columns=['category','main_category','currency','country'])
df_OneHot.describe()

Unnamed: 0,ID,goal,pledged,backers,usd_pledged_real,usd_goal_real,datenum,state_num,category_3D Printing,category_Academic,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
count,378661.0,378661.0,378661.0,378661.0,378661.0,378661.0,378661.0,378661.0,378661.0,378661.0,...,378661.0,378661.0,378661.0,378661.0,378661.0,378661.0,378661.0,378661.0,378661.0,378661.0
mean,1074731000.0,49080.79,9682.979,105.617476,9058.924,45454.4,34.481095,0.353762,0.001804,0.002419,...,0.000106,0.000164,0.004627,0.010027,0.007574,0.00187,0.003821,0.00464,0.001466,0.772794
std,619086200.0,1183391.0,95636.01,907.185035,90973.34,1152950.0,65.909173,0.478137,0.042432,0.049124,...,0.010277,0.012795,0.067863,0.099634,0.086699,0.0432,0.061699,0.06796,0.038256,0.419027
min,5971.0,0.01,0.0,0.0,0.0,0.01,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,538263500.0,2000.0,30.0,2.0,31.0,2000.0,30.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1075276000.0,5200.0,620.0,12.0,624.33,5500.0,30.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1610149000.0,16000.0,4076.0,56.0,4050.0,15500.0,37.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,2147476000.0,100000000.0,20338990.0,219382.0,20338990.0,166361400.0,16739.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [86]:
# カテゴリ値版
df_category = df.copy()
# categoryをカテゴリ値に変換
labels, uniques = pd.factorize(df_category['category'])
df_category['category'] = labels

# main_categoryをカテゴリ値に変換
labels, uniques = pd.factorize(df_category['main_category'])
df_category['main_category'] = labels

# currencyをカテゴリ値に変換
labels, uniques = pd.factorize(df_category['currency'])
df_category['currency'] = labels

# currencyをカテゴリ値に変換
labels, uniques = pd.factorize(df_category['country'])
df_category['country'] = labels

In [87]:
df_category.head(3)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,datenum,state_num
0,1000002330,The Songs of Adelaide & Abullah,0,0,0,2015-10-09,1000.0,2015-08-11,0.0,failed,0,0,0,0.0,1533.95,59.0,0.0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,1,1,1,2017-11-01,30000.0,2017-09-02,2421.0,failed,15,1,100,2421.0,30000.0,60.0,0.0
2,1000004038,Where is Hank?,1,1,1,2013-02-26,45000.0,2013-01-12,220.0,failed,3,1,220,220.0,45000.0,45.0,0.0


### 説明変数・目的変数の取り出し

In [88]:
# 不要な説明変数の設定
drop_col = ['ID', 'name','deadline','launched','state']

In [89]:
# OneHotEncoding版
df_OneHot=df_OneHot.drop(drop_col, axis=1)
df_OneHot.head(3)

Unnamed: 0,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real,datenum,state_num,category_3D Printing,category_Academic,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,1000.0,0.0,0,0,0.0,1533.95,59.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,30000.0,2421.0,15,100,2421.0,30000.0,60.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,45000.0,220.0,3,220,220.0,45000.0,45.0,0.0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [90]:
# カテゴリ値版
df_category=df_category.drop(drop_col, axis=1)
df_category.head(3)

Unnamed: 0,category,main_category,currency,goal,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real,datenum,state_num
0,0,0,0,1000.0,0.0,0,0,0,0.0,1533.95,59.0,0.0
1,1,1,1,30000.0,2421.0,15,1,100,2421.0,30000.0,60.0,0.0
2,1,1,1,45000.0,220.0,3,1,220,220.0,45000.0,45.0,0.0


### 予測実施

In [94]:
# カテゴリ値版
y = df_category['state_num'].values
X = df_category.drop('state_num', axis=1).values

clf = SGDClassifier(loss='log', penalty='none', max_iter=10000, fit_intercept=True, random_state=1234, tol=1e-3)
clf.fit(X, y)

# 重みを取得して表示
w0 = clf.intercept_[0]
w1 = clf.coef_[0, 0]
w2 = clf.coef_[0, 1]
w3 = clf.coef_[0, 2]
w4 = clf.coef_[0, 3]
w5 = clf.coef_[0, 4]
w6 = clf.coef_[0, 5]
w7 = clf.coef_[0, 6]
w8 = clf.coef_[0, 7]
w9 = clf.coef_[0, 8]
w10 = clf.coef_[0, 9]
w11 = clf.coef_[0, 10]
print('w0 = {:.3f}, w1 = {:.3f}, w2 = {:.3f}, w3 = {:.3f}, w4 = {:.3f}, w5 = {:.3f}, w6 = {:.3f}, w7 = {:.3f}, w8 = {:.3f}, w9 = {:.3f}, w10 = {:.3f}, w11 = {:.3f}'.format(w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11))


w0 = 41.247, w1 = 1380.681, w2 = -481.008, w3 = -100.804, w4 = -361.775, w5 = 309.246, w6 = 2373.372, w7 = -2303.177, w8 = 89.787, w9 = 96.586, w10 = -49.394, w11 = -1751.374


In [95]:
# ラベルを予測
y_est = clf.predict(X)

# 対数尤度を表示
print('対数尤度 = {:.3f}'.format(- log_loss(y, y_est)))

# 正答率を表示
print('正答率 = {:.3f}%'.format(100 * accuracy_score(y, y_est)))

対数尤度 = -0.669
正答率 = 98.064%


In [96]:
# 予測値と正解のクロス集計
conf_mat = pd.DataFrame(confusion_matrix(y, y_est), 
                        index=['正解 = リノベーションなし', '正解 = リノベーション済み'], 
                        columns=['予測 = リノベーションなし', '予測 = リノベーション済み'])
conf_mat

Unnamed: 0,予測 = リノベーションなし,予測 = リノベーション済み
正解 = リノベーションなし,239326,5379
正解 = リノベーション済み,1950,132006
