## 目的：あるクラウドファンディングが成功するかを事前に予測するモデルを構築する

In [5]:
import markdown
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [6]:
#データの読み込み
df = pd.read_csv("ks-projects-201801.csv")
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [7]:
#不要なデータと予測する段階で知り得ないデータを除外
df = df.drop(columns=['ID','name', 'goal', 'pledged', 'backers', 'usd pledged', 'usd_pledged_real'])
df.head()

Unnamed: 0,category,main_category,currency,deadline,launched,state,country,usd_goal_real
0,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,failed,GB,1533.95
1,Narrative Film,Film & Video,USD,2017-11-01,2017-09-02 04:43:57,failed,US,30000.0
2,Narrative Film,Film & Video,USD,2013-02-26,2013-01-12 00:20:50,failed,US,45000.0
3,Music,Music,USD,2012-04-16,2012-03-17 03:24:11,failed,US,5000.0
4,Film & Video,Film & Video,USD,2015-08-29,2015-07-04 08:35:03,canceled,US,19500.0


In [5]:
#データの確認
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   category       378661 non-null  object 
 1   main_category  378661 non-null  object 
 2   currency       378661 non-null  object 
 3   deadline       378661 non-null  object 
 4   launched       378661 non-null  object 
 5   state          378661 non-null  object 
 6   country        378661 non-null  object 
 7   usd_goal_real  378661 non-null  float64
dtypes: float64(1), object(7)
memory usage: 23.1+ MB


In [6]:
#データの統計量の確認
df.describe()

Unnamed: 0,usd_goal_real
count,378661.0
mean,45454.4
std,1152950.0
min,0.01
25%,2000.0
50%,5500.0
75%,15500.0
max,166361400.0


In [7]:
#欠損値の確認
df.isnull().sum()

category         0
main_category    0
currency         0
deadline         0
launched         0
state            0
country          0
usd_goal_real    0
dtype: int64

In [8]:
#'category', 'main_category'が何種類あるのかの確認
df[['category', 'main_category']].nunique()

category         159
main_category     15
dtype: int64

In [9]:
#'main_category'のそれぞれのカテゴリに何個あるのかの確認
df['main_category'].value_counts()

Film & Video    63585
Music           51918
Publishing      39874
Games           35231
Technology      32569
Design          30070
Art             28153
Food            24602
Fashion         22816
Theater         10913
Comics          10819
Photography     10779
Crafts           8809
Journalism       4755
Dance            3768
Name: main_category, dtype: int64

In [10]:
#'state'のそれぞれのカテゴリに何個あるのかの確認
df['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [8]:
#'state'列が'failed'と'successful'だけのデータを残し、データを数値に変換
df = df[(df['state'] == 'failed' ) | (df['state'] == 'successful')]
df['state'] = df['state'].map({
    'failed':0,
    'successful':1
})
df['state'].value_counts()

0    197719
1    133956
Name: state, dtype: int64

In [12]:
df.head()

Unnamed: 0,category,main_category,currency,deadline,launched,state,country,usd_goal_real
0,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,0,GB,1533.95
1,Narrative Film,Film & Video,USD,2017-11-01,2017-09-02 04:43:57,0,US,30000.0
2,Narrative Film,Film & Video,USD,2013-02-26,2013-01-12 00:20:50,0,US,45000.0
3,Music,Music,USD,2012-04-16,2012-03-17 03:24:11,0,US,5000.0
5,Restaurants,Food,USD,2016-04-01,2016-02-26 13:38:27,1,US,50000.0


In [9]:
#クラウドファンディングの期間を新たな説明変数に加える

#'deadline'と'launched'を変換
df['deadline'] = pd.to_datetime(df['deadline'], format = '%Y-%m-%d %H:%M:%S')
df['launched'] = pd.to_datetime(df['launched'], format = '%Y-%m-%d %H:%M:%S')

#クラウドファンディングの期間を表す説明変数'period'を導入
df['period'] = (df['deadline'] - df['launched']).dt.days

df.head()

Unnamed: 0,category,main_category,currency,deadline,launched,state,country,usd_goal_real,period
0,Poetry,Publishing,GBP,2015-10-09,2015-08-11 12:12:28,0,GB,1533.95,58
1,Narrative Film,Film & Video,USD,2017-11-01,2017-09-02 04:43:57,0,US,30000.0,59
2,Narrative Film,Film & Video,USD,2013-02-26,2013-01-12 00:20:50,0,US,45000.0,44
3,Music,Music,USD,2012-04-16,2012-03-17 03:24:11,0,US,5000.0,29
5,Restaurants,Food,USD,2016-04-01,2016-02-26 13:38:27,1,US,50000.0,34


In [10]:
#'deadline'と'launched'列を削除
df = df.drop(columns = ['deadline', 'launched'])
df.head()

Unnamed: 0,category,main_category,currency,state,country,usd_goal_real,period
0,Poetry,Publishing,GBP,0,GB,1533.95,58
1,Narrative Film,Film & Video,USD,0,US,30000.0,59
2,Narrative Film,Film & Video,USD,0,US,45000.0,44
3,Music,Music,USD,0,US,5000.0,29
5,Restaurants,Food,USD,1,US,50000.0,34


In [11]:
#カテゴリ変数をダミー変数に置き換える
df = pd.get_dummies(df, columns=['category', 'main_category', 'currency', 'country'])
df.head()

Unnamed: 0,state,usd_goal_real,period,category_3D Printing,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,0,1533.95,58,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,30000.0,59,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,45000.0,44,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,5000.0,29,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,1,50000.0,34,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [12]:
#データの確認
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 331675 entries, 0 to 378660
Columns: 214 entries, state to country_US
dtypes: float64(1), int64(2), uint8(211)
memory usage: 76.9 MB


In [13]:
#"state"を目的変数に設定、"state"以外を説明変数に設定
y = df["state"].values
X = df.drop('state', axis=1).values

#ロジスティック回帰モデルを作成
clf = SGDClassifier(loss='log', penalty='none', max_iter=10000, fit_intercept=True, random_state=1234, tol=1e-3)

#学習させる
clf.fit(X,y)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log',
              max_iter=10000, n_iter_no_change=5, n_jobs=None, penalty='none',
              power_t=0.5, random_state=1234, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [15]:
#ラベルを予測
y_est = clf.predict(X)

#確率値を得る
y_est_proba = clf.predict_proba(X)

#対数尤度を表示
print("対数尤度 = {:.3f}".format(-log_loss(y, y_est_proba, normalize=False)))

# 正答率(Accuracy)を表示
print('正答率(Accuracy) = {:.3f}%'.format(100 * accuracy_score(y, y_est)))

対数尤度 = -5758719.266
正答率(Accuracy) = 49.730%


In [16]:
#混同行列を作成
conf_mat = pd.DataFrame(confusion_matrix(y, y_est),
                       index=['正解 = クラウドファンディング失敗', '正解 = クラウドファンディング成功'],
                       columns=['予測 = クラウドファンディング失敗', '予測 = クラウドファンディング成功']
                       )
conf_mat

Unnamed: 0,予測 = クラウドファンディング失敗,予測 = クラウドファンディング成功
正解 = クラウドファンディング失敗,42280,155439
正解 = クラウドファンディング成功,11293,122663


In [17]:
from  sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# 正答率(Accuracy)を表示
print('正答率(Accuracy) = {:.3f}%'.format(100 * accuracy_score(y, y_est)))

#Recall,Precision,F1-scoreを計算
precision, recall, f1_score, _ = precision_recall_fscore_support(y, y_est)

# クラウドファンディング成功に関するPrecision, Recall, F1-scoreを表示
print('適合率（Precision） = {:.3f}%'.format(100 * precision[1]))
print('再現率（Recall） = {:.3f}%'.format(100 * recall[1]))
print('F1値（F1-score） = {:.3f}%'.format(100 * f1_score[1]))

正答率(Accuracy) = 49.730%
適合率（Precision） = 44.107%
再現率（Recall） = 91.570%
F1値（F1-score） = 59.537%
