In [None]:
# Mount Google Drive inside the Colab runtime so the dataset archive can be read.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# dataset unzip
!cp "/content/drive/MyDrive/Colab Notebooks/note_Axross/ks-projects-201801.csv.zip" /content/
!unzip ks-projects-201801.csv.zip

In [None]:
# Package import
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Load the extracted CSV and preview its first rows.
data_path = '/content/'
csv_path = f'{data_path}/ks-projects-201801.csv'

df_f18 = pd.read_csv(csv_path)
display(df_f18.head())

In [None]:
# Overview of the raw frame, shown in this order:
#   1. descriptive statistics of the numeric columns
#   2. number of missing values per column
#   3. dtype of every column
for overview in (df_f18.describe(), df_f18.isnull().sum(), df_f18.dtypes):
    display(overview)

In [None]:
# Pre-processing: drop every row that has a missing value in any column,
# then confirm that no missing values remain.
df_f18_dna = df_f18.dropna(axis=0, how='any')

remaining_nulls = df_f18_dna.isnull().sum()
display(remaining_nulls)

In [None]:
# Count how many projects fall under each `state` value (rows with missing values already dropped).
df_f18_dna['state'].value_counts()

In [None]:
# Keep only projects whose outcome is 'successful' or 'failed' and
# concatenate them (successful rows first, then failed rows).
is_successful = df_f18_dna['state'] == 'successful'
is_failed = df_f18_dna['state'] == 'failed'

df_f18_success = df_f18_dna[is_successful]
df_f18_failed = df_f18_dna[is_failed]

df_f18_SorF = pd.concat([df_f18_success, df_f18_failed])

# Sanity check: only the two outcome classes should remain.
df_f18_SorF['state'].value_counts()

In [None]:
# Convert string columns to integer codes.
# A single LabelEncoder is re-fitted for each column in turn. LabelEncoder
# numbers classes in sorted order, so for `state` 'failed' becomes 0 and
# 'successful' becomes 1.
le = LabelEncoder()

for column in ('category', 'main_category', 'currency', 'state', 'country'):
    le = le.fit(df_f18_SorF[column])
    df_f18_SorF[column] = le.transform(df_f18_SorF[column])

df_f18_SorF.head()

In [None]:
# Feature selection (hypothesis: country influences the outcome).
# `state` was label-encoded earlier: 1 = successful, 0 = failed.
df_f18_S = df_f18_SorF[df_f18_SorF['state']==1]
df_f18_F = df_f18_SorF[df_f18_SorF['state']==0]

# Side-by-side histograms of the encoded country for both outcomes.
# A legend and axis labels are added so the two series can be told apart.
plt.hist([df_f18_S["country"], df_f18_F["country"]], stacked=False,
         label=['successful', 'failed'])
plt.xlabel('country (label-encoded)')
plt.ylabel('number of projects')
plt.legend()
plt.show()

In [None]:
# Feature selection (hypothesis: category / main_category influence the outcome).
# `state` was label-encoded earlier: 1 = successful, 0 = failed.
df_f18_S = df_f18_SorF[df_f18_SorF['state']==1]
df_f18_F = df_f18_SorF[df_f18_SorF['state']==0]

# One histogram per feature; a legend and axis labels are added so the two
# outcome series can be told apart.
plt.hist([df_f18_S["category"], df_f18_F["category"]], stacked=False,
         label=['successful', 'failed'])
plt.xlabel('category (label-encoded)')
plt.ylabel('number of projects')
plt.legend()
plt.show()

plt.hist([df_f18_S["main_category"], df_f18_F["main_category"]], stacked=False,
         label=['successful', 'failed'])
plt.xlabel('main_category (label-encoded)')
plt.ylabel('number of projects')
plt.legend()
plt.show()

In [None]:
# Feature selection (hypothesis: campaign duration influences the outcome).
# Parse both timestamps; values that cannot be parsed become NaT (errors='coerce').
df_f18_SorF['deadline'] = pd.to_datetime(df_f18_SorF['deadline'], errors='coerce')
df_f18_SorF['launched'] = pd.to_datetime(df_f18_SorF['launched'], errors='coerce')

# Campaign length as a timedelta.
df_f18_SorF['period'] = df_f18_SorF['deadline'] - df_f18_SorF['launched']

# Whole-day component of the campaign length. The vectorised `.dt.days`
# accessor replaces a per-row Python loop that looked up each value by index
# label, which is much slower and only works when the index labels are unique.
df_f18_SorF['days'] = df_f18_SorF['period'].dt.days

In [None]:
# Feature selection (hypothesis: campaign duration influences the outcome).
# `state` was label-encoded earlier: 1 = successful, 0 = failed.
df_f18_S = df_f18_SorF[df_f18_SorF['state']==1]
df_f18_F = df_f18_SorF[df_f18_SorF['state']==0]

# Side-by-side histograms of the campaign length in days for both outcomes.
# A legend and axis labels are added so the two series can be told apart.
plt.hist([df_f18_S["days"], df_f18_F["days"]], stacked=False,
         label=['successful', 'failed'])
plt.xlabel('campaign length (days)')
plt.ylabel('number of projects')
plt.legend()
plt.show()

In [None]:
# Build the feature matrix from the selected columns.
feature_columns = ['category', 'main_category', 'country', 'days']
X = df_f18_SorF[feature_columns].to_numpy()

In [None]:
# Target labels: the encoded `state` column (1 = successful, 0 = failed).
y = df_f18_SorF['state'].to_numpy()

In [None]:
# Split the dataset: 80% for training, the rest for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print("訓練用データの個数")  # "number of training samples"
print(n_train)
print("試験用データの個数")  # "number of test samples"
print(n_test)

In [None]:
# Build the model: logistic regression trained with SGD, without regularisation.
# `loss='log_loss'` and `penalty=None` are the spellings accepted by current
# scikit-learn; the former `loss='log'` / `penalty='none'` strings are rejected
# by recent releases.
# NOTE(review): the features are passed in unscaled; SGD is sensitive to
# feature scale, so standardising them may be worth trying - TODO confirm.
clf = SGDClassifier(loss='log_loss', penalty=None, max_iter=10000, fit_intercept=True, random_state=1234)
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split
y_pred = clf.predict(X_test)

In [None]:
# Evaluate the predictions with the 2x2 confusion matrix.
# Rows are the true class (0, 1); columns are the predicted class (0, 1).
cm = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cm

# First line: false negatives, false positives; second line: true negatives, true positives.
print(fn, fp)
print(tn, tp)

In [None]:
# Compute accuracy, recall and precision (as percentages) and print them.
n_evaluated = tp + fp + fn + tn
accuracy_pct = 100 * (tp + tn) / n_evaluated
recall_pct = 100 * tp / (tp + fn)
precision_pct = 100 * tp / (tp + fp)

print('Accuracy  = {:.3f}%'.format(accuracy_pct))
print('Recall    = {:.3f}%'.format(recall_pct))
print('Precision = {:.3f}%'.format(precision_pct))

In [None]:
# Model improvement: same SGD logistic regression, but class 1 (successful)
# is weighted three times as heavily as class 0 (failed).
# `loss='log_loss'` and `penalty=None` are the spellings accepted by current
# scikit-learn; the former `loss='log'` / `penalty='none'` strings are rejected
# by recent releases.
clf = SGDClassifier(loss='log_loss', penalty=None, max_iter=10000, class_weight = {0:0.25,1:0.75},fit_intercept=True, random_state=1234)
clf.fit(X_train, y_train)

In [None]:
# Predict with the improved model and rebuild the 2x2 confusion matrix.
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cm

# First line: false negatives, false positives; second line: true negatives, true positives.
print(fn, fp)
print(tn, tp)

In [None]:
# Compute accuracy, recall and precision (as percentages) for the improved model.
n_evaluated = tp + fp + fn + tn
accuracy_pct = 100 * (tp + tn) / n_evaluated
recall_pct = 100 * tp / (tp + fn)
precision_pct = 100 * tp / (tp + fp)

print('Accuracy  = {:.3f}%'.format(accuracy_pct))
print('Recall    = {:.3f}%'.format(recall_pct))
print('Precision = {:.3f}%'.format(precision_pct))

In [None]:
# Improvement by adding a feature: `goal`.
X = df_f18_SorF[[ 'goal','category', 'main_category',  'country','days']].values

y = df_f18_SorF['state'].values

# Same 80/20 split and seed as the earlier experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.8,random_state=42)

# SGD logistic regression without regularisation, class 1 weighted 0.6 vs 0.4.
# `loss='log_loss'` and `penalty=None` are the spellings accepted by current
# scikit-learn; the former `loss='log'` / `penalty='none'` strings are rejected
# by recent releases.
clf = SGDClassifier(loss='log_loss', penalty=None, max_iter=10000, class_weight = {0:0.4,1:0.6},fit_intercept=True, random_state=1234)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Confusion-matrix counts: first line fn, fp; second line tn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(fn, fp)
print(tn, tp)

# Accuracy, recall and precision as percentages.
print('Accuracy  = {:.3f}%'.format(100 * (tn+tp)/(tn+fp+fn+tp)))
print('Recall    = {:.3f}%'.format(100 * (tp)/(fn+tp)))
print('Precision = {:.3f}%'.format(100 * (tp)/(fp+tp)))

In [None]:
# Improvement by adding a feature: `backers` (on top of `goal`).
# NOTE(review): `backers` is presumably only known once a campaign has run,
# so using it as a predictor may leak the outcome - confirm this is intended.

X = df_f18_SorF[['backers','goal','category', 'main_category',  'country','days']].values

y = df_f18_SorF['state'].values

# Same 80/20 split and seed as the earlier experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.8,random_state=42)

# SGD logistic regression without regularisation, class 1 weighted 0.6 vs 0.4.
# `loss='log_loss'` and `penalty=None` are the spellings accepted by current
# scikit-learn; the former `loss='log'` / `penalty='none'` strings are rejected
# by recent releases.
clf = SGDClassifier(loss='log_loss', penalty=None, max_iter=10000, class_weight = {0:0.4,1:0.6},fit_intercept=True, random_state=1234)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Confusion-matrix counts: first line fn, fp; second line tn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(fn, fp)
print(tn, tp)

# Accuracy, recall and precision as percentages.
print('Accuracy  = {:.3f}%'.format(100 * (tn+tp)/(tn+fp+fn+tp)))
print('Recall    = {:.3f}%'.format(100 * (tp)/(fn+tp)))
print('Precision = {:.3f}%'.format(100 * (tp)/(fp+tp)))