In [28]:
import numpy as np 
import pandas as pd 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split # ホールドアウト法に関する関数
from sklearn.model_selection import KFold # 交差検証法に関する関数
from sklearn.metrics import mean_absolute_error # 回帰問題における性能評価に関する関数
from sklearn.linear_model import LinearRegression

In [29]:
%matplotlib inline
#import pandas as pd
#import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D #3D散布図の描画

# lib model
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix, precision_recall_fscore_support

# lib 前処理
from sklearn.model_selection import train_test_split # ホールドアウト法に関する関数
from sklearn.model_selection import KFold # 交差検証法に関する関数
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [30]:
df = pd.read_csv(r"C:/Users/N388/Desktop/kickstarter-projects (1)/ks-projects-201801.csv")

In [31]:
#データをランダムに抽出(元データが大きすぎるため)
df=df.sample(frac=0.1,random_state=1234)

In [32]:
#欠損値削除
df= df.dropna(how='any')

In [33]:
y_col = 'state'

x_cols = ['category','main_category','currency','country']

#説明変数を、ダミー変数にする
X = pd.get_dummies(df[x_cols])#drop_first=True

#目的変数を successfulのフラグに変更
y = pd.get_dummies(df[y_col])['successful']

In [34]:
# 全データのうち、20%をテストデータにする
test_size = 0.2

# データを分割（テストデータはランダム選択）
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=1234) 

In [35]:
#ロジスティック回帰  
clf = SGDClassifier(loss='log', penalty='none', max_iter=100, fit_intercept=True, random_state=1234)

#clf.fit(X_test, y_test) 使わない
clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=100,
              n_iter_no_change=5, n_jobs=None, penalty='none', power_t=0.5,
              random_state=1234, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [36]:
# ラベルを予測
y_est = clf.predict(X)

# 対数尤度
print('対数尤度 = {:.3f}'.format(- log_loss(y, y_est)))

# 正答率
print('正答率 = {:.3f}%'.format(100 * accuracy_score(y, y_est)))

対数尤度 = -11.187
正答率 = 67.610%


In [37]:
#混同行列を作成
tn, fp, fn, tp = confusion_matrix(y, y_est).ravel()
print(fn, fp)
print(tn, tp)

#'Accuracy、Recall、Precisionを求めて表示
print('Accuracy  = {:.3f}%'.format(100 * (tn+tp)/(tn+fp+fn+tp)))
print('Recall    = {:.3f}%'.format(100 * (tp)/(fn+tp)))
print('Precision = {:.3f}%'.format(100 * (tp)/(fp+tp))) 

8555 3591
20515 4838
Accuracy  = 67.610%
Recall    = 36.123%
Precision = 57.397%


In [38]:
mae = mean_absolute_error(y, y_est)
print("MAE = %s"%round(mae, 3))#roundで少数点以下を制御
print()

MAE = 58.271



In [39]:
#############クロスバリデーション#############

In [40]:
#ダミー変数に変換したX,yの要素を取得
X = X.values
y = y.values

In [41]:
# X = X.reshape(-1,1) # scikit-learnに入力するために整形
n_split = 5 # グループ数を設定（今回は5分割）

In [42]:
X

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [43]:
cross_valid_mae = 0
split_num = 1

In [44]:
# テスト役を交代させながら学習と評価を繰り返す #KFoldに注意
for train_idx, test_idx in KFold(n_splits=n_split, shuffle=True, random_state=1234).split(X, y):#Kfold インデックスを生成する
    X_train, y_train = X[train_idx], y[train_idx] #学習用データ
    X_test, y_test = X[test_idx], y[test_idx]     #テスト用データ
    
    # 学習用データを使って線形回帰モデルを学習
    regr = LinearRegression(fit_intercept=True)
    regr.fit(X_train, y_train)

    # テストデータに対する予測を実行
    y_pred_test = regr.predict(X_test)
    
    # テストデータに対するMAEを計算
    mae = mean_absolute_error(y_test, y_pred_test)
    print("Fold %s"%split_num)
    print("MAE = %s"%round(mae, 3))#roundで少数点以下を制御
    print()
    
    cross_valid_mae += mae #後で平均を取るためにMAEを加算
    split_num += 1

# MAEの平均値を最終的な汎化誤差値とする
final_mae = cross_valid_mae / n_split #n_slit=5=グループ数
print("Cross Validation MAE = %s"%round(final_mae, 3))

Fold 1
MAE = 1605592711.362

Fold 2
MAE = 0.417

Fold 3
MAE = 0.416

Fold 4
MAE = 0.419

Fold 5
MAE = 0.418

Cross Validation MAE = 321118542.606


In [45]:
#############   SVM   ##############

In [46]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
#from common_func import plot_decision_regions
import seaborn as sns

In [47]:
# SVMの実行
C = 1
kernel = "rbf"#ガウスカーネル
gamma = 1
clf = SVC(C=C, kernel=kernel, gamma=gamma)
clf.fit(X_train, y_train)

print("識別精度=",clf.score(X_test, y_test))


識別精度= 0.6714228563808508


In [48]:
#####正則化#####

In [49]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

In [50]:
# LassoCVを使って、正則化の強さを自動決定したモデル（estimator）を作成
estimator = LassoCV(normalize=True, cv=10)

# モデルの情報を使って特徴選択を行うためSelectFromModelを使う
# 今回は係数がしきい値（threshold）が1e-5以下である特徴を削除する
sfm = SelectFromModel(estimator, threshold=1e-5)

# fitで特徴選択を実行
sfm.fit(X_train, y_train)

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


SelectFromModel(estimator=LassoCV(alphas=None, copy_X=True, cv=10, eps=0.001,
                                  fit_intercept=True, max_iter=1000,
                                  n_alphas=100, n_jobs=None, normalize=True,
                                  positive=False, precompute='auto',
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, verbose=False),
                max_features=None, norm_order=1, prefit=False, threshold=1e-05)

In [60]:
# 削除すべき特徴の名前を取得 
removed_idx = ~sfm.get_support()
removed_idx

array([False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False,  True, False, False, False, False,  True, False,  True,
       False, False,  True, False, False, False,  True, False, False,
       False,  True,  True, False, False, False, False, False, False,
        True, False, False, False,  True, False, False,  True, False,
       False,  True, False, False,  True, False,  True, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
        True, False, False, False,  True,  True, False, False,  True,
        True,  True, False, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False, False, False,
        True,  True, False, False,  True, False, False, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False, False,

In [62]:
X = pd.get_dummies(df[x_cols])#drop_first=True

In [63]:
X.columns[removed_idx] 

Index(['category_Audio', 'category_Children's Books', 'category_Comedy',
       'category_Comics', 'category_Cookbooks', 'category_Crochet',
       'category_Design', 'category_Digital Art', 'category_Experimental',
       'category_Fantasy', 'category_Fashion', 'category_Film & Video',
       'category_Food', 'category_Footwear', 'category_Graphic Design',
       'category_Jewelry', 'category_Letterpress', 'category_Makerspaces',
       'category_Metal', 'category_Movie Theaters', 'category_Music',
       'category_Music Videos', 'category_Painting', 'category_Photography',
       'category_Poetry', 'category_Pop', 'category_Printing',
       'category_Puzzles', 'category_Quilts', 'category_Robots',
       'category_Sculpture', 'category_Small Batch', 'category_Sound',
       'category_Stationery', 'category_Taxidermy', 'category_Textiles',
       'category_Thrillers', 'category_Vegan', 'category_Wearables',
       'category_Weaving', 'category_Webcomics', 'category_Woodworking',
    