# 前処理

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [2]:
#scvファイルの読み込み
livestock = pd.read_csv("../data/livestock.csv")
cultured = pd.read_csv("../data/cultured.csv")

In [3]:
data_pre = pd.concat([livestock,cultured],sort=False)
data = data_pre.reset_index(drop=True)

In [4]:
# 新しい説明変数を作成する

#一人当たり支援額
data["一人当たり支援額"] = round(data["支援総額"]/(data["支援者数"]+1))
#最高額支援ダミー:最高支援額と支援額上限が一致するか 0:不一致 1:一致
data["最高額支援ダミー"] = 0
data.loc[data['最高支援額']==data['支援額上限'],'最高額支援ダミー'] = 1
#最低額返礼品ダミー:支援額下限と返礼品ありの下限が一致するか　0:不一致　1:一致
data["最低額返礼品ダミー"] = 0
data.loc[data['支援額下限']==data['返礼品ありの下限'],'最低額返礼品ダミー'] = 1
#自然災害ダミー:災害を示すような言葉が含まれているか
data["自然災害ダミー"] = 0
for i in range(len(data)):
    if "震災" in data["プロジェクト名"][i]:
        data["自然災害ダミー"][i] = 1
    elif "豪雨" in data["プロジェクト名"][i]:
        data["自然災害ダミー"][i] = 1
    elif "台風" in data["プロジェクト名"][i]:
        data["自然災害ダミー"][i] = 1
#畜産ダミー:畜産か耕種か　畜産:1 耕種:0
data["畜産ダミー"] = 0
data["畜産ダミー"][:110] = 1

#target値を作成　0:失敗　1:成功
data["achievement"] = data["支援総額"] // data["目標金額"]
data["target"] = 0
data.loc[data['achievement']>=1,'target'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["自然災害ダミー"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["自然災害ダミー"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["自然災害ダミー"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["畜産ダミー"][:110] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the 

In [5]:
delate_col = ["プロジェクト名","URL","CFP","支援総額","産業","最高支援額","最低支援額","achievement"
              ,"一人当たり支援額"]
data = data.drop(columns=delate_col)

In [6]:
data.head()

Unnamed: 0,目標金額,支援者数,画像数,動画数,文字数,活動報告回数,リターン種類数,支援額上限,支援額下限,返礼品ありの下限,最高額支援ダミー,最低額返礼品ダミー,自然災害ダミー,畜産ダミー,target
0,1000000,71,8,0,2566,3,9,50000,5000,5000,1,1,0,1,1
1,1000000,72,22,0,4010,5,12,100000,3000,3000,1,1,0,1,1
2,1000000,91,9,1,4244,27,12,100000,5000,5000,1,1,0,1,1
3,600000,79,10,0,3500,13,5,15000,5000,10000,0,0,0,1,1
4,1000000,116,18,0,3457,17,6,300000,3000,10000,0,0,1,1,1


In [7]:
import statsmodels.api as sm
fold_y = data["target"]
x = data[["目標金額","支援者数"]]
fold_x = sm.add_constant(x)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(fold_x,fold_y,test_size=0.2,random_state=42)

In [36]:
model = sm.Logit(y_train, X_train)
results = model.fit()
pred = results.predict(X_test)
result = [1 if i>0.5 else 0 for i in pred]
y_test_re = y_test.reset_index(drop=True)
count=0
for i in range(len(y_test)):
    if y_test_re[i] == result[i]:
        count+=1
print(count/len(y_test))

Optimization terminated successfully.
         Current function value: 0.355907
         Iterations 9
0.8776978417266187


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
stratifiedfold = StratifiedKFold(n_splits=5)
logreg = LogisticRegression()
scores = cross_val_score(logreg,fold_x,fold_y,cv=stratifiedfold)
print(f'LogisticRegression:{scores.mean()}')

LogisticRegression:0.8615160045876342


In [8]:
# def train_test(x, y, k, r):
#     idx = np.arange(0, x.shape[0])
#     idx_test = idx[np.fmod(idx, k) == r]
#     idx_train = idx[np.fmod(idx, k) != r]
#     x_test = x.iloc[idx_test,:]
#     x_train = x.iloc[idx_train,:]
#     y_test = y[idx_test]
#     y_train = y[idx_train]
#     return x_train, x_test, y_train, y_test

In [9]:
# for r in range(5):
#     x_train, x_test, y_train, y_test = train_test(fold_x, fold_y, 5, r)
#     print("r = {}".format(r))
#     print("x_train:\n{}".format(x_train))
#     print("x_test:\n{}".format(x_test))
#     print("y_train:\n{}".format(y_train))
#     print("y_test:\n{}\n".format(y_test))

In [10]:
# count = 0
# for r in range(5):
#     x_train, x_test, y_train, y_test = train_test(fold_x, fold_y, 5, r)
#     model = sm.Logit(y_train, x_train)
#     results = model.fit()
#     pred = results.predict(x_test)
#     result = [1 if i>0.5 else 0 for i in pred]
#     y_test_re = y_test.reset_index(drop=True)
#     for i in range(len(x_test)):
#         if y_test[i] == result[i]:
#             count += 1
#     print(count)

In [11]:
# type(y_test)

In [12]:
# y_test[4]

In [13]:
# model = sm.Logit(y_train, x_train)

In [14]:
# import statsmodels.api as sm
# model = sm.Logit(fold_y, fold_x)
# results = model.fit()
# print(results.params)

In [15]:
# pred = results.predict(fold_x)

In [16]:
# result=[]
# for i in pred:
#     if i > 0.5:
#         result.append(1)
#     else:
#         result.append(0)

# print(result)

In [17]:
# list = [1 if i>0.5 else 0 for i in pred]
# print(list)

# 交差検証

In [18]:
# import statsmodels.api as sm
# from sklearn.model_selection import train_test_split
# fold_y = data["target"]
# x = data[["目標金額","活動報告回数","支援者数"]]
# fold_x = sm.add_constant(x)

In [19]:
# X_train, X_test, y_train, y_test = train_test_split(fold_x,fold_y,test_size=0.2,random_state=42)

In [20]:
# model = sm.OLS(fold_y, fold_x)
# results = model.fit()
# print(results.params)

In [21]:
# # from sklearn.linear_model import LogisticRegression
# from sklearn.linear_model import LinearRegression
# model = LinearRegression()
# model.fit(x,fold_y)
# # 回帰係数
# print(model.coef_)
# # 切片 (誤差)
# print(model.intercept_)

In [22]:
# import statsmodels.api as sm
# model = sm.Logit(fold_y, fold_x)
# results = model.fit(method="ncg", maxiter=max_iter)
# print(results.params)

In [23]:
# # from sklearn.linear_model import LogisticRegression
# from sklearn.linear_model import LogisticRegression
# logreg = LogisticRegression(fit_intercept=False,penalty='none')
# logreg.fit(fold_x,fold_y)
# # 回帰係数
# print(logreg.coef_)

In [24]:
# # from sklearn.linear_model import LogisticRegression
# from sklearn.linear_model import LogisticRegression
# logreg2 = LogisticRegression(solver='newton-cg', max_iter=max_iter, fit_intercept=True, penalty='none')
# logreg2.fit(x,fold_y)
# # 回帰係数
# print(logreg2.coef_)
# print(logreg2.intercept_)

In [25]:
# n = 200
# np.random.seed(42)

# x = np.random.randint(0, 2, size=n)
# y = (x > (0.5 + np.random.normal(0, 0.5, n))).astype(int)

# display(pd.crosstab( y, x ))


# max_iter = 100


# #### Statsmodels
# # first artificially add intercept to x, as advised in the docs:
# x_ = sm.add_constant(x)
# res_sm = sm.Logit(y, x_).fit(method="ncg", maxiter=max_iter) # x_ here
# print(res_sm.params)

# #### Scikit-Learn
# res_sk = LogisticRegression(solver='newton-cg', max_iter=max_iter, fit_intercept=True, penalty='none')
# res_sk.fit( x.reshape(n, 1), y )
# print(res_sk.intercept_, res_sk.coef_)