# 資料探勘：邏輯迴歸

使用 Kaggle 資料集：

*   [titanic.zip](https://drive.google.com/file/d/12MHLc51x2JK_cYtCnynIn03KTEh61WES/view?usp=share_link)




In [None]:
# Download the Titanic dataset archive from Google Drive and unzip it.
# CO2  (NOTE(review): the meaning of this tag is not evident from the notebook — confirm or remove)
# NOTE(review): --no-check-certificate turns off TLS certificate verification
# for this download — confirm that is intended.
!wget --no-check-certificate "https://drive.google.com/uc?export=download&id=12MHLc51x2JK_cYtCnynIn03KTEh61WES" -O titanic.zip
!unzip titanic.zip

In [None]:
# Load the Titanic CSV files: the training set, the test set, and the
# gender_submission file.
import pandas as pd

df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_test = pd.read_csv(filepath_or_buffer='test.csv')
df_gender_submission = pd.read_csv(filepath_or_buffer='gender_submission.csv')

In [None]:
# Print the Age column of the training set.
print(df_train.loc[:, 'Age'])

In [None]:
# Encode sex as a 0/1 integer column: 1 where Sex is 'female', 0 otherwise.
df_train['CAT_GEN'] = (df_train['Sex'] == 'female').astype(int)

In [None]:
# Print the Sex column of the training set.
print(df_train.loc[:, 'Sex'])

In [None]:
# Print CAT_GEN next to the Sex value it was derived from, to check the encoding.
print(df_train.loc[:, ['CAT_GEN', 'Sex']])

In [None]:
# Inspect df_train.
# Some passengers have no Age recorded; those rows cannot be used, so drop them.
# .copy() makes the filtered frame independent of the frame it was selected
# from, so columns added to df_train later do not trigger pandas'
# SettingWithCopyWarning.
df_train = df_train[df_train['Age'].notnull()].copy()

df_train.head()

In [None]:
# Preview the first rows of df_test.
df_test.head(n=5)

In [None]:
# Preview the first rows of df_gender_submission.
df_gender_submission.head(n=5)

In [None]:
# Combine df_test with df_gender_submission so that rows with a missing Age
# can later be removed from the features and the labels together.
# Joining on PassengerId (instead of concatenating side by side) pairs rows by
# key rather than by position and avoids a duplicated PassengerId column;
# validate='one_to_one' raises if an id is repeated on either side.
df_new = df_test.merge(
    df_gender_submission, on='PassengerId', how='left', validate='one_to_one'
)
df_new.head()

In [None]:
# Build the new df_test: keep only rows where Age, Fare and Pclass are all
# present.
df_new = df_new[df_new['Age'].notnull() & df_new['Fare'].notnull()]
# .copy() gives df_test its own data, so adding CAT_GEN below does not write
# into a slice of df_new (which triggers pandas' SettingWithCopyWarning).
df_test = df_new[df_new['Pclass'].notnull()].copy()
# 0/1 sex encoding: 1 where Sex is 'female', 0 otherwise.
df_test['CAT_GEN'] = df_test['Sex'].eq('female').astype(int)
df_test.head()

In [None]:
# Print df_test for inspection.
print(df_test)

In [None]:
# Show the DataFrame.info() summary of df_train.
df_train.info()

## To live or NOT to live, that is the question!

In [None]:
# Which factors would you put into the model first?
# Candidate features (X):
#   (1) Age
#   (2) Sex
#   (3) Pclass
#   (4) Cabin (feel free to try adding it)
# And which column is Y?
#   Survived
# The columns actually selected below are Age, Fare, Pclass and CAT_GEN.
interest_columns = ['Age', 'Fare', 'Pclass', 'CAT_GEN']  # 'PassengerId'
X_train = df_train.loc[:, interest_columns]
Y_train = df_train.loc[:, ['Survived']]
print('Samples of X:', X_train.head(n=5))
print('Sample of Y:', Y_train.head(n=5))

In [None]:
# Print a boolean mask that is True for df_test rows whose Fare is missing.
print(df_test['Fare'].isnull())

In [None]:
# Select the same feature columns, and the Survived label, from the test set.
X_test = df_test.loc[:, interest_columns]
Y_test = df_test.loc[:, ['Survived']]
print('Samples of X:', X_test.head(n=5))
print('Sample of Y:', Y_test.head(n=5))

### 邏輯迴歸


*   建立模型（估計參數）
*   使用模型預測
*   檢查係數
*   計算殘差平方和 (Residual Sum of Squares, RSS)


In [None]:
from sklearn.linear_model import LogisticRegression

# 1. Create the logistic regression model.
model_logreg = LogisticRegression()

# Fit on the training set. Y_train is a single-column DataFrame, while
# scikit-learn expects a 1-D target, so flatten it to avoid a
# DataConversionWarning.
model_logreg.fit(X_train, Y_train.values.ravel())

# 2. Predict class labels for the test set. predict() returns labels, not
# probabilities; use predict_proba() when probabilities are needed.
y_pred = model_logreg.predict(X_test)

# 3. Inspect the fitted parameters: the intercept, then the coefficients
# (one per column of X_train, in column order).
print('B0 of LR', model_logreg.intercept_)
print('B1 of LR', model_logreg.coef_)

# Column order: ['Age', 'Fare', 'Pclass', 'CAT_GEN']

In [None]:
# 4. Compute the residual sum of squares (RSS).
from sklearn.metrics import mean_squared_error

# Training set first: RSS is the mean squared error multiplied by the number
# of samples; the model's own score is printed after it.
y_pred_train = model_logreg.predict(X_train)
print(
    'Official of RSS [training set]:',
    len(Y_train) * mean_squared_error(y_true=Y_train, y_pred=y_pred_train),
)
print('模型分數[training set]:', model_logreg.score(X=X_train, y=Y_train))

In [None]:
# Now score the test set.
y_pred_test = model_logreg.predict(X_test)

# Score the predictions computed in this cell (y_pred_test) rather than the
# y_pred variable left over from an earlier cell, so the RSS always matches
# the current model and X_test.
print('Official of RSS:', mean_squared_error(Y_test, y_pred_test) * len(Y_test))
print('模型分數:', model_logreg.score(X_test, Y_test))

## Quiz - 2: 以上建模，你執行完成了嗎？請解讀建模係數，或嘗試加入更多欄位（然後重新建模），以回答哪個是對生還機率影響最大的變因。

In [None]:
# Do you think your model predicts well?
# Could it also be improved in these ways?
# (1) Add other columns
# (2) Normalize X, Y

In [None]:
# Which X columns do you want to try in the model?
# (1) Age
# (2) Sex
# (3) Pclass
# (4) Cabin (feel free to try adding it)
# Y is still Survived.

# Encode Sex per row: 1 for 'male', 0 otherwise. This must be a vectorized
# column operation: assigning a scalar to df_train['Sex Code'] inside a loop
# over the rows overwrites the whole column on every pass and leaves a single
# constant value decided by the last row.
# Work on a copy so the new column is never written into a slice of another
# frame (avoids pandas' SettingWithCopyWarning).
df_train = df_train.copy()
df_train['Sex Code'] = df_train['Sex'].eq('male').astype(int)
# The 'Sex' column is left in df_train; only the columns listed in
# interest_columns are used for modelling.

# NOTE: 'Sex Code' (1 = male) and 'CAT_GEN' (1 = female) carry the same
# information, so their coefficients should be interpreted together.
interest_columns = ['Age', 'Sex Code', 'Pclass', 'CAT_GEN']  # 'PassengerId'
X_train = df_train[interest_columns]
Y_train = df_train[['Survived']]


print('Samples of X:', X_train.head())
print('Sample of Y:', Y_train.head())

In [None]:
# Apply the same encoding to the test set: 1 for 'male', 0 otherwise.
# This must be a vectorized column operation: assigning a scalar to
# df_test['Sex Code'] inside a loop over the rows overwrites the whole column
# on every pass and leaves a single constant value decided by the last row.
# Work on a copy so the new column is never written into a slice of another
# frame (avoids pandas' SettingWithCopyWarning).
df_test = df_test.copy()
df_test['Sex Code'] = df_test['Sex'].eq('male').astype(int)
# The 'Sex' column is left in df_test; only the columns listed in
# interest_columns are used for modelling.

X_test = df_test[interest_columns]
Y_test = df_test[['Survived']]

print('Samples of X:', X_test.head())
print('Sample of Y:', Y_test.head())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

# 1. Create the logistic regression model.
model_logreg = LogisticRegression()

# Fit on the training set. Y_train is a single-column DataFrame, while
# scikit-learn expects a 1-D target, so flatten it to avoid a
# DataConversionWarning.
model_logreg.fit(X_train, Y_train.values.ravel())

# 2. Predict class labels for the test set. predict() returns labels, not
# probabilities; use predict_proba() when probabilities are needed.
y_pred = model_logreg.predict(X_test)

# 3. Inspect the fitted parameters: the intercept, then the coefficients
# (one per column of X_train, in column order).
print('B0 of LR', model_logreg.intercept_)
print('B1 of LR', model_logreg.coef_)

# Column order: ['Age', 'Sex Code', 'Pclass', 'CAT_GEN']

# 4. Compute the residual sum of squares (RSS): the mean squared error
# multiplied by the number of samples.

# Training set first.
y_pred_train = model_logreg.predict(X_train)
print('Official of RSS [training set]:', mean_squared_error(Y_train, y_pred_train) * len(Y_train))
print('模型分數[training set]:', model_logreg.score(X_train, Y_train))


# Then the test set. The RSS is computed from y_pred_test, the predictions
# made right here, rather than from a separately named variable.
y_pred_test = model_logreg.predict(X_test)

print('Official of RSS:', mean_squared_error(Y_test, y_pred_test) * len(Y_test))
print('模型分數:', model_logreg.score(X_test, Y_test))