<a href="https://colab.research.google.com/github/hiro-Anzai/tigerfish/blob/master/hello_python_No2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **第二回～機械学習の基礎**

# 1.CSVを読み込んで内容を確認しましょう

In [60]:
#データ加工・処理
import pandas as pd 
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

##1.1 CSVファイル読み込みます

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
#trainデータを読み込む
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
#testデータを読み込む
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


##1.2 配列の形状

In [5]:
#配列の形状を確認
test_shape = test.shape
train_shape = train.shape
print(test_shape)
print(train_shape)

(418, 11)
(891, 12)


## 1.3基本統計量

In [6]:
#基本統計量
test.describe()
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# 2.データセットの欠損の確認

In [7]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# 3.前処理（データセットの事前処理）

## 3.0不要なカラムの削除

In [9]:
#不要なカラムの削除
train = train.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
test = test.drop(["PassengerId","Name", "Ticket", "Cabin"], axis=1)

## 3.1欠損データを代理データに入れ替える

In [11]:
#埋める前ではS,C,Q,nanの4種類あることを確認
train["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [12]:
#Embarkedの欠損を最頻値で埋める
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode().iloc[0])

In [14]:
#埋めた後ではS,C,Qの3種類あることを確認
train["Embarked"].unique()

array(['S', 'C', 'Q'], dtype=object)

In [13]:
#Ageの欠損を中央値で埋める
train["Age"] = train["Age"].fillna(train["Age"].median())
test["Age"] = test["Age"].fillna(test["Age"].median())

In [15]:
#欠損値がなくなったか確認
train["Age"].isnull().sum()

0

In [16]:
#欠損値がなくなったか確認
test["Age"].isnull().sum()

0

In [17]:
#Fareを中央値で埋める
test["Fare"] = test["Fare"].fillna(test["Fare"].median()) 

In [18]:
#欠損がなくなったか確認
test["Fare"].isnull().sum()

0

In [20]:
train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [21]:
test.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

##3.2 文字列カテゴリカルデータを数字へ変換

In [23]:
#データ型の確認
train.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [24]:
#naleを0に、femaleを1に変換
train["Sex"] = train["Sex"].map({"male":0, "female":1})
test["Sex"] = test["Sex"].map({"male":0, "female":1})

In [25]:
#Embarkedをダミー変数化
train = pd.get_dummies(train, columns=["Embarked"])
test = pd.get_dummies(test, columns=["Embarked"])

In [26]:
#データ型の確認
train.dtypes

Survived        int64
Pclass          int64
Sex             int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Embarked_C      uint8
Embarked_Q      uint8
Embarked_S      uint8
dtype: object

## 3.3訓練データとテストデータの分割

In [48]:
#訓練データとテストデータに分割
train_set, test_set = train_test_split(train, test_size=0.2, random_state=1)

x_train = train_set.iloc[:, 1:]
y_train = train_set.iloc[:, 0]

x_test = test_set.iloc[:, 1:]
y_test = test_set.iloc[:, 0]

# 4.予測モデル その1 「決定木」

In [49]:
#決定木モデルの作成
k=5
clf1 = DecisionTreeClassifier(max_depth=k)
clf1 = clf1.fit(x_train, y_train)

In [50]:
#テストデータの正解率
print('正解率(train):{:.3f}'.format(clf1.score(x_train, y_train)))

正解率(train):0.862


In [51]:
#テストデータの正解率
print('正解率(test):{:.3f}'.format(clf1.score(x_test, y_test)))

正解率(test):0.777


**演習問題1**
max_depthの値を変えて精度の変化を調べてみましょう

# 5.予測モデル その2 「kNN」

In [52]:
#データやモデルを構築するためのライブラリ等のインポート
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

In [57]:
#学習
k=5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [58]:
print('正解率(train):{:.3f}'.format(knn.score(x_train, y_train)))

正解率(train):0.806


In [59]:
print('正解率(test):{:.3f}'.format(knn.score(x_test, y_test)))

正解率(test):0.682


演習問題2 n_neighborsの値を変えて精度の変化を調べてみましょう