# 第3部 Data Types
## 第9章 欠損データ
### 9.1 NaNとは何か

In [None]:
# Show that NaN compares unequal to every value, including itself.
# NumPy 2.0 dropped the `NaN` alias, so only `nan` is imported and the
# other spellings are bound locally to the same float object; the printed
# comparison results stay the same.
from numpy import nan

NaN = NAN = nan

print("NaN は True と等しくない", NaN == True)
print("NaN は 0 と等しくない", NaN == 0)
print("NaN は '' と等しくない", NaN == "")
print("NaN は NaN と等しくない", NaN == NaN)
print("NaN は NAN と等しくない", NaN == NAN, "\n")

# Every spelling is the same NaN value, so these comparisons are False too.
print("NaN は nan と等しくない", NaN == nan)
print("nan は NAN と等しくない", nan == NAN)

In [None]:
# Check scalars for missingness with pandas' isnull / notnull helpers.
# NumPy 2.0 dropped the `NaN` alias, so only `nan` is imported and the
# other spellings are bound locally to the same float object.
from numpy import nan
import pandas as pd

NaN = NAN = nan

# isnull: each NaN spelling is reported as missing.
print("欠損値の判定", pd.isnull(NaN))
print("欠損値の判定", pd.isnull(nan))
print("欠損値の判定", pd.isnull(NAN), "\n")

# notnull: NaN is missing; an int and a non-empty string are not.
print("非欠損値の判定", pd.notnull(NaN))
print("非欠損値の判定", pd.notnull(42))
print("非欠損値の判定", pd.notnull("missing"))

In [None]:
# Build a polars Series containing NaN values and show its null masks.
# NumPy 2.0 dropped the `NaN` alias, so only `nan` is imported and the
# other spellings are bound locally to the same float object.
from numpy import nan
import polars as pl

NaN = NAN = nan

# Author's note: polars offers no scalar-level NaN check, so the values are
# wrapped in a Series first.
# NOTE(review): the list mixes floats with a str — confirm the installed
# polars version accepts this construction.
# NOTE(review): polars documents NaN as a float value distinct from null, so
# is_null() is presumably False for the NaN entries — confirm in the output.
s = pl.Series("Check missing values", [NaN, nan, NAN, "missing"])
display("元データの確認", s)
display("欠損値の判定", s.is_null())
display("非欠損値の判定", s.is_not_null())

### 9.2 欠損値はどこから来るのか
#### 9.2.1 データのロード

In [None]:
import pandas as pd

# Load the same CSV three times with different NA-handling options and
# display each result for comparison.
visited_file = "../data/survey_visited.csv"

# 1) Default parsing.
default_frame = pd.read_csv(visited_file)
display("読み込んだファイルの元データ", default_frame)

# 2) Default NA markers switched off.
raw_frame = pd.read_csv(visited_file, keep_default_na=False)
display("欠損値をNaNに変更しない", raw_frame)

# 3) Default NA markers off, but the empty string explicitly listed as NA.
blank_as_na_frame = pd.read_csv(visited_file, na_values=[""], keep_default_na=False)
display("空白をNaNにするパラメータとの競合を確認する", blank_as_na_frame)

In [None]:
import polars as pl

# Load the same CSV three times with different null-handling options and
# display each result for comparison.
visited_file = "../data/survey_visited.csv"

# (caption, extra keyword arguments for pl.read_csv), in display order.
read_variants = (
    ("読み込んだファイルの元データ", {}),
    ("欠損値をNaNに変更しない", {"missing_utf8_is_empty_string": True}),
    (
        "空白をNaNにするパラメータとの競合を確認する",
        {"null_values": [""], "missing_utf8_is_empty_string": True},
    ),
)
for caption, reader_options in read_variants:
    display(caption, pl.read_csv(visited_file, **reader_options))

#### 9.2.2 マージ後のデータ

In [None]:
import pandas as pd

# Read the two survey tables, show them, then merge them by matching
# visited.ident against survey.taken.
visited = pd.read_csv("../data/survey_visited.csv")
survey = pd.read_csv("../data/survey_survey.csv")

for caption, frame in (("訪問地の元データ", visited), ("調査地の元データ", survey)):
    display(caption, frame)

vs = pd.merge(visited, survey, left_on="ident", right_on="taken")
display("結合後のデータ", vs)

In [None]:
import polars as pl

# Read the two survey tables, show them, then join them by matching
# visited.ident against survey.taken.
visited = pl.read_csv("../data/survey_visited.csv")
survey = pl.read_csv("../data/survey_survey.csv")

for caption, frame in (("訪問地の元データ", visited), ("調査地の元データ", survey)):
    display(caption, frame)

join_keys = {"left_on": "ident", "right_on": "taken"}
vs = visited.join(survey, **join_keys)
display("結合後のデータ", vs)

#### 9.2.3 ユーザー入力

In [None]:
# Create missing values by hand: in a Series, in a DataFrame column given at
# construction time, and in a column assigned afterwards.
# NumPy 2.0 dropped the `NaN` alias, so only `nan` is imported and the
# other spellings are bound locally to the same float object.
import pandas as pd
from numpy import nan

NaN = NAN = nan

# A Series built from a dict where one value is missing.
num_legs = pd.Series({"goat": 4, "amoeba": nan})
display("元データ", num_legs)

# A DataFrame whose "missing" column holds only NaN values.
scientists = pd.DataFrame({
    "Name": ["Rosaline Franklin", "William Gosset"],
    "Occupation": ["Chemist", "Statistician"],
    "Born": ["1920-07-25", "1876-06-13"],
    "Died": ["1958-04-16", "1937-10-16"],
    "missing": [NaN, nan]
    })
display("欠損値有のDataFrame", scientists)

print("先のDataFrameのデータ型", scientists.dtypes, "\n")


# Same table built without the column; the scalar nan is then assigned to a
# new "missing" column.
scientists = pd.DataFrame({
    "Name": ["Rosaline Franklin", "William Gosset"],
    "Occupation": ["Chemist", "Statistician"],
    "Born": ["1920-07-25", "1876-06-13"],
    "Died": ["1958-04-16", "1937-10-16"]
    })
scientists["missing"] = nan
display("全要素が欠損値の列を加えたのDataFrame", scientists)

In [None]:
# polars counterpart of the hand-made missing-value examples.
# NumPy 2.0 dropped the `NaN` alias, so only `nan` is imported and the
# other spellings are bound locally to the same float object.
import polars as pl
from numpy import nan

NaN = NAN = nan

# Author's note: polars has no index, so the dict-based Series example from
# the pandas cell is left commented out.
#num_legs = pl.Series({"goat": 4, "amoeba": nan})
#display("元データ", num_legs)

# A DataFrame whose "missing" column holds only NaN values.
scientists = pl.DataFrame({
    "Name": ["Rosaline Franklin", "William Gosset"],
    "Occupation": ["Chemist", "Statistician"],
    "Born": ["1920-07-25", "1876-06-13"],
    "Died": ["1958-04-16", "1937-10-16"],
    "missing": [NaN, nan]
    })
display("欠損値有のDataFrame", scientists)

print("先のDataFrameのデータ型", scientists.dtypes, "\n")


# Same table built without the column; the column is then added explicitly.
scientists = pl.DataFrame({
    "Name": ["Rosaline Franklin", "William Gosset"],
    "Occupation": ["Chemist", "Statistician"],
    "Born": ["1920-07-25", "1876-06-13"],
    "Died": ["1958-04-16", "1937-10-16"]
    })
# pandas form, kept for comparison: scientists["missing"] = nan
# Here a Series with one nan per row is attached via with_columns instead.
scientists = (
    scientists.with_columns([
        pl.Series("missing", [nan] * len(scientists))
    ])
)
display("全要素が欠損値の列を加えたのDataFrame", scientists)

#### 9.2.4 インデックスの振りなおし
polarsにはインデックスが無いので省略
### 9.3 欠損データの扱い
#### 9.3.1 欠損データを数える

In [None]:
import pandas as pd
import numpy as np

# Several ways of counting missing and non-missing values in the Ebola
# time-series data.
ebola = pd.read_csv("../data/country_timeseries.csv")

# Per-column count of non-missing values.
display("非欠損値の総数を求める", ebola.count())

# Missing per column = number of rows minus non-missing per column.
num_rows = len(ebola)
num_missing = num_rows - ebola.count()
display("欠損データの数", num_missing)

# Count the True entries of the null mask: whole frame first, then one column.
guinea_cases = ebola["Cases_Guinea"]
display("データに含まれている欠損値の数", np.count_nonzero(ebola.isnull()))
display("データに含まれている欠損値の数", np.count_nonzero(guinea_cases.isnull()))

# Frequency table that keeps NaN as its own entry.
cnts = guinea_cases.value_counts(dropna=False)
display("データに含まれている欠損値の数", cnts)

# Summing the boolean null mask also yields the number of missing values.
display("欠損値の数", guinea_cases.isnull().sum())

In [None]:
import polars as pl
import numpy as np

# polars counterpart of the missing-value counting examples.
ebola = pl.read_csv("../data/country_timeseries.csv")

# Author's note: no not-null count was found in polars, so the pandas
# count()-based steps are not ported.
# NOTE(review): newer polars releases may provide DataFrame.count() — confirm.
#display("非欠損値の総数を求める", ebola.count())
#num_rows = ebola.shape[0]
#num_missing = num_rows - ebola.count()

# Per-column null counts come directly from null_count().
display("欠損データの数", ebola.null_count())

# Author's note: the NumPy-based counting was not understood well enough to
# port, so it stays commented out.
#display("データに含まれている欠損値の数", np.count_nonzero(ebola.is_null()))
#display("データに含まれている欠損値の数", np.count_nonzero(ebola["Cases_Guinea"].is_null()))

# Frequency table of one column, sorted by count.
guinea_cases = ebola["Cases_Guinea"]
cnts = guinea_cases.value_counts(sort=True)
display("データに含まれている欠損値の数", cnts)

# Summing the boolean null mask yields the number of nulls in the column.
display("欠損値の数", guinea_cases.is_null().sum())

#### 9.3.2 欠損データのクリーニング
#### 9.3.2.1 欠損値の変換

In [None]:
import pandas as pd

# Replace every missing value with 0 and show the first five columns.
ebola = pd.read_csv("../data/country_timeseries.csv")
zero_filled = ebola.fillna(0)
display("欠損値を 0 で埋め、最初の5列を見る", zero_filled.iloc[:, :5])

#### 9.3.2.2 前の値で置換する(Forward Fill)

In [None]:
import pandas as pd

# Forward fill: each missing value takes the last valid value above it in the
# same column; the first five columns are shown.
# DataFrame.ffill() is used because fillna(method="ffill") is deprecated in
# recent pandas releases.
ebola = pd.read_csv("../data/country_timeseries.csv")
display("欠損値を 前方の値 で埋め、最初の5列を見る", ebola.ffill().iloc[:, 0:5])

In [None]:
import polars as pl

# Forward fill with polars: nulls are filled using the "forward" strategy,
# then the first five columns are shown.
ebola = pl.read_csv("../data/country_timeseries.csv")
forward_filled = ebola.fill_null(strategy="forward")
display("欠損値を 前方の値 で埋め、最初の5列を見る", forward_filled[:, 0:5])

#### 9.3.2.3 後の値で置換する(Backward Fill)

In [None]:
import pandas as pd

# Backward fill: each missing value takes the next valid value below it in
# the same column; the first five columns are shown.
# DataFrame.bfill() is used because fillna(method="bfill") is deprecated in
# recent pandas releases. The caption now says "backward" (後方); it had been
# copied unchanged from the forward-fill cell.
ebola = pd.read_csv("../data/country_timeseries.csv")
display("欠損値を 後方の値 で埋め、最初の5列を見る", ebola.bfill().iloc[:, 0:5])

In [None]:
import polars as pl

# Backward fill with polars: nulls are filled using the "backward" strategy,
# then the first five columns are shown.
# The caption now says "backward" (後方); it had been copied unchanged from
# the forward-fill cell.
ebola = pl.read_csv("../data/country_timeseries.csv")
display("欠損値を 後方の値 で埋め、最初の5列を見る", ebola.fill_null(strategy = "backward")[:, 0:5])

#### 9.3.2.4 補間

In [None]:
import pandas as pd

# Fill missing values by interpolation (interpolate() with its defaults) and
# show the first five columns.
# The caption now says "interpolation" (補間); it had been copied unchanged
# from the forward-fill cell.
# NOTE(review): recent pandas versions may warn or raise when interpolating a
# frame that contains object-dtype columns — confirm with the installed version.
ebola = pd.read_csv("../data/country_timeseries.csv")
display("欠損値を 補間 で埋め、最初の5列を見る", ebola.interpolate().iloc[:, 0:5])

In [None]:
import polars as pl

# Fill nulls by interpolation (interpolate() with its defaults) and show the
# first five columns.
# The caption now says "interpolation" (補間); it had been copied unchanged
# from the forward-fill cell.
ebola = pl.read_csv("../data/country_timeseries.csv")
display("欠損値を 補間 で埋め、最初の5列を見る", ebola.interpolate()[:, 0:5])

#### 9.3.3 欠損データのある計算

In [None]:
import pandas as pd

# Arithmetic on columns that contain missing values.
ebola = pd.read_csv("../data/country_timeseries.csv")

# Element-wise sum of three country columns, stored as a new column.
guinea_cases = ebola["Cases_Guinea"]
liberia_cases = ebola["Cases_Liberia"]
sierra_leone_cases = ebola["Cases_SierraLeone"]
ebola["Cases_multiple"] = guinea_cases + liberia_cases + sierra_leone_cases

# Keep only the three inputs and their total for inspection.
subset_columns = [
    "Cases_Guinea",
    "Cases_Liberia",
    "Cases_SierraLeone",
    "Cases_multiple",
]
ebola_subset = ebola.loc[:, subset_columns]
display("複数の地域における出現回数を含む新しい列を含むDataFrameの先頭10行", ebola_subset.head(n=10))

# Column total with and without skipping missing values.
print("欠損値を省略して、出現回数の和を計算する", guinea_cases.sum(skipna=True))
print("欠損値を省略せず、出現回数の和を計算する", guinea_cases.sum(skipna=False))

In [None]:
import polars as pl

# polars counterpart of the arithmetic-with-missing-data example.
ebola = pl.read_csv("../data/country_timeseries.csv")

# Element-wise sum of three country columns, attached as "Cases_multiple".
case_total = (
    ebola["Cases_Guinea"]
    + ebola["Cases_Liberia"]
    + ebola["Cases_SierraLeone"]
).alias("Cases_multiple")
ebola = ebola.with_columns([case_total])

# Keep only the three inputs and their total for inspection.
subset_columns = [
    "Cases_Guinea",
    "Cases_Liberia",
    "Cases_SierraLeone",
    "Cases_multiple",
]
ebola_subset = ebola[:, subset_columns]
display("複数の地域における出現回数を含む新しい列を含むDataFrameの先頭10行", ebola_subset.head(n=10))

# Author's note: polars' sum() has no skipna option, so only the plain sum is
# printed and the skipna=False variant from the pandas cell is not ported.
print("欠損値を省略して、出現回数の和を計算する", ebola["Cases_Guinea"].sum())
#print("欠損値を省略せず、出現回数の和を計算する", ebola["Cases_Guinea"].sum(skipna = False))

### 9.4 pandas組込みのNA(欠損値)
省略
### 9.5 まとめ
省略