# Great expectations

In [1]:
import great_expectations as ge

In [2]:
# export
from fastscript import call_parse, Param, bool_arg
from scipy import ndimage

import h5py
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pydicom
import scipy.io as spio

In [3]:
from I2T2.io import dicom_dataframe

In [4]:
import great_expectations as ge

In [5]:
# Directory holding the DICOM files to load; a relative path, so it resolves against the notebook's working directory.
data_path = '../data/knee/'

### Create a pandas DataFrame with DICOM tag names as columns and one row per image; each cell holds the value of the corresponding element

In [6]:
# Load the DICOM files under `data_path`; the 'DS' column of the resulting
# dataframe holds one dataset object per file.
dcm_df = dicom_dataframe(path_to_dicom_dir=data_path, dicom_extension='dcm')
datasets = list(dcm_df.dataframe['DS'])

# Build one record per image, keyed by element name and holding the element's
# value. Keying by name (instead of by position within the dataset) keeps the
# columns aligned even when files do not all carry the same set of tags, and
# avoids writing through `iterrows()` rows, which pandas does not guarantee to
# propagate back to the frame.
records = []
for ds in datasets:
    record = {}
    for elem in ds:
        key = elem.name
        if key in record:
            # Two different tags can share a display name; append the tag so
            # neither value is silently overwritten.
            key = '{} {}'.format(elem.name, elem.tag)
        record[key] = elem.value
    records.append(record)

# dtype=object keeps the raw element values as-is (no numeric coercion),
# matching the all-object frame this cell produced before.
all_columns_pd = pd.DataFrame(records, dtype=object)

# Add the decoded pixel array, which is not part of the original DS elements.
# Filling an object array slot by slot stores each image as a single cell
# rather than letting numpy/pandas expand it into extra dimensions.
pixel_arrays = np.empty(len(datasets), dtype=object)
for position, ds in enumerate(datasets):
    pixel_arrays[position] = ds.pixel_array
all_columns_pd['pixel_array'] = pixel_arrays

### Create expectations DF

In [7]:
# Wrap the pandas DataFrame with Great Expectations so the `expect_*` methods below can be called on it.
my_ge_df = ge.from_pandas(all_columns_pd)

In [8]:
# Raise pandas' row display limit so the first record below is shown in full
# rather than being truncated, then display that record.
pd.set_option('display.max_rows', 999)
my_ge_df.iloc[0]

Specific Character Set                                                      ISO_IR 100
Image Type                                                  [ORIGINAL, PRIMARY, OTHER]
Instance Creation Date                                                        20070101
Instance Creation Time                                                   120000.000000
SOP Class UID                                                1.2.840.10008.5.1.4.1.1.4
SOP Instance UID                     1.2.826.0.1.3680043.8.1055.1.20111103111208347...
Study Date                                                                    20070101
Series Date                                                                   20070101
Acquisition Date                                                              20070101
Content Date                                                                  20070101
Acquisition DateTime                                                    20070101120000
Study Time                                 

### Test some expectations

In [9]:
# Run the identical-values expectation on the 'Instance Creation Date' column.
my_ge_df.expect_column_values_to_be_identical('Instance Creation Date')

{
  "meta": {},
  "success": true,
  "result": {
    "element_count": 22,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "exception_info": null
}

In [10]:
# Run the identical-values expectation on the 'SOP Class UID' column.
my_ge_df.expect_column_values_to_be_identical('SOP Class UID')

{
  "meta": {},
  "success": true,
  "result": {
    "element_count": 22,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "exception_info": null
}

In [11]:
# Check that the 'Study Description' values match the regex 'Knee'.
my_ge_df.expect_column_values_to_match_regex(column='Study Description',regex='Knee')

{
  "meta": {},
  "success": true,
  "result": {
    "element_count": 22,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "exception_info": null
}

In [12]:
# Disabled: `expect_column_values_to_have_shape` has not been added yet (see the Issues list below).
# NOTE(review): `column` is given as a list here, whereas the working calls above pass a plain string — confirm.
# my_ge_df.expect_column_values_to_have_shape(column=['pixel_array'], shape=(512,512))

In [22]:
# Disabled: presumably hits the list-valued-cell failure described in the Issues list below — TODO confirm.
# NOTE(review): `column` is given as a list here, whereas the working regex call above passes a plain string — confirm.
# my_ge_df.expect_column_values_to_match_regex(column=['Scan Options'], regex='FS')

### Issues
- ~~Add `expect_column_values_to_be_identical`~~
- Add `expect_column_values_to_have_shape`
- If a DataFrame cell contains a list (e.g. a multi-valued element such as `Image Type`), expectations like `expect_column_values_to_match_regex` fail