In [29]:
import dicom
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
import glob
import random
%matplotlib inline

# The objective is to analyse the metadata of the DICOM images and see what we can learn from it.

We first extract every field except a handful whose meaning we already know and whose values are unique to each file or patient. Then, for each field that takes more than one value, we try to explain what the field means, since such fields may require some preprocessing.

In [None]:
from collections import defaultdict

# Fields that are deliberately ignored: their meaning is already known and
# their values are (nearly) unique per file, so counting them is pointless.
unique_fields = ["PixelData", "ImagePositionPatient", "InstanceNumber", "PatientID", "PatientName", "SOPInstanceUID", "SliceLocation", "SeriesInstanceUID"]

# value_dict[field][str(value)] -> number of files in which `field` had `value`.
value_dict = defaultdict(lambda: defaultdict(int))

all_files = glob.glob('stage1/*/*.dcm')
nb_done = 0  # stays 0 when no file matches the pattern
for nb_done, dcm_path in enumerate(all_files, 1):
    if nb_done % 1000 == 0:
        # Single-argument print(...) works identically on Python 2 and 3.
        print("done %s out of %s" % (nb_done, len(all_files)))
    # Only the header is needed: PixelData is in unique_fields anyway, so
    # skip decoding the (large) pixel payload of every slice.
    # NOTE(review): elements stored after PixelData, if any, are no longer
    # read - confirm none are of interest.
    plan = dicom.read_file(dcm_path, stop_before_pixels=True)
    # dir("") is called with an empty filter, i.e. no name restriction.
    for field in plan.dir(""):
        if field not in unique_fields:
            # Values are stringified so that lists/multi-values are hashable.
            value_dict[field][str(getattr(plan, field))] += 1



In [128]:
import pprint

# One line per field: "<field name>: <number of distinct values seen>".
# print(...) with a single argument behaves the same on Python 2 and 3.
for field, counts in value_dict.items():
    print("%s: %s" % (field, len(counts)))

FrameIncrementPointer: 1
NumberOfFrames: 1
Rows: 1
PatientOrientation: 2
TemporalPositionIndex: 1
Allergies: 1
VolumetricProperties: 1
BurnedInAnnotation: 1
PhotometricInterpretation: 1
ImageFormat: 1
ImageLocation: 1
PositionReferenceIndicator: 2
SpecificCharacterSet: 1
RescaleType: 2
Columns: 1
PixelSpacing: 332
LargestImagePixelValue: 845
PlanarConfiguration: 1
LongitudinalTemporalInformationModified: 1
WindowCenter: 58
SeriesNumber: 43
PixelPaddingValue: 7
ImageOrientationPatient: 3
HighBit: 2
AcquisitionNumber: 7
RescaleSlope: 6
SourceImageSequence: 1
SmallestImagePixelValue: 3
PregnancyStatus: 1
BitsStored: 2
SamplesPerPixel: 1
WindowWidth: 52
LossyImageCompression: 2
LossyImageCompressionRatio: 275
SeriesDescription: 1
ImageDimensions: 1
StudyInstanceUID: 1595
SOPClassUID: 1
CompressionCode: 1
BitsAllocated: 1
PixelAspectRatio: 1
RescaleIntercept: 8
PatientBirthDate: 1
FrameOfReferenceUID: 1595
KVP: 1
Laterality: 1
PixelRepresentation: 2
Modality: 1
WindowCenterWidthExplanation:

In [146]:
# A cell only renders its last expression, so the PatientOrientation counts
# are pretty-printed explicitly (the same pattern the WindowWidth and
# RescaleSlope cells use) instead of being silently discarded.
pprint.pprint(dict(value_dict["PatientOrientation"]))
value_dict["ImageOrientationPatient"]

defaultdict(int,
            {"['1', '0', '0', '0', '1', '0']": 109075,
             "['1.0', '0.0', '0.0', '0.0', '1.0', '0.0']": 130,
             "['1.000000', '0.000000', '0.000000', '0.000000', '1.000000', '0.000000']": 176175})

Patient Orientation describes how the patient is oriented. ['L', 'P'] signifies Left Posterior and is the classic way of orienting. The fact that this field is otherwise missing seems to signify that all images do have this orientation. This [blog article](http://dicomiseasy.blogspot.fr/2013/06/getting-oriented-using-image-plane.html) was helpful for understanding orientation. Image orientation is also the same for every image: the three distinct strings above are the same direction cosines (1, 0, 0, 0, 1, 0) written with different numeric formatting.

In [158]:
# The FrameOfReferenceUID lookup is evaluated but not shown: a cell only
# renders its last expression.
value_dict["FrameOfReferenceUID"]
# Rendered result: count of files per distinct PositionReferenceIndicator value.
value_dict["PositionReferenceIndicator"]


defaultdict(int, {'': 121353, 'SN': 159323})

According to this [link's](http://dicom.nema.org/medical/Dicom/2015a/output/chtml/part03/sect_C.7.4.html#sect_C.7.4.1.1.2) definition, this is the anatomical point which was used as a reference (i.e., the first point). SN seems to mean suprasternal notch, and is the starting point of the scans. The Frame of Reference UID is the identifier of the frame of reference (coordinate system) that this reference point is associated with.

In [133]:
# Count of files per distinct RescaleType string.
value_dict["RescaleType"]

defaultdict(int, {'HU': 15800, 'US': 1349})

"US" is for unspecified, so we can consider that everything is in HU.

In [134]:
# Count of files per distinct PixelSpacing value (values were stringified
# when collected, so the keys are the str() of each two-element list).
value_dict["PixelSpacing"]

defaultdict(int,
            {"['0.488281', '0.488281']": 154,
             "['0.490234', '0.490234']": 247,
             "['0.494141', '0.494141']": 313,
             "['0.49804688', '0.49804688']": 156,
             "['0.501953', '0.501953']": 250,
             "['0.505859', '0.505859']": 177,
             "['0.507812', '0.507812']": 320,
             "['0.512', '0.512']": 154,
             "['0.513672', '0.513672']": 251,
             "['0.515625', '0.515625']": 489,
             "['0.519531', '0.519531']": 149,
             "['0.523438', '0.523438']": 110,
             "['0.52734375', '0.52734375']": 454,
             "['0.527344', '0.527344']": 1546,
             "['0.529296875', '0.529296875']": 252,
             "['0.531', '0.531']": 150,
             "['0.533203', '0.533203']": 346,
             "['0.534', '0.534']": 146,
             "['0.535156', '0.535156']": 190,
             "['0.537109', '0.537109']": 274,
             "['0.537109375', '0.537109375']": 217,
             "

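Pixel Spacing is the physical distance between the centres of adjacent pixels, given as a pair: spacing between rows, then spacing between columns (in mm).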

In [142]:
# Count of files per distinct LargestImagePixelValue (keys are strings).
value_dict["LargestImagePixelValue"]

defaultdict(int,
            {'1808': 1,
             '1848': 1,
             '1876': 1,
             '1885': 1,
             '1891': 1,
             '1894': 2,
             '1895': 1,
             '1898': 1,
             '1901': 1,
             '1903': 298,
             '1906': 1,
             '1913': 312,
             '1915': 1,
             '1916': 1,
             '1920': 1,
             '1924': 1,
             '1929': 1,
             '1933': 344,
             '1935': 1,
             '1936': 1,
             '1937': 1,
             '1940': 4,
             '1946': 1,
             '1948': 1,
             '1956': 1,
             '1958': 1,
             '1961': 1,
             '1963': 1,
             '1964': 1,
             '1965': 3,
             '1969': 336,
             '1970': 1,
             '1971': 1,
             '1975': 1,
             '1976': 1,
             '1977': 338,
             '1979': 2,
             '1980': 1,
             '1981': 300,
             '1982': 2,
           

The maximum actual pixel value encountered in this image.

In [160]:
# The WindowCenterWidthExplanation lookup is evaluated but not shown: a cell
# only renders its last expression.
value_dict["WindowCenterWidthExplanation"]
# WindowWidth is pretty-printed explicitly so that it appears in the output
# together with the WindowCenter counts, which are the cell's rendered result.
pprint.pprint(dict(value_dict["WindowWidth"]))
value_dict["WindowCenter"]

{'1000': 3660,
 '1500': 19682,
 '1500.000000': 7811,
 '1600': 4506,
 '1800': 3941,
 '2000': 1022,
 '350': 18423,
 '380': 948,
 '400': 108216,
 '400.0': 130,
 '400.000000': 5512,
 '500': 727,
 "['00360', '00360']": 6218,
 "['01500', '01500']": 507,
 "['01600', '01600']": 6026,
 "['1200', '350']": 2960,
 "['1500', '1200']": 3400,
 "['1500', '1500']": 3378,
 "['1500', '2']": 9023,
 "['1500', '350']": 910,
 "['1500', '400']": 743,
 "['1500.000000', '350.000000']": 4905,
 "['1600', '2']": 954,
 "['1600', '400']": 5202,
 "['1600.000000', '400.000000']": 151,
 "['1800', '1800']": 109,
 "['2000', '1500']": 553,
 "['2000', '3200']": 203,
 "['2000', '350']": 4393,
 "['2000', '400']": 3457,
 "['2500', '340']": 168,
 "['2500', '350']": 1623,
 "['300', '1500']": 1654,
 "['342', '400']": 187,
 "['350', '1500']": 2380,
 "['350', '2000']": 148,
 "['350', '2500']": 12398,
 "['350', '340']": 169,
 "['350', '350']": 827,
 "['350', '36']": 801,
 "['356', '2500']": 90,
 "['360', '1500']": 174,
 "['360', '4

defaultdict(int,
            {'-400': 4942,
             '-500': 5245,
             '-600': 13596,
             '-650': 8547,
             '-650.000000': 7811,
             '-700': 481,
             '20': 6348,
             '35': 1517,
             '40': 118774,
             '40.0': 130,
             '40.000000': 5512,
             '45': 948,
             '50': 727,
             "['-0150', '-0150']": 339,
             "['-0500', '-0500']": 168,
             "['-0600', '-0600']": 6026,
             "['-400', '0']": 954,
             "['-428.000000', '40.000000']": 151,
             "['-500', '-600']": 3400,
             "['-500', '300']": 361,
             "['-500', '40']": 420,
             "['-500', '50']": 2533,
             "['-500.000000', '50.000000']": 4905,
             "['-550', '40']": 2061,
             "['-550', '50']": 3141,
             "['-600', '-600']": 109,
             "['-600', '30']": 3311,
             "['-600', '40']": 4539,
             "['-600', '50']": 2960,
  

Window center and window width are parameters used to view the image. Once the pixels are already in Hounsfield units, they define the range of values that is actually displayed: values outside the window are clipped. When two values are given, it means that there are two ways of preprocessing the image (two alternative windows). For more details on how to compute a visualisation from the window center, you can check out this [link](https://www.dabsoft.ch/dicom/3/C.11.2.1.2/). For the double-valued case, you can check out this [stackoverflow question](http://stackoverflow.com/questions/10088701/dicom-window-center-window-width). WindowCenterWidthExplanation is supposed to be on the same subject.

In [None]:
# Only the last expression of a cell is rendered: the SeriesNumber and
# AcquisitionNumber lookups below are evaluated but not shown.
value_dict["SeriesNumber"]
value_dict["AcquisitionNumber"]
# Rendered result: count of files per distinct StudyInstanceUID.
value_dict["StudyInstanceUID"]

Identification data: these are important identifiers from the [DICOM model](http://dicomiseasy.blogspot.fr/2011/12/chapter-4-dicom-objects-in-chapter-3.html).

In [145]:
# Count of files per distinct PixelPaddingValue (keys are the str() of the
# raw attribute value).
value_dict["PixelPaddingValue"]

defaultdict(int,
            {'\x00\x00': 598,
             '-1024': 6045,
             '-2000': 17920,
             '0': 10299,
             '0\xf8': 52950,
             '63536': 70891,
             '8240': 620})

According to this [link](https://www.dabsoft.ch/dicom/3/C.7.5.1.1.2/), this is the value given to the pixels that lie outside the actual image (padding).

In [152]:
# HighBit is printed explicitly because only the last expression of a cell is
# rendered; print(...) with one argument behaves the same on Python 2 and 3.
print(value_dict["HighBit"])
value_dict["BitsStored"]

defaultdict(<type 'int'>, {'11': 116287, '15': 169093})


defaultdict(int, {'12': 116287, '16': 169093})

High Bit (0028,0102) defines how the bits stored are aligned inside the bits allocated. It is the bit number (the first bit is bit 0) of the last bit used. In the standard it is always set to one less than the bits stored, but hypothetically it doesn't have to be that way. In our data the high bit is either 11 (12 bits stored) or 15 (16 bits stored); 11 is the usual value in CT. Here's an image from the DICOM standard that shows how pixels are arranged bit-wise.

In [150]:
# RescaleSlope is pretty-printed explicitly so that it appears in the output
# together with the RescaleIntercept counts, which are the rendered result.
pprint.pprint(dict(value_dict["RescaleSlope"]))
value_dict["RescaleIntercept"]

{'1': 251930,
 '1.0': 130,
 '1.000000': 29395,
 '1.000244': 2974,
 '1.0002441': 553,
 '2': 398}


defaultdict(int,
            {'-1000': 6533,
             '-1000.000000': 149,
             '-1024': 221577,
             '-1024.0': 130,
             '-1024.000000': 30750,
             '-2048': 481,
             '-2048.000000': 1470,
             '0': 24290})

According to [this blog post](https://blog.kitware.com/dicom-rescale-intercept-rescale-slope-and-itk/), the rescale slope and intercept are used in order to be able to store the data as unsigned integers. A transformation using them is necessary in order to go back to Hounsfield units: `HU = RescaleSlope * stored_value + RescaleIntercept`.

In [None]:
# LossyImageCompression is pretty-printed explicitly so that it appears in the
# output together with the LossyImageCompressionRatio counts, which are the
# cell's rendered result.
pprint.pprint(dict(value_dict["LossyImageCompression"]))
value_dict["LossyImageCompressionRatio"]

Some images may have undergone lossy compression. These fields record that process. It happened to very few images.

In [162]:
# Count of files per distinct PixelRepresentation value (keys are strings).
value_dict["PixelRepresentation"]

defaultdict(int, {'0': 116287, '1': 169093})

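Pixel Representation tells how the pixel values are encoded: 0 means unsigned integers, 1 means signed (two's complement) integers. See the [Image Pixel module definition](http://dicom.nema.org/medical/dicom/2014c/output/chtml/part03/sect_C.7.6.3.html).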