# M14-Linear Algebra and Text Processing


In [268]:
from nose.tools import assert_almost_equal, assert_true, assert_equal, assert_raises
from numbers import Number

In [269]:

import numpy as np
import locale
import os
import re
import pandas as pd
from textblob import TextBlob

## **Problem 1 (30 points).** Compute the Euclidean norm for each of the following vectors
$$v_1= \begin{bmatrix}5\\-1\\1\\0\\-5\end{bmatrix}$$

In [270]:
from numpy.linalg import norm
import math
v1a = np.array([5,-1,1,0,-5])
norm_v1 = norm(v1a)
norm_v1

7.2111025509279782

In [271]:
assert_almost_equal(norm_v1, 7.2111025509279782)

$$v_2= \begin{bmatrix}2\\ 3\\ 9\\ 4\end{bmatrix}$$

In [272]:
from numpy.linalg import norm
import math
v2a = np.array([2, 3, 9, 4])
norm_v2 = norm(v2a)
norm_v2

10.488088481701515

In [273]:
assert_almost_equal(norm_v2, 10.488088481701515)

$$v_3=\begin{bmatrix}-2, -3, -7, -5\end{bmatrix}^T$$

In [274]:
from numpy.linalg import norm
import math
v3a = np.array([-2, -3, -7, -5])
norm_v3 = norm(v3a)
norm_v3


9.3273790530888157

In [275]:
assert_almost_equal(norm_v3, 9.3273790530888157)

$$v_4=\begin{bmatrix}-10,   6,   8,   0,  -8,   7\end{bmatrix}^T$$

In [276]:
from numpy.linalg import norm
import math
v4a = np.array([-10,   6,   8,   0,  -8,   7])
norm_v4 = norm(v4a)
norm_v4

17.691806012954132

In [277]:
assert_almost_equal(norm_v4, 17.691806012954132)

$$v_5=\begin{bmatrix}6,  -5,  -7, -10,  -6,  -2,  -2\end{bmatrix}^T$$

In [278]:
from numpy.linalg import norm
import math
v5a = np.array([6,  -5,  -7, -10,  -6,  -2,  -2])
norm_v5 = norm(v5a)
norm_v5

15.937377450509228

In [279]:
assert_almost_equal(norm_v5, 15.937377450509228)

$$v_6= \begin{bmatrix}4\\  2\\ -8\end{bmatrix}$$

In [280]:
from numpy.linalg import norm
import math
v6a = np.array([4, 2, -8])
norm_v6 = norm(v6a)
norm_v6

9.1651513899116797

In [281]:
assert_almost_equal(norm_v6, 9.1651513899116797)

## Problem #2 (10 points)

One of the limitations of word vectors as we have pictured them is [sparsity](https://en.wikipedia.org/wiki/Sparse_array): while our vocabulary is large (tens or hundreds of thousands of words), a typical document (e.g. radiology report) only has tens or hundreds of unique words. Write a class (`sparsev`) that inherits from a `defaultdict` to represent a "sparse" vector. The keys would be the indicies to a vector and the values would be the word counts (how many times that word occurred in the document). The class should have an attribute `self.__dim` that indicates the dimension of the vector space (e.g. the dimension of the vocabulary). The class should have a property `dim` that returns the value in `__dim` the instance of `sparsev` represents. You should define the following methods for the class:

1. `norm`: Accepts as an argument a number `p` (default value=2) and computes the [p-norm](https://en.wikipedia.org/wiki/Norm_(mathematics)#p-norm) of the vector.
    1. If `p` is not a number, raise a `ValueError`.
    1. If $p \le 0$, raise a `ValueError`.
1. `cosine_sim`: Accepts as an argument an instance of a `sparsev`.
    1. If the two `sparsev` instances do not have the same `dim` raise a ValueError
    1. If you get a `ZeroDivisionError`, return `np.nan`
    1. Note that the range of `arccos` is -1 to 1. Because of floating point arithmetic issues, you may have arguments that should be equal to 1 but are slightly larger. This will result in `arccos` returning a `np.nan`. In these cases, return a 0 (zero).
1. a `__str__` method that shows the dimension of the vector as well as the elements (key/value pairs).

In [282]:
from collections import defaultdict
class sparsev(defaultdict):
    def __init__(self, *args, dim=1, **kwargs):
        self.__dim = dim
        super(sparsev, self).__init__(*args, **kwargs)

        
        
    @property
    def dim(self):
        return self.__dim
    def norm(self, p=2):
        if not isinstance(p, Number):
            raise TypeError("p must be a number")
        if p < 0:
            raise ValueError("p must be greater than zero")
        if p == np.inf:
            return max([math.abs(vv) for vv in self.values()])
        else:
            return math.pow(np.sum([math.pow(vv,p) for vv in self.values()]),1/p)
    def inner(self, v2):
        if self.dim != v2.dim:
            raise ValueError("%d !=%d: the sparse vectors must have the same dimension"%(self.dim, v2.dim))
        keys = set(self.keys()).intersection(v2.keys())
        return np.sum([self[k]*v2[k] for k in keys])
    def cosine_sim(self, v2):
        try:
            v = np.arccos(self.inner(v2)/(self.norm()*v2.norm()))
            if np.isnan(v):
                return 0
            else:
                return v
        except ZeroDivisionError:
            return np.nan
    def __str__(self):
        return "(dim=%d) %s"%(self.dim, super(sparsev, self).__str__())


In [283]:
tmp1 = sparsev(int, dim=5)
tmp2 = sparsev(int, dim=3)
assert_raises(ValueError, tmp1.inner, tmp2)

In [284]:
tmp1 = sparsev(int, dim=10)
tmp2 = sparsev(int, dim=10)
tmp1["Brian"] = 3
tmp1["Wendy"] = 4
tmp2["Argos"] = 9
tmp2["Helios"] = 2
tmp1.inner(tmp2)
assert_almost_equal(tmp1.inner(tmp2), 0)

In [285]:
tmp1 = sparsev(int, dim=10)
tmp2 = sparsev(int, dim=10)
tmp1["Brian"] = 3
tmp1["Wendy"] = 4
tmp2["Argos"] = 9
tmp2["Brian"] = 2
tmp1.inner(tmp2)
assert_almost_equal(tmp1.inner(tmp2), 6)


In [286]:
tmp1 = sparsev(int, dim=5)
tmp2 = sparsev(int, dim=3)
assert_true("5" in tmp1.__str__())

In [287]:
tmp1 = sparsev(int, dim=5)
tmp2 = sparsev(int, dim=3)
assert_true("3" in tmp2.__str__())

## Test on MIMIC2 radiology reports

In [288]:
import pymysql
import pandas as pd
import getpass
from textblob import TextBlob

In [289]:
conn = pymysql.connect(host="mysql",
                       port=3306,user="jovyan",
                       passwd="jovyan",db='mimic2')
cursor = conn.cursor()

### Get some documents. Limit the query to keep corpus small for debugging

In [290]:
rad_data = \
pd.read_sql("""SELECT DISTINCT noteevents.subject_id, 
                      noteevents.hadm_id,
                      noteevents.text 
               FROM noteevents
               WHERE noteevents.category = 'RADIOLOGY_REPORT' 
               LIMIT 20000""",conn)
rad_data.head(5)

Unnamed: 0,subject_id,hadm_id,text
0,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:53 AM\n ...
1,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:43 AM\n ...
2,56,28766.0,\n\n\n DATE: [**2644-1-17**] 6:37 AM\n ...
3,56,28766.0,\n\n\n DATE: [**2644-1-19**] 12:09 PM\n ...
4,37,18052.0,\n\n\n DATE: [**3264-8-14**] 6:06 AM\n ...


## Create a vocabulary

We are first going to replace all digits in the reports with a "d" and convert all characters to lower case. This reduces our vocabulary size by approximately half. We will then use `TextBlob` and sets to get all the unique words in our document. This is our vocabulary. The vocabulary is represented as a dictionary which we create with the `zip` function.

In [291]:
reports = re.sub("\d", "d", " ".join([r.lower() for r in rad_data["text"]]))

words = set(TextBlob(reports).words)

vocabulary = dict(zip(words,range(len(words))))
print(len(vocabulary))

24727


In [292]:
list(vocabulary.items())[:10]

[('cardiac/respiratory', 0),
 ('holds', 1),
 ('pulm', 2),
 ('transfustion', 3),
 ('repositin', 4),
 ('ivc/ra', 5515),
 ('bronchogram', 4480),
 ('saturat', 12452),
 ('unerupted', 12453),
 ('motion-degraded', 4105)]

## Problem 3 (20 points):

Write a function `doc2vec` that takes two arguments: 1) `txt` (a text to convert to a vector) and 2) `voc` (the vocabulary). It returns a `sparsev` instance that is the representation of `txt` in the `voc` vector space. Because `txt` may contain words that are not in the vocabulary, you will need to do exception handling.

In [293]:
def doc2vec(txt, voc):
    
    blob = TextBlob(txt.lower())
    v = sparsev(int, dim=len(voc))
    for w in blob.words:
        try:
            v[voc[w]] += 1
        except KeyError:
            pass
    return v


In [294]:
v50 = doc2vec(re.sub("\d", "d", rad_data.loc[50,"text"]), vocabulary)
v157 = doc2vec(re.sub("\d", "d", rad_data.loc[157,"text"]), vocabulary)
assert_almost_equal(v50.norm(), 66.59579566308972)

In [295]:
v50 = doc2vec(re.sub("\d", "d", rad_data.loc[50,"text"]), vocabulary)
v157 = doc2vec(re.sub("\d", "d", rad_data.loc[157,"text"]), vocabulary)
assert_almost_equal(v157.norm(), 95.40440241414439)

### Cosine similarity of a document with itself should be 1

In [296]:
v50 = doc2vec(re.sub("\d", "d", rad_data.loc[50,"text"]), vocabulary)
v157 = doc2vec(re.sub("\d", "d", rad_data.loc[157,"text"]), vocabulary)
assert_almost_equal(v50.cosine_sim(v50), 0)



In [258]:
v50 = doc2vec(re.sub("\d", "d", rad_data.loc[50,"text"]), vocabulary)
v157 = doc2vec(re.sub("\d", "d", rad_data.loc[157,"text"]), vocabulary)
assert_almost_equal(v50.cosine_sim(v157), 0.7430023871608521)

### Create a column in `rad_data` that has values equal to the similarity between the reports and `v50`.

In [259]:
rad_data["50sim"] = rad_data.apply(lambda r: v50.cosine_sim(doc2vec(re.sub("\d", "d", r["text"]), vocabulary)), axis=1)



## Problem 4 (10 points):

Create a DataFrame `rad_data2` that only contains rows where rad_data["50sim"] is greater than zero. Sort the `rad_data2` by ascending values of `50sim`.

In [266]:
rad_data2 = None

rad_data2 = rad_data.sort_values(by="50sim")[rad_data["50sim"] > 0]

rad_data2.tail()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,subject_id,hadm_id,text,50sim
2850,1427,12755.0,\n\n\n DATE: [**3422-10-15**] 2:51 PM\n ...,1.427111
9382,5328,28535.0,\n\n\n DATE: [**2695-5-2**] 9:32 PM\n ...,1.427213
12884,7557,,\n\n\n DATE: [**2866-6-7**] 8:13 AM\n ...,1.449436
3629,1884,17763.0,\n\n\n DATE: [**3256-11-8**] 7:37 AM\n ...,1.450346
17501,10315,22270.0,\n\n\n DATE: [**2558-2-12**] 1:09 AM\n ...,1.469616


In [263]:
assert_equal(rad_data2.iloc[0]["subject_id"], 463)

In [267]:
assert_equal(rad_data2.iloc[-1]["subject_id"], 10315)

## What do the most similar and most dissimilar (relative to 50) reports look like?

In [226]:
print(rad_data.iloc[50]["text"])




     DATE: [**3353-1-26**] 5:37 PM
     CT CHEST W/CONTRAST; CT 150CC NONIONIC CONTRAST                 Clip # [**Clip Number (Radiology) 1672**]
     Reason: please eval new RLL inflitrate seen on CXR and r/o empyema  
     Admitting Diagnosis: NON-HODGKINS LYMPHOMA;FEBRILE;NEUTROPENIA
     Field of view: 34 Contrast: OPTIRAY Amt: 100
     ______________________________________________________________________________
     UNDERLYING MEDICAL CONDITION:
        55 year old man with enteropathy associated t cell lymphoma and neutropenic.  
       Tachycardic, tachypneic, resp. alkalosis on abg.
     REASON FOR THIS EXAMINATION:
      please eval new RLL inflitrate seen on CXR and r/o empyema                      
     No contraindications for IV contrast
     ______________________________________________________________________________
                                     FINAL REPORT
     INDICATION:  Enteropathy associated T-cell lymphoma and neutropenia,
     tachycardic and tachy

In [264]:
print(rad_data2.iloc[0]["text"])




     DATE: [**3334-11-8**] 2:43 PM
     CT ABD W&W/O C; CT CHEST W/CONTRAST                             Clip # [**Clip Number (Radiology) 3604**]
     CT PELVIS W/CONTRAST; CT 150CC NONIONIC CONTRAST
     Reason: evaluate disease progression
     Field of view: 32 Contrast: OPTIRAY Amt: 150
     ______________________________________________________________________________
     UNDERLYING MEDICAL CONDITION:
      62 year old F with lung cancer. CT chest and abd prior to treatment
     REASON FOR THIS EXAMINATION:
      evaluate disease progression
     ______________________________________________________________________________
                                     FINAL REPORT
     INDICATION:  Lung CA.
     
     TECHNIQUE:  Non-contrast images of the abdomen were obtained.  Contrast
     enhanced images of the abdomen, as well as delayed images of the chest,
     abdomen and pelvis following administration of 150 cc of Optiray were
     performed.  Nonionic IV contrast was used 

In [265]:

print(rad_data2.iloc[-1]["text"])




     DATE: [**2558-2-12**] 1:09 AM
     IVC GRAM/FILTER                                                 Clip # [**Clip Number (Radiology) 3326**]
     Reason: POST SUCTION EMBO
      Contrast: OPTIRAY Amt: 30
     ********************************* CPT Codes ********************************
     * 37620 INTERUP IVC                    36010 INTRO CATH SVC/IVC            *
     * -51 MULTI-PROCEDURE SAME DAY         75940 PERC PLCMT IVC FILTER         *
     * 75825 IVC GRAM                       C1880 VENA CAVA FILTER              *
     ****************************************************************************
     ______________________________________________________________________________
                                     FINAL REPORT
     please see clip [**Clip Number (Radiology) 3321**].

             DR. [**First Name4 (NamePattern1) **] [**Last Name (NamePattern1) **]
             DR. [**First Name11 (Name Pattern1) **] [**Initial (NamePattern1) **]. [**Last Name (NameP