# Building a Robot Judge
## ETH Zurich, Spring 2019

## Problem Set 1: Extra Credit

Author: Philipp Nikolaus

## 1. Environment

In [12]:
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
tqdm.pandas()
import random

import spacy

from sklearn.feature_extraction.text import CountVectorizer

## 2. get_transformation(D_1, D_2)

In [73]:
def get_transformation(D_1: pd.DataFrame, D_2: pd.DataFrame) -> Tuple[List[str], np.ndarray, np.ndarray]:
    """Build the matrices that map two feature spaces into their union.

    The joint feature space is the sorted union of the column labels of
    ``D_1`` and ``D_2``. For each input a 0/1 matrix ``T`` is returned such
    that ``D.values @ T`` places every original column at the position of
    the same label in the joint space; columns that only exist in the other
    input end up as all-zero columns.

    Args:
        D_1: First document-feature matrix; only its column labels are read.
        D_2: Second document-feature matrix; only its column labels are read.

    Returns:
        A tuple ``(cols_3, T_1, T_2)`` where ``cols_3`` is the sorted list of
        joint column labels, ``T_1`` has shape ``(len(D_1.columns), len(cols_3))``
        and ``T_2`` has shape ``(len(D_2.columns), len(cols_3))``.
    """
    cols_1 = list(D_1.columns)
    cols_2 = list(D_2.columns)

    ## the new feature space: sorted union of both column sets
    cols_3 = sorted(set(cols_1) | set(cols_2))
    m_3 = len(cols_3)

    ## label -> position lookup, built once so that locating each column is
    ## O(1) instead of a linear list.index() scan per column
    position = {col: j for j, col in enumerate(cols_3)}

    def _selection_matrix(cols: List[str]) -> np.ndarray:
        ## row i carries a single 1 in the joint-space column of cols[i]
        T = np.zeros((len(cols), m_3))
        targets = np.array([position[col] for col in cols], dtype=int)
        T[np.arange(len(cols)), targets] = 1
        return T

    return cols_3, _selection_matrix(cols_1), _selection_matrix(cols_2)

In [80]:
def transform(D_1: pd.DataFrame, D_2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Re-express both matrices in their joint (union) feature space.

    Args:
        D_1: First document-feature matrix.
        D_2: Second document-feature matrix.

    Returns:
        A tuple ``(D_1_p, D_2_p)``. Both frames keep the row index of their
        input and share the same columns (the sorted union of the input
        column labels). Features absent from an input are filled with zeros.
        Values are floats because the transformation matrices are float
        arrays.
    """
    cols_3, T_1, T_2 = get_transformation(D_1, D_2)

    ## multiply each matrix with ITS OWN transformation matrix
    D_1_p = pd.DataFrame(
        np.matmul(D_1.values, T_1),
        index=D_1.index,
        columns=cols_3
    )
    D_2_p = pd.DataFrame(
        np.matmul(D_2.values, T_2),
        index=D_2.index,
        columns=cols_3
    )

    return D_1_p, D_2_p

## 3. cosine_sim(D_1, D_2)

In [85]:
def cosine_sim(D_1: pd.DataFrame, D_2: pd.DataFrame) -> np.ndarray:
    """Compute pairwise cosine similarity between the rows of two matrices.

    Both inputs are first brought into a common feature space with
    ``transform`` so that their columns line up.

    Args:
        D_1: First document-feature matrix (one row per document).
        D_2: Second document-feature matrix (one row per document).

    Returns:
        Array of shape ``(len(D_1), len(D_2))`` whose entry ``[i, j]`` is the
        cosine similarity between row ``i`` of ``D_1`` and row ``j`` of
        ``D_2``. Pairs involving an all-zero row have no defined angle; they
        are reported as ``0.0`` rather than producing a 0/0 division.
    """
    D_1_p, D_2_p = transform(D_1, D_2)

    ## plain numpy arrays; the columns are already aligned by transform()
    A = D_1_p.values
    B = D_2_p.values

    ## numerator: all pairwise dot products
    dots = np.matmul(A, B.transpose())

    ## denominator: product of the row norms for every (i, j) pair
    norms_1 = np.linalg.norm(A, axis=1)
    norms_2 = np.linalg.norm(B, axis=1)
    norm_products = norms_1[:, np.newaxis] * norms_2[np.newaxis, :]

    ## divide only where the denominator is non-zero; the remaining entries
    ## keep the 0.0 they were initialised with
    cos_sim = np.divide(
        dots,
        norm_products,
        out=np.zeros_like(dots, dtype=float),
        where=norm_products != 0
    )

    return cos_sim

## 4. Example

Based on the data provided for exercise 1.

### 4.1 Data Prep

In [9]:
# number of cases drawn into each of the two random samples further down
n_samples = 10

# names of all entries in the case directory (the path is relative to the
# current working directory)
cases = os.listdir('./data/cases')

In [10]:
# Read every case file into a dict keyed by case id and build a DataFrame
# from it. The slicing below takes the first four characters of the file name
# as the year and everything from the sixth character on as the case id.
# NOTE(review): this presumably relies on names like "<year>_<id>.txt" — confirm.
data = {}

for case in tqdm(cases):
    case_id = case[5:].replace('.txt','')
    # read inside a context manager so each file handle is closed right away
    # instead of leaking one open handle per case
    with open(os.path.join("./data/cases", case), "r") as case_file:
        doc_raw = case_file.read()
    data[case_id] = {
        "filename": case,
        "year": int(case[:4]),
        "doc_raw": doc_raw
    }
df = pd.DataFrame.from_dict(data, orient="index")

5762it [00:05, 973.48it/s]


In [13]:
# take two independent random samples of the data
# random.sample needs a sequence: passing the dict's keys view directly
# raises TypeError on Python >= 3.11, so materialise the keys once.
# NOTE: the inner per-case dicts are shared with `data` (and between the two
# samples when a case is drawn twice); they are not copied.
case_ids = list(data)
data_1 = {key: data[key] for key in random.sample(case_ids, k=n_samples)}
data_2 = {key: data[key] for key in random.sample(case_ids, k=n_samples)}

In [16]:
## definition of additional steps for nlp-pipeline

def replace_num(doc):
    """Pipeline step: substitute every number-like token with "NUMBER".

    The token texts are joined with single spaces and the resulting string
    is turned back into a Doc via ``nlp.make_doc`` (module-level ``nlp``).
    """
    words = []
    for token in doc:
        if token.like_num:
            words.append("NUMBER")
        else:
            words.append(token.text)
    return nlp.make_doc(' '.join(words))

def filter_words(doc):
    """Pipeline step: drop punctuation and whitespace tokens.

    The texts of the remaining tokens are joined with single spaces and the
    resulting string is turned back into a Doc via ``nlp.make_doc``
    (module-level ``nlp``).
    """
    kept = [token.text for token in doc if not (token.is_punct or token.is_space)]
    return nlp.make_doc(' '.join(kept))

In [17]:
## loading of nlp object and addition of new steps (above) to pipeline

# load the spaCy model named 'en_core_web_sm'
nlp = spacy.load('en_core_web_sm')

# Put the two custom steps at the front of the pipeline: replace_num first,
# filter_words directly after it.
# NOTE(review): passing a bare callable together with name= looks like the
# spaCy v2 add_pipe calling convention — confirm against the installed version.
nlp.add_pipe(replace_num, name='replace_num', first=True)
nlp.add_pipe(filter_words, name='filter_words', after='replace_num')

# show the resulting order of pipeline components
print(nlp.pipe_names)

['replace_num', 'filter_words', 'tagger', 'parser', 'ner']


In [19]:
## processing of raw documents through nlp pipeline

# Run every sampled document through the pipeline, first all of data_1, then
# all of data_2. Each case dict gets the parsed Doc ("doc_spacy") and a
# space-joined string of its lemmas ("doc_clean").
for sample in (data_1, data_2):
    for case_id, case in sample.items():
        doc = nlp(case["doc_raw"])
        case["doc_spacy"] = doc
        case["doc_clean"] = " ".join(token.lemma_ for token in doc)

### 4.2 Count Vectorizer

In [20]:
class DenseCountVectorizer(CountVectorizer):
    """CountVectorizer that returns dense, labelled DataFrames.

    ``transform`` and ``fit_transform`` behave like the parent methods but
    convert the sparse count matrix into a ``pandas.DataFrame`` whose columns
    are the vocabulary terms.
    """

    def _dense_feature_names(self):
        # get_feature_names() was removed in scikit-learn 1.2 in favour of
        # get_feature_names_out(); support both so old and new versions work.
        if hasattr(self, "get_feature_names_out"):
            return list(self.get_feature_names_out())
        return self.get_feature_names()

    def _as_dense_frame(self, X):
        # densify the sparse matrix and label its columns with the vocabulary
        return pd.DataFrame(X.toarray(), columns=self._dense_feature_names())

    def transform(self, docs, copy=True):
        """Vectorize ``docs`` with the fitted vocabulary.

        ``copy`` is accepted for backward compatibility only and is ignored:
        ``CountVectorizer.transform`` takes just the raw documents, so
        forwarding ``copy`` would raise a TypeError.
        """
        return self._as_dense_frame(super().transform(docs))

    def fit_transform(self, docs, y=None):
        """Learn the vocabulary from ``docs`` and return their dense counts."""
        return self._as_dense_frame(super().fit_transform(docs, y=y))

In [54]:
# Fit a separate vectorizer on each sample, so D_1 and D_2 each get their own
# vocabulary (their column sets differ).
cvec = DenseCountVectorizer()
D_1 = cvec.fit_transform([case["doc_clean"] for case in data_1.values()])

# `cvec` is rebound here; the vectorizer fitted on data_1 is not kept
cvec = DenseCountVectorizer()
D_2 = cvec.fit_transform([case["doc_clean"] for case in data_2.values()])

D_1.head(10)

Unnamed: 0,0348,10,10th,111,1182,11th,12,1427,15,158,...,writing,written,wrong,wrongfully,wyandotte,year,yet,york,young,zayre
0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,5,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,1,0,0,1,0
4,0,0,2,0,1,22,0,0,0,0,...,0,0,0,1,3,0,3,2,0,0
5,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,3,2,0,0
6,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,0,0
8,0,0,0,1,0,11,4,0,0,0,...,0,0,1,0,0,14,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,9,0,0,0,0


### 4.3 Test

#### transform()

In [89]:
# bring both count matrices into their joint feature space
D_1_p, D_2_p = transform(D_1, D_2)

In [90]:
# Compare shapes before and after the transformation: the row counts stay the
# same, while both transformed frames have the same number of columns.
print("Shape D_1", D_1.shape)
print("Shape D_1_p", D_1_p.shape)
print("")
print("Shape D_2", D_2.shape)
print("Shape D_2_p", D_2_p.shape)

Shape D_1 (10, 3193)
Shape D_1_p (10, 4387)

Shape D_2 (10, 2735)
Shape D_2_p (10, 4387)


In [91]:
# original D_2 (own vocabulary only), for comparison with D_2_p
D_2.head()

Unnamed: 0,10,10th,11th,120,124a,130,1343,1382,155,1623,...,yale,year,yet,yield,yoder,york,young,yuginovich,zarate,zorach
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,1,0,0
3,0,0,0,0,4,0,0,0,0,0,...,1,1,2,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,0


In [92]:
# D_2 in the joint feature space: same rows, columns from both vocabularies
D_2_p.head()

Unnamed: 0,0348,10,10th,111,1182,11th,12,120,124a,130,...,year,yet,yield,yoder,york,young,yuginovich,zarate,zayre,zorach
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### cosine_sim()

In [87]:
# sanity check: similarity of D_1 with itself (the diagonal should be 1)
cosine_sim(D_1, D_1)

array([[1.        , 0.80615005, 0.88747476, 0.87904256, 0.91196479,
        0.88073571, 0.8812948 , 0.85963222, 0.90178938, 0.90736115],
       [0.80615005, 1.        , 0.80668725, 0.72985478, 0.79937552,
        0.88094688, 0.84298064, 0.85644936, 0.8124702 , 0.8408613 ],
       [0.88747476, 0.80668725, 1.        , 0.87958999, 0.90720917,
        0.88526296, 0.88628643, 0.86783502, 0.88373648, 0.87935021],
       [0.87904256, 0.72985478, 0.87958999, 1.        , 0.90239427,
        0.83029083, 0.82851727, 0.84053359, 0.87304804, 0.85825659],
       [0.91196479, 0.79937552, 0.90720917, 0.90239427, 1.        ,
        0.88818172, 0.89917503, 0.87895169, 0.91414904, 0.91524931],
       [0.88073571, 0.88094688, 0.88526296, 0.83029083, 0.88818172,
        1.        , 0.89897262, 0.90939268, 0.88764448, 0.90718348],
       [0.8812948 , 0.84298064, 0.88628643, 0.82851727, 0.89917503,
        0.89897262, 1.        , 0.87110924, 0.87434   , 0.88429987],
       [0.85963222, 0.85644936, 0.8678350

In [93]:
# cross-similarity: entry [i, j] compares row i of D_1 with row j of D_2
cosine_sim(D_1, D_2)

array([[0.80705015, 0.90927475, 0.86685766, 0.89963349, 0.8141226 ,
        0.90334921, 0.86383626, 0.91344674, 0.90719254, 0.30516084],
       [0.8382953 , 0.79082616, 0.77364311, 0.85887172, 0.81906004,
        0.82151366, 0.79270251, 0.79553183, 0.83638621, 0.18868023],
       [0.83335604, 0.9006871 , 0.89995914, 0.88977115, 0.82320857,
        0.89906938, 0.88000928, 0.91379224, 0.91735703, 0.32918104],
       [0.78428507, 0.89691061, 0.90904971, 0.83153204, 0.7650858 ,
        0.87293502, 0.87352753, 0.93056351, 0.8889603 , 0.36718728],
       [0.81932258, 0.9230557 , 0.87568419, 0.90335873, 0.82712448,
        0.92604881, 0.89557946, 0.94013639, 0.92479199, 0.32217615],
       [0.87956105, 0.88147108, 0.86698421, 0.92957596, 0.86095904,
        0.8978164 , 0.85870502, 0.8862511 , 0.91368787, 0.22968611],
       [0.82657964, 0.878156  , 0.84505397, 0.90930874, 0.82963134,
        0.91134893, 0.85632544, 0.89798014, 0.90970425, 0.2911166 ],
       [0.89704622, 0.85398916, 0.8741718