# Prerequisites

In [1]:
import re
from typing import List, Dict, Optional

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import plotly.express as px

# Dataset

In [2]:
religion = fetch_20newsgroups(
    subset='train',
    categories=['alt.atheism', 'talk.religion.misc'],
    remove=('headers', 'footers', 'quotes'),
    random_state=32
)

graphics = fetch_20newsgroups(
    subset='train',
    categories=['comp.graphics', 'comp.windows.x'],
    remove=('headers', 'footers', 'quotes'),
    random_state=32
)

raw_data = religion.data[:20] + graphics.data[:20]

In [None]:
for i in raw_data:
    print(i)

# Preprocessing

## Tokenization

In [3]:
def get_token(text: str) -> List[str]:
    
    first_pattern = r'[A-Za-z]{2,}'
    second_pattern = r'W+^[\s+]'
    
    return re.findall(
        first_pattern,
        re.sub(second_pattern, '', text)
    )

def get_vocab(corpus: List[str]) -> Dict[int, str]:
    
    vocab_list = list()
    for doc in corpus:
        vocab_list += get_token(doc)
        
    return {i: vocab for i, vocab in enumerate(vocab_list)}

In [4]:
tokens = get_token(raw_data[13])
tokens

['Are',
 'we',
 'talking',
 'about',
 'me',
 'or',
 'the',
 'majority',
 'of',
 'the',
 'people',
 'that',
 'support',
 'it',
 'Anyway',
 'think',
 'that',
 'revenge',
 'or',
 'fairness',
 'is',
 'why',
 'most',
 'people',
 'are',
 'in',
 'favor',
 'of',
 'the',
 'punishment',
 'If',
 'murderer',
 'is',
 'going',
 'to',
 'be',
 'punished',
 'people',
 'that',
 'think',
 'that',
 'he',
 'should',
 'get',
 'what',
 'he',
 'deserves',
 'Most',
 'people',
 'wouldn',
 'think',
 'it',
 'would',
 'be',
 'fair',
 'for',
 'the',
 'murderer',
 'to',
 'live',
 'while',
 'his',
 'victim',
 'died',
 'Perhaps',
 'you',
 'think',
 'that',
 'it',
 'is',
 'petty',
 'and',
 'pathetic',
 'but',
 'your',
 'views',
 'are',
 'in',
 'the',
 'minority',
 'Where',
 'are',
 'we',
 'required',
 'to',
 'have',
 'compassion',
 'forgiveness',
 'and',
 'sympathy',
 'If',
 'someone',
 'wrongs',
 'me',
 'will',
 'take',
 'great',
 'lengths',
 'to',
 'make',
 'sure',
 'that',
 'his',
 'advantage',
 'is',
 'removed',
 '

In [5]:
vocabulary = get_vocab(raw_data)
vocabulary

{0: 'Did',
 1: 'you',
 2: 'check',
 3: 'with',
 4: 'the',
 5: 'Afghans',
 6: 'before',
 7: 'posting',
 8: 'this',
 9: 'They',
 10: 'might',
 11: 'disagree',
 12: 'EVER',
 13: 'HEAR',
 14: 'OF',
 15: 'BAPTISM',
 16: 'AT',
 17: 'BIRTH',
 18: 'If',
 19: 'that',
 20: 'isn',
 21: 'preying',
 22: 'on',
 23: 'the',
 24: 'young',
 25: 'don',
 26: 'know',
 27: 'what',
 28: 'is',
 29: 'RB',
 30: 'RB',
 31: 'No',
 32: 'that',
 33: 'praying',
 34: 'on',
 35: 'the',
 36: 'young',
 37: 'Preying',
 38: 'on',
 39: 'the',
 40: 'young',
 41: 'comes',
 42: 'RB',
 43: 'later',
 44: 'when',
 45: 'the',
 46: 'bright',
 47: 'eyed',
 48: 'little',
 49: 'altar',
 50: 'boy',
 51: 'finds',
 52: 'out',
 53: 'what',
 54: 'the',
 55: 'RB',
 56: 'priest',
 57: 'really',
 58: 'wears',
 59: 'under',
 60: 'that',
 61: 'chasible',
 62: 'Does',
 63: 'this',
 64: 'statement',
 65: 'further',
 66: 'the',
 67: 'atheist',
 68: 'cause',
 69: 'in',
 70: 'some',
 71: 'way',
 72: 'surely',
 73: 'it',
 74: 'not',
 75: 'intended',

In [None]:
len(vocabulary)

## Metric

### Binary

In [6]:
def vectorize_binary(vocabulary: dict, document: str) -> List[int]:
    
    doc_list = list()
    for index, token in vocabulary.items():
        if token in get_token(document):
            doc_list.append(1)
        else:
            doc_list.append(0)
            
    return doc_list

In [7]:
def get_docTermList(text: List[str], vocabulary: dict) -> list:
    return [vectorize_binary(vocabulary, doc) for doc in text]
    

def get_docTermMatrix(doc_termList: List[str], vocabulary: dict) -> pd.DataFrame:
    return pd.DataFrame(doc_termList, columns=list(vocabulary.values()))

In [None]:
vectorize_binary(vocabulary, raw_data[0])

In [8]:
doc_termList = get_docTermList(raw_data, vocabulary)

In [9]:
doc_termMatrix = get_docTermMatrix(doc_termList, vocabulary)

## Querier

In [41]:
class Querier:
    
    def __init__(
        self, 
        text: List[str], 
        vocabulary: Dict[int, str], 
        doc_termMatrix: pd.DataFrame
    ):
        
        self._dtm = doc_termMatrix

        self.text = text
        self.vocabulary = vocabulary
        self.top: Optional = None
        
    def vectorize(self, query: str) -> List:
        return vectorize_binary(self.vocabulary, query)
    
    def get_distance(self, x: int, y: int) -> float:
        
        distance = 0
        
        if sum(x) == 0:
            return float('NaN')
        
        else:
            for i in range(len(x)):
                distance += (x[i] - y[i]) ** 2
                
            return distance ** (1 / 2)
        
    def ask(self, query: str) -> pd.Series:
        self.query = query
        self.distances = (
            self._dtm
            .apply(
                lambda x: self.get_distance(x, self.vectorize(query)), 
                axis=1
            )
            .sort_values(ascending=True)
        )
        return self.distances
        
    def view(self, index: Optional = None) -> str:
        return self.text[index]
        
    
    def compare_top(self, n: int = 10):
        top_n = self.distances.head(n)
        return [
            (self.view(i), top_n.loc[i]) 
            for i in top_n.index.to_list()
        ]

In [42]:
query = raw_data[32]

In [43]:
Query = Querier(raw_data, vocabulary, doc_termMatrix)

In [44]:
sample1 = Query.ask(query)

In [45]:
px.bar(sample1)

In [46]:
sample_top10 = Query.compare_top(10)

In [60]:
df = pd.DataFrame(sample_top10, columns=['postings', 'distance'])

In [55]:
df

Unnamed: 0,postings,distance
0,"Is the "".3ds"" file format for Autodesk's 3D Animation Studio available?\n\nThanks,\nGary",0.0
1,\nDid you check with the Afghans before posting this? They\nmight disagree.,20.904545
2,"Well, that would depend on how much we wanted the US and how much\nwe wanted the $1, wouldn't it?\n-Ekr\n",32.372828
3,"\n\nJust because the wording is elsewhere does not mean they didn't spend\nmuch time on the wording.\n\n\nPeople can be described as cruel in this way, but punishments cannot.",37.549967
4,\nThe problem was that SunPost411Ld was not defined.,38.587563
5,\n\n\tThe newspaper itself is almost certainly copyrighted in its\nentirety. Newspapers generally employ legal staffs which make sure\nthey get permission to use a copyrighted image or text. Did...,39.331921
6,"# Are there any places in the Bible where the commandment ""Thou \n# shalt not kill"" is specifically applied? That is, where someone \n# refrained from killing because he remembered th...",40.459857
7,: There is a new version of the RTrace ray-tracing package (8.2.0) at\n: asterix.inescn.pt [192.35.246.17] in directory pub/RTrace.\n: Check the README file.\n\ncant seem to reach the site from ov...,41.036569
8,"\n: >EVER HEAR OF\n: >BAPTISM AT BIRTH? If that isn't preying on the young, I don't know what\n: >is...\n: >\n: RB> \n: RB> No, that's praying on the young. Preying on the young comes\n: RB>...",41.677332
9,\nI tend to use XIconifyWindow to achieve this effect... Have you tried that?\n\n- Brad,42.332021


In [54]:
pd.options.display.max_colwidth = 200