# Assignment TF-IDF
Compute Term Frequency (TF) and Inverse Document Frequency (IDF) using the Wikipedia dataset. Find the value of TF × IDF for the term ('labor', 12), i.e. the word "labor" in the article with id 12.

### Parse data and create RDD for terms - (word, article_id)

In [1]:
from pyspark import SparkContext
import re

# Reuse the running context when this cell is re-executed: constructing a
# second SparkContext in the same process raises
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

# Text type on both Python 2 (unicode) and Python 3 (str).
_TEXT_TYPE = type(u"")


def parse_article(line):
    """Turn one tab-separated "article_id<TAB>text" line into word pairs.

    Args:
        line: A raw line of the articles file.

    Returns:
        A list of (lower-cased word, int article_id) tuples, one per word
        occurrence and in order of appearance. The list is empty when the
        line has no tab separator, when the id is not an integer, or when
        the text contains no words.
    """
    try:
        article_id, text = _TEXT_TYPE(line.rstrip()).split('\t', 1)
        article_id = int(article_id)
    except ValueError:
        # Malformed line: missing tab, non-numeric id, or (on Python 2)
        # bytes that cannot be decoded. Skip it instead of failing the job.
        return []
    # Trim non-word characters from both ends, then split on whitespace
    # together with any punctuation attached to it.
    text = re.sub(r"^\W+|\W+$", "", text, flags=re.UNICODE)
    words = re.split(r"\W*\s+\W*", text, flags=re.UNICODE)
    # Splitting an empty string yields [''], so drop empty tokens; they are
    # not words and would otherwise inflate the article's word count.
    return [(word.lower(), article_id) for word in words if word]

# Read the article dump (second argument 16 is the partition hint passed to
# textFile) and flatten every line into its (word, article_id) pairs.
raw_data = (
    sc.textFile("/data/wiki/en_articles_part/articles-part", 16)
    .flatMap(parse_article)
)

In [2]:
raw_data.take(3)

[(u'anarchism', 12), (u'anarchism', 12), (u'is', 12)]

### Load StopWords to filter RDD

In [3]:
! head /datasets/stop_words_en.txt

a
about
above
across
after
afterwards
again
against
all
almost


In [4]:
! ls -lh /datasets/

total 4.0K
-rw-rw-r-- 1 jovyan root 1.9K Sep 11  2017 stop_words_en.txt


In [5]:
## load stopwords dictionary - file is small enough to collect on the driver
stopWords_data = sc.textFile('/datasets/stop_words_en.txt').collect()
# Broadcast a frozenset rather than the raw list: the stop-word filter runs a
# membership test for every token, which is O(1) on a set but a linear scan
# on a list.
stopWords_broadcast = sc.broadcast(frozenset(stopWords_data))

In [7]:
# Drop every (word, article_id) pair whose word is a stop word.
# The filtered RDD is the input of several independent jobs in this notebook
# (term counts, per-article totals, document frequencies), so cache it to
# avoid re-reading and re-parsing the source file for each of them.
terms = raw_data.filter(lambda term: term[0] not in stopWords_broadcast.value).cache()

In [8]:
terms.take(3)

[(u'anarchism', 12), (u'anarchism', 12), (u'defined', 12)]

### RDD: Term Counts  - ((word, article_id), count)

In [9]:
# Count how often each (word, article_id) pair occurs: emit a 1 for every
# occurrence and add up the ones that share a key.
terms_counts = (
    terms
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda total, one: total + one)
)

In [10]:
terms_counts.take(3)

[((u'lifelong', 9002), 1),
 ((u'fulfill', 2085), 1),
 ((u'appalachiosaurus', 1367), 1)]

In [11]:
terms_counts.lookup((u'anarchism', 12))

[149]

### RDD: total number of words in each article (after stop-word removal) - (article_id, no_words)

In [12]:
# Count the tokens each article has left after stop-word filtering: emit
# (article_id, 1) per token and sum per article.
article_words_counts = (
    terms
    .map(lambda pair: (pair[1], 1))
    .reduceByKey(lambda total, one: total + one)
)

In [13]:
article_words_counts.take(3)

[(1536, 224), (2560, 1422), (5808, 3020)]

In [14]:
article_words_counts.lookup(12)

[6096]

### Dt - number of documents in the dataset that contain a particular term - (word, no_documents)

In [19]:
# Collapse repeated (word, article_id) pairs so that an article counts once
# per word, then tally the remaining pairs per word. countByKey returns the
# tallies to the driver as a local dictionary.
Dt = (
    terms
    .distinct()
    .countByKey()
)

In [23]:
Dt[u'anarchism']

19

### RDD: TF-IDF - ((word, article_id), TF * IDF)

In [69]:
import math

In [70]:
# Be careful: nested RDD operations are not allowed in Spark, so an RDD
# cannot be looked up from inside another RDD's map function.
# Solution 1: broadcast the lookup tables to the executors if they fit in memory.
# Solution 2: join the RDDs into one.


# Solution 1

# article_id -> number of tokens in that article.
article_words_counts_broadcast = sc.broadcast(article_words_counts.collectAsMap())
# word -> number of articles containing it. Broadcast it as well, instead of
# capturing the whole driver-side dictionary in the closure of every task.
Dt_broadcast = sc.broadcast(dict(Dt))


def compute(term_count):
    """Compute the TF * IDF score of one term occurrence record.

    Args:
        term_count: A ((word, article_id), count) tuple, where count is the
            number of times the word occurs in the article.

    Returns:
        A ((word, article_id), TF * IDF) tuple.

    Raises:
        KeyError: If the article id or the word is missing from the
            broadcast lookup tables.
    """
    (word, article_id), count = term_count
    # tf(word, article_id) = N_word / N, the word's share of the article's tokens.
    TF = float(count) / article_words_counts_broadcast.value[article_id]
    # idf(word) = 1 / log(1 + Dt), the definition used by this assignment.
    IDF = 1.0 / math.log(1 + Dt_broadcast.value[word])
    return ((word, article_id), TF * IDF)

In [71]:
# Score every ((word, article_id), count) record with compute, producing
# ((word, article_id), TF * IDF) records.
terms_tf_idf = terms_counts.map(compute)

In [72]:
terms_tf_idf.lookup((u'labor', 12))

[0.00035046896210986204]