### Recommended runtime environment is using [Google Colaboratory](https://colab.research.google.com/)

In [0]:
!pip install -U nltk #adapted for google colab or linux environments

import urllib #URL handling library
import nltk
nltk.download('all') 
# 'all' is all packages
# 'book' is everything used in the NLTK Book
# 'popular' uses most useful/popular packages


In [0]:
textFileURL = 'http://www.gutenberg.org/cache/epub/5200/pg5200.txt'

# load text from URL, for a swift local environment
data = urllib.request.urlopen(textFileURL) #returns object that can be read like a file
book = data.read()
data.close()

# Convert from byteArray to string
book = book.decode('utf-8')

In [0]:
def cleanText(text):
  # split into words/tokens
  from nltk import word_tokenize
  tokens = word_tokenize(text)

  # set to lower case
  tokens = [word.lower() for word in tokens]

  # stem words
  from nltk.stem.porter import PorterStemmer
  porter = PorterStemmer()
  stemmed = [porter.stem(word) for word in tokens]

  # rm puctuation
  import string
  table = str.maketrans('', '', string.punctuation)
  stripped = [w.translate(table) for w in stemmed]

  # remove non-aplhabetic charachters
  words = [word for word in stripped if word.isalpha()]
  return words

## Creation of our own stop word list using wikipedia as our document databse
Done by calculating the IDF wights for each word. This is dicussed as an approach to find stop words in [this paper](http://terrierteam.dcs.gla.ac.uk/publications/rtlo_DIRpaper.pdf)

Downloaded by the [wikipedia package](https://pypi.org/project/wikipedia/)

Below are the experiments using the entire [Gutenberg dataset](https://web.eecs.umich.edu/~lahiri/gutenberg_dataset.html)(at 1.2 GB of text) as our document base for creating a stop word list. The results from this exoeriment are described below

In [0]:
# install the wikipedia package. This is adapter for google colab or any other linux enviroment
!pip install -U wikipedia
import wikipedia

In [0]:
# article search results for the following terms. Can easily search based on random terms
searchTermList = ['norway','south korea','cherry blossom','strawberry','easter','norse mythology','silla era','shakespeare','italian cooking','google','planck constant','higgs boson']
articleNamesTotal = []
for elem in searchTermList:
  articleNames = wikipedia.search(elem)
  articleNamesTotal = articleNamesTotal+articleNames

# add each article to document dictionary
textsList = []
for elem in (articleNamesTotal):
  try:
    # get document text and tokenize it
    text = wikipedia.page(elem).content
    textsList.append(cleanText(text))
  except wikipedia.exceptions.DisambiguationError as e:
    print(elem + " - raised a dismbiguationError as it may refer to multiple pages")

In [6]:
import math
from itertools import chain

# compute bottom IDF words of given percentage of total. This should constitute our stop word list
def computeBottomIDF(textsList, percentage):
  flattenedList = list(chain.from_iterable(textsList))
  
  # create dictionary of word:idf-weight
  scores = {word: idf(word,textsList) for word in set(flattenedList)}
  
  # sort total & take percentage of elements
  sortedWords = sorted(scores.items(), key=lambda x: x[1], reverse = False)
  percentageOf_sortedWords = sortedWords[:math.floor(len(sortedWords)*percentage)]
  
  printWordAndScore(percentageOf_sortedWords, 'IDF')
  return [word[0] for word in percentageOf_sortedWords]

# print word & score to console in handy format
def printWordAndScore(sortedTokenList, scoreType):
  for word, score in sortedTokenList:
    print("  Word: {},\t {}: {}".format(word, scoreType, round(score, 5)).expandtabs(30))
    
# IR elementary functions
def tf(word, text):
  return text.count(word) / len(text)

def n_containing(word, textsList):
    return sum(1 for text in textsList if word in text)

def idf(word, textsList):
    return math.log(len(textsList) / (n_containing(word, textsList)))

def tfidf(word, text, textsList):
    return tf(word, text) * idf(word, textsList)

privateGeneratedStopWordList = computeBottomIDF(textsList, 0.01)

  Word: the,                   IDF: 0.0
  Word: as,                    IDF: 0.0
  Word: in,                    IDF: 0.0
  Word: and,                   IDF: 0.00873
  Word: of,                    IDF: 0.00873
  Word: refer,                 IDF: 0.00873
  Word: a,                     IDF: 0.00873
  Word: to,                    IDF: 0.02643
  Word: for,                   IDF: 0.02643
  Word: is,                    IDF: 0.0354
  Word: it,                    IDF: 0.0354
  Word: with,                  IDF: 0.04445
  Word: by,                    IDF: 0.04445
  Word: also,                  IDF: 0.04445
  Word: which,                 IDF: 0.0628
  Word: that,                  IDF: 0.0628
  Word: from,                  IDF: 0.0721
  Word: are,                   IDF: 0.08149
  Word: on,                    IDF: 0.09097
  Word: an,                    IDF: 0.10054
  Word: thi,                   IDF: 0.10054
  Word: or,                    IDF: 0.1102
  Word: wa,                    IDF: 0.11996
  Word

In [7]:
# text cleaning up until tokenization
words = cleanText(book)

# fetch general english stop word list
from nltk.corpus import stopwords
stop_words_general = stopwords.words('english')
print('length of general stop word list: {}  -  private stop word list: {}'.format(len(stop_words_general),len(privateGeneratedStopWordList)))

# remove stop words from either list
wordsWithoutGeneralStopWords = [word for word in words if not word in stop_words_general]
wordsWithoutPrivateStopWords = [word for word in words if not word in privateGeneratedStopWordList]

length of general stop word list: 179  -  private stop word list: 235


In [0]:
# compute top x TF words of text  
def printTopTF(text, amount):
  # create dict of tf per unique word in text 
  scores = {word: tf(word,text) for word in set(text)}
  
  # sort total & take top <amount> of elements
  sortedWords = sorted(scores.items(), key=lambda x: x[1], reverse = True)
  amountOf_sortedWords = sortedWords[:amount]
  
  # helper function for handy print format
  printWordAndScore(amountOf_sortedWords, 'TF')

In [9]:
# Calculate tf scores for top 50 words from wordsWithoutGeneralStopWords token sets
printTopTF(wordsWithoutGeneralStopWords, 50)

  Word: hi,                    TF: 0.04174
  Word: wa,                    TF: 0.03142
  Word: gregor,                TF: 0.02262
  Word: would,                 TF: 0.01449
  Word: thi,                   TF: 0.011
  Word: room,                  TF: 0.01009
  Word: could,                 TF: 0.00964
  Word: work,                  TF: 0.00865
  Word: even,                  TF: 0.00789
  Word: father,                TF: 0.00774
  Word: sister,                TF: 0.00766
  Word: door,                  TF: 0.00736
  Word: mother,                TF: 0.00683
  Word: project,               TF: 0.00668
  Word: ani,                   TF: 0.00637
  Word: back,                  TF: 0.0063
  Word: time,                  TF: 0.00562
  Word: way,                   TF: 0.00501
  Word: onli,                  TF: 0.00478
  Word: look,                  TF: 0.00463
  Word: one,                   TF: 0.00463
  Word: nt,                    TF: 0.0044
  Word: gutenbergtm,           TF: 0.00433
  Word: open,  

In [10]:
# Calculate tf scores for top 50 words from wordsWithoutPrivateStopWords token sets
printTopTF(wordsWithoutPrivateStopWords, 50)

  Word: gregor,                TF: 0.02739
  Word: she,                   TF: 0.01838
  Word: him,                   TF: 0.01728
  Word: her,                   TF: 0.01719
  Word: room,                  TF: 0.01222
  Word: you,                   TF: 0.01204
  Word: father,                TF: 0.00937
  Word: sister,                TF: 0.00928
  Word: door,                  TF: 0.00891
  Word: mother,                TF: 0.00827
  Word: project,               TF: 0.00809
  Word: himself,               TF: 0.008
  Word: i,                     TF: 0.0079
  Word: back,                  TF: 0.00763
  Word: look,                  TF: 0.00561
  Word: nt,                    TF: 0.00533
  Word: gutenbergtm,           TF: 0.00524
  Word: open,                  TF: 0.00505
  Word: get,                   TF: 0.00478
  Word: just,                  TF: 0.00469
  Word: said,                  TF: 0.00469
  Word: littl,                 TF: 0.0045
  Word: go,                    TF: 0.0045
  Word: want,   

## Experimenting with using the [Gutenberg](https://web.eecs.umich.edu/~lahiri/gutenberg_dataset.html) dataset to compute our own stop word list
Done by calculating the IDF wights for each word. This is dicussed in [this paper](http://terrierteam.dcs.gla.ac.uk/publications/rtlo_DIRpaper.pdf)

Following the guide [here](https://colab.research.google.com/notebooks/io.ipynb#scrollTo=KHeruhacFpSU) on how to download a file from google drive into the connected host machine

**Result from downloading the gutenberg dataset is that the 1.2gb dataset is too big for downloading without being disconnected from the colab-host machine. Above are the results using a wikipedia dataset**  

In [0]:
#Autheticate your google user and give google cloud SDK accessrights, which will enable copying file from your drive
from google.colab import auth
auth.authenticate_user()

from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

In [0]:
# Download the file we just uploaded.
#
# Replace the assignment below with your file ID
# to download a different file.
#
# A file ID looks like: 1uBtlaggVyWshwcyP6kEI-y_W3P8D26sz
file_id = '17WBziFbt9nhAW5iV-yHPHmCfquBPrjJO'

import io
from googleapiclient.http import MediaIoBaseDownload

request = drive_service.files().get_media(fileId=file_id)
downloaded = io.BytesIO()
downloader = MediaIoBaseDownload(downloaded, request)
done = False
while done is False:
  # _ is a placeholder for a progress object that we ignore.
  # (Our file is small, so we skip reporting progress.)
  _, done = downloader.next_chunk()

downloaded.seek(0)
print('Downloaded file contents are: {}'.format(downloaded.read()))

In [0]:
# It is also a higly time efficient way of creating tf-idf weights, by using the sklearn package
# this initializer with dummy functions allows the input to be a list of already tokenized texts in stead of a long string
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)  
