In [1]:
import numpy as np
import sklearn
import matplotlib as plt
import pandas as pd

In [2]:
import urllib
import zipfile
import os
import glob
from collections import Counter

# **Extraction and Loading**

In [3]:
url = "http://archives.textfiles.com/stories.zip"
extract_dir = "text_files"

zip_path, _ = urllib.request.urlretrieve(url)
with zipfile.ZipFile(zip_path, "r") as f:
    f.extractall(extract_dir)

In [4]:
path = os.path.join(os.getcwd(),extract_dir)
extracted_files_path = os.path.join(path,'stories')
os.listdir(extracted_files_path)[0:10]

['100west.txt',
 '13chil.txt',
 '14.lws',
 '16.lws',
 '17.lws',
 '18.lws',
 '19.lws',
 '20.lws',
 '3gables.txt',
 '3lpigs.txt']

In [5]:
text_files = glob.glob(extracted_files_path + '/*.txt')
text_files[:10]


['C:\\Users\\Nishankur\\text_files\\stories\\100west.txt',
 'C:\\Users\\Nishankur\\text_files\\stories\\13chil.txt',
 'C:\\Users\\Nishankur\\text_files\\stories\\3gables.txt',
 'C:\\Users\\Nishankur\\text_files\\stories\\3lpigs.txt',
 'C:\\Users\\Nishankur\\text_files\\stories\\3student.txt',
 'C:\\Users\\Nishankur\\text_files\\stories\\3wishes.txt',
 'C:\\Users\\Nishankur\\text_files\\stories\\4moons.txt',
 'C:\\Users\\Nishankur\\text_files\\stories\\5orange.txt',
 'C:\\Users\\Nishankur\\text_files\\stories\\6ablemen.txt',
 'C:\\Users\\Nishankur\\text_files\\stories\\6napolen.txt']

In [6]:
def read_file(path):
    file_ = open(r'{}'.format(path),"r",encoding='utf8', errors='ignore')
    text = file_.read()
    file_.close()
    return text


In [7]:
file_book = read_file("C:\\Users\\Nishankur\\text_files\\stories\\bookem.1")
file_book

'                          B O O K   \'E M\n\n                      Volume one     Number 1\n                                   \n    Author: Caroline Kent   e-mail: caro@freenet.fsu.edu    \n\n\n   Copyright (c) 1995 by Caroline Kent.  All Rights Reserved.\n\n\n     "Hi everyone!  Welcome to the premiere issue of "Book \'Em,"\nan informal e-zine that is written especially for bookstore\nlovers.  I have been a bookseller for almost four years and I\'d\nlike to share some of my thoughts and experiences with you."\n     (I glance at my watch) \n     "If I don\'t hurry, I\'m going to be late for work.  Why don\'t\nyou come along with me to "Book \'Em" and we\'ll chat some more?"  \n     (I drive to the store with a caravan of cars following me. \nUpon arriving, I lead everyone through the front door. A large\nstack of boxes is being unloaded from a truck) \n     "A lot of you probably think that a bookstore is a quiet,\ndull place where the most exciting event of the year is the\ndelivery

In [8]:
import bs4
import sys
import re
sys.setrecursionlimit(10000)
file_html = read_file("C:\\Users\\Nishankur\\text_files\\stories\\index.html")
soup = bs4.BeautifulSoup(file_html,'lxml')

In [9]:
rows = soup.find_all('tr')

In [10]:
rows[:2]

[<tr valign="TOP"><td valign="TOP"><b><a href="FARNON">FARNON</a></b><tab to="T"><td width="20"></td><td><b>The Stories of Tristan Farnon</b></td></tab></td></tr>,
 <tr valign="TOP"><td valign="TOP"><b><a href="SRE">SRE</a></b><tab to="T"><td width="20"></td><td><b>The Solar Realms Elite, by Josh Renaud</b></td></tab></td></tr>]

In [11]:
contents = os.walk(path)
contents = list(contents)

In [12]:
data = []
for i in contents[1:]:
    file = open(i[0]+"/index.html", 'r')
    text = file.read().strip()
    file.close()
    file_name = re.findall('><A HREF="(.*)">', text)
    file_title = re.findall('<BR><TD> (.*)\n', text)
    if file_name != []:
        if file_name[0]== 'FARNON' :
            file_name = file_name[2:len(file_name)]

    for j in range(len(file_title)):
          data.append((str(i[0]) + '/' + str(file_name[j]), file_title[j]))
      

In [13]:
data[:5]

[('C:\\Users\\Nishankur\\text_files\\stories/100west.txt',
  'Going 100 West by 53 North by Jim Prentice (1990)'),
 ('C:\\Users\\Nishankur\\text_files\\stories/13chil.txt',
  'The Story of the Sly Fox'),
 ('C:\\Users\\Nishankur\\text_files\\stories/14.lws',
  'A Smart Bomb with a Language Parser'),
 ('C:\\Users\\Nishankur\\text_files\\stories/16.lws',
  'Two Guys in a Garage, by M. Pshota'),
 ('C:\\Users\\Nishankur\\text_files\\stories/17.lws',
  'The Early Days of a High-Tech Start-up are Magic (November 18, 1991) by M. Peshota')]

# **Preprocessing**


In [14]:
import nltk
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nishankur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nishankur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
!pip install num2words
from num2words import num2words 



In [16]:
story_doc = [read_file(name) for name,title in data]

In [17]:
story_title = [title for name,title in data]

In [18]:
print(len(story_doc)==len(data))
print(len(story_doc)==len(story_title))

True
True


In [19]:
#Functions for preprocessing
punctuation = string.punctuation
stemmer = PorterStemmer()

def lower_casing(data):
      return np.char.lower(data)

def remove_punct(data):
    for i in punctuation:
        data = np.char.replace(data, i , " ")  
    return data

def remove_stopwords(data):
    valid_words = ''
    processed_story = []
    for story in data:
        words = story.split()
        for word in words:
            if word not in stopwords.words('english'):
                valid_words = valid_words +' '+ word
        processed_story.append(valid_words)
        valid_words = ''
    return processed_story

def remove_singles(data):
    valid_words = ''
    processed_story = []
    for story in data:
        words = story.split()
        for word in words:
            if len(word)>1:
                valid_words = valid_words +' '+ word
        processed_story.append(valid_words)
        valid_words = ''
    return processed_story

def numwords(data):
    valid_words = ''
    processed_story = []
    for story in data:
        words = story.split()
        for word in words:
            try:
                numword = num2words(word)
                numword = np.char.replace(numword, "-", " ")
                valid_words = valid_words +' '+ numword
            except:
                valid_words = valid_words +' '+ word
        processed_story.append(valid_words)
        valid_words = ''
    return processed_story

def stemwords(data):
    valid_words = ''
    processed_story = []
    for story in data:
        words = story.split()
        for word in words:
            word = stemmer.stem(word)
            valid_words = valid_words +' '+ word
        processed_story.append(valid_words)
        valid_words = ''
    return processed_story
  
def preprocessor(data):
    data = lower_casing(data)
    data = remove_punct(data)
    data = numwords(data)
    data = remove_stopwords(data)
    data = remove_singles(data)
    data = stemwords(data)
    data = remove_singles(data)
    data = numwords(data)
    data = remove_stopwords(data)
  
    return data

In [20]:
processed_title = preprocessor(story_title)
processed_title_tokens = [word_tokenize(title) for title in processed_title]

In [21]:
processed_story = preprocessor(story_doc)
processed_story_tokens = [word_tokenize(story) for story in processed_story]

In [22]:
len(processed_story) == len(processed_title) 

True

In [23]:
N = len(processed_story)
len(processed_story_tokens) == len(processed_title_tokens) 

True

In [24]:
processed_story_tokens[0][:10]

['sharewar',
 'trial',
 'project',
 'freewar',
 'need',
 'support',
 'continu',
 '100',
 'west',
 '53']

In [25]:
DF = {}
for i in range(N):
    for w in processed_story_tokens[i]:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
    for w in processed_title_tokens[i]:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

In [26]:
c = 0
for key ,value in DF.items():
    DF[key] = len(DF[key])

for key ,value in DF.items():
    c += 1
    print(f"{key} : {value}")
    if c==10:
        break

sharewar : 5
trial : 35
project : 63
freewar : 1
need : 243
support : 87
continu : 193
100 : 38
west : 65
53 : 12


In [27]:
total_unique_words = list(DF.keys())
len(total_unique_words)

33221

In [28]:
#tf-idf for body
tf_idf_body = {}
for i in range(N):
    tokens = processed_story_tokens[i]
    title = processed_title_tokens[i]
    counter = Counter(title + tokens)
    n = len(tokens+title)
    for token in np.unique(tokens):
        tf = counter[token]/n
        try:
            df = DF[token]/N
        except:
            df = 0
        idf = np.log(N/(df+1))
        tf_idf_body[i, token] = tf*idf

In [29]:
#tf-idf for title
tf_idf_title = {}
for i in range(N):
    tokens = processed_title_tokens[i]
    body = processed_story_tokens[i]
    counter = Counter(body + tokens)
    n = len(tokens+body)
    for token in np.unique(tokens):
        tf = counter[token]/n
        try:
            df = DF[token]/N
        except:
            df = 0
        idf = np.log(N/(df+1))
        tf_idf_title[i, token] = tf*idf

In [30]:
len(tf_idf_body)

341714

In [31]:
alpha = 0.3
for key ,value in tf_idf_body.items():
    tf_idf_body[key] = value*alpha


In [32]:
for key ,value in tf_idf_title.items():
    tf_idf_body[key] = value

In [33]:
len(tf_idf_body)

342003

In [34]:
tf_idf = tf_idf_body

#**Matching Scores**

In [35]:
def matches(text):
    query_score = {}
    text = [text]
    processed_text = preprocessor(text)
    processed_tokens = word_tokenize(processed_text[0])
    for key in tf_idf:
        if key[1] in processed_tokens:
            try:
                query_score[key[0]] += tf_idf[key]
            except:
                query_score[key[0]] = tf_idf[key]
    query_score = sorted(query_score.items(),key = lambda x:x[1],reverse=True)
    return query_score


In [36]:
query1 = "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying"

In [38]:
all_matches1 = matches(query1)
len(all_matches1)

403

In [39]:
def file_name(all_matches):
    print("Top ten matches:")
    for i in range(10):
        value = all_matches[i][0]
        print(data[value][0])

file_name(all_matches1)

Top ten matches:
C:\Users\Nishankur\text_files\stories/fea3
C:\Users\Nishankur\text_files\stories/ghost
C:\Users\Nishankur\text_files\stories/quarter.c6
C:\Users\Nishankur\text_files\stories/quarter.c4
C:\Users\Nishankur\text_files\stories/foxnstrk.txt
C:\Users\Nishankur\text_files\stories/quarter.c15
C:\Users\Nishankur\text_files\stories/narciss.txt
C:\Users\Nishankur\text_files\stories/foxngrap.txt
C:\Users\Nishankur\text_files\stories/vday.hum
C:\Users\Nishankur\text_files\stories/quarter.c9


In [40]:
read_file("C:\\Users\\Nishankur\\text_files\\stories\\quarter.c9")

'THE ROSE\nby Suzy Edmonson\n\n     "How beautiful!  A rose!"\n     She quickly filled a vase with water and set the vase on the living room\ntable.  She sat for several minutes gazing at the beauty of the delicate\nblossom.  Then the phone rang and the rose was forgotten.  The next day, she\nstopped to admire the perfection of the flower but then ran out the door heading\nfor school.  When the girl arrived home that night, she filled the vase with\nfresh water and deeply inhaled the rose\'s flowery scent.  Then she retreated to\nher room to begin her homework.  She woke up late the next morning and ran out\nthe door with only a quick glance at the rose.  Night came and the girl didn\'t\ncome home.  At one o\'clock the girl tiptoed into the house, ran into her room\nand fell asleep on her bed.  For the rest of the week the girl went about her\ndaily routines forgetting the rose on the table.  It was not so beautiful\nanymore.  The stem sagged, the blossom had lost its vibrant color and

#**Document Vector creation and macthing**

In [41]:
#document vectors
doc_vec = np.zeros((N,len(total_unique_words)))
for (doc,word),value in tf_idf.items():
    try:
        i = total_unique_words.index(word)
        doc_vec[doc][i]= value
    except:
        pass
        

In [42]:
#Query vector
def query_vectorizer(text):
    query_vec = np.zeros(len(total_unique_words))
    text = [text]
    processed_text = preprocessor(text)
    processed_tokens = word_tokenize(processed_text[0])
    counter = Counter(processed_tokens)
    tot_words = len(processed_tokens)
    for word in processed_tokens:
        tf = counter[word]/tot_words
        try:
            df = DF[word]/N
        except:
            df = 0
        idf = np.log(N/(df+1))
        try:
            i = total_unique_words.index(word)
            query_vec[i]= tf*idf
        except:
            pass
    return query_vec

In [45]:
len(query_vectorizer(query1))

33221

In [46]:
#Similarity calculator
def cosine_sim(a,norm_a,doc):
    b = doc
    cos_sim = np.dot(a, b)/(norm_a*np.linalg.norm(b))
    return cos_sim

In [47]:
def top_20(query):
    query_vec = query_vectorizer(query)
    norm_a = np.linalg.norm(query_vec)
    query_score = {}
    n_doc = len(doc_vec)
    for i in range(n_doc):
        query_score[i] = cosine_sim(query_vec,norm_a,doc_vec[i]) 
    query_score = sorted(query_score.items(),key = lambda x:x[1],reverse=True)
    return query_score[:20]
        
        
    
    
    

In [None]:
top_20(query1)

In [None]:
query2 = 'To make matters worse, up raced a large dog, snarling viciously. The fox \ndropped the hen and tried to jump out of the hen run. At the first try, he \nfell back, perhaps weak with fright. He could almost feel the dog\'s fangs sink\ninto his ear, but with a desperate jump, he got over the fence.'

In [112]:
all_matches2 = top_20(query2)

In [113]:
file_name(all_matches2)

Top ten matches:
C:\Users\Nishankur\text_files\stories/foxngrap.txt
C:\Users\Nishankur\text_files\stories/greedog.txt
C:\Users\Nishankur\text_files\stories/korea.s
C:\Users\Nishankur\text_files\stories/aesopa10.txt
C:\Users\Nishankur\text_files\stories/friends.txt
C:\Users\Nishankur\text_files\stories/fran
C:\Users\Nishankur\text_files\stories/running.txt
C:\Users\Nishankur\text_files\stories/foxnstrk.txt
C:\Users\Nishankur\text_files\stories/friend.s
C:\Users\Nishankur\text_files\stories/13chil.txt


In [118]:
query3 = "hen she retreated to\nher room to begin her homework.  She woke up late the next morning and ran out\nthe door with only a quick glance at the rose.  Night came and the girl didn\'t\ncome home."

In [119]:
all_matches3 = top_20(query3)

In [120]:
file_name(all_matches3)

Top ten matches:
C:\Users\Nishankur\text_files\stories/quarter.c9
C:\Users\Nishankur\text_files\stories/girlclub.txt
C:\Users\Nishankur\text_files\stories/lpeargrl.txt
C:\Users\Nishankur\text_files\stories/roger1.txt
C:\Users\Nishankur\text_files\stories/running.txt
C:\Users\Nishankur\text_files\stories/enginer.txt
C:\Users\Nishankur\text_files\stories/bulmrx.txt
C:\Users\Nishankur\text_files\stories/girl
C:\Users\Nishankur\text_files\stories/3student.txt
C:\Users\Nishankur\text_files\stories/wlgirl.txt
