# Step 1 Create Files with Dummy data

In [1]:
# required imports
import numpy as np
import fnmatch
import os

# Step 2 Traverse Directories 

In [2]:
 # Here we have intialized some variables, you can add more if required.
file_count = 0 # file_count to count number of files
files_dict = {} # files_dic to store count of every file 
unique_word_set = set() # unique_word_set to store all the unique words i

In [3]:
directory = 'files'
files = os.listdir(directory)
count = 0

for file in files:
    if os.path.isfile(os.path.join(directory, file)):
        file_count += 1 
    with open(os.path.join(directory, file), 'r') as file_obj:
        file_contents = file_obj.read()
        words = file_contents.split()
        
        files_dict[file] = words
print("\nTotal Number of files\n", file_count)
print("\nDictionary containing files\n", files_dict)


Total Number of files
 3

Dictionary containing files
 {'f1.txt': ['This', 'is', 'my', 'book'], 'f2.txt': ['This', 'is', 'my', 'pen'], 'f3.txt': ['My', 'book', 'is', 'intresting']}


# Step 3 Extract Unique Vocabulary

In [4]:
directory = 'files'
files = os.listdir(directory)
file_count=0
data_list = []
for file in files:
    if os.path.isfile(os.path.join(directory, file)):
        file_count += 1 
    with open(os.path.join(directory, file), 'r') as file_obj:
        file_contents = file_obj.read()
        words = file_contents.split()
        for word in words:
            word = word.strip().lower()
            data_list.append(word)

# Access the list
print(data_list)
unique_word_set = set(data_list)
print("\nTotal Number of files\n", file_count)
print('Unique words in File\n', unique_word_set)


# Create a dictionary with list items as keys and unique digits as values
unique_word_set = {item: i+1 for i, item in enumerate(unique_word_set)}

# Print the dictionary

print('Dictionary of unique words', unique_word_set)


['this', 'is', 'my', 'book', 'this', 'is', 'my', 'pen', 'my', 'book', 'is', 'intresting']

Total Number of files
 3
Unique words in File
 {'intresting', 'is', 'pen', 'book', 'this', 'my'}
Dictionary of unique words {'intresting': 1, 'is': 2, 'pen': 3, 'book': 4, 'this': 5, 'my': 6}


# Step 4 Create Term Document Matrix 

In [5]:
term_doc_matrix = np.zeros((len(files_dict), len(unique_word_set)))

print("TERM DOC MATRIX Initially:")
print(term_doc_matrix)

print('Dictionary of Unique Words', unique_word_set)
print('Dictionary of Files', files_dict)

TERM DOC MATRIX Initially:
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Dictionary of Unique Words {'intresting': 1, 'is': 2, 'pen': 3, 'book': 4, 'this': 5, 'my': 6}
Dictionary of Files {'f1.txt': ['This', 'is', 'my', 'book'], 'f2.txt': ['This', 'is', 'my', 'pen'], 'f3.txt': ['My', 'book', 'is', 'intresting']}


# Step 5 Fill Term Document Matrix

In [6]:
directory = 'files'
files = os.listdir(directory)
file_count=0
for file in files:
    if os.path.isfile(os.path.join(directory, file)):
        file_count += 1 
    with open(os.path.join(directory, file), 'r') as file_obj:
        file_contents = file_obj.read()
    words = file_contents.split() 
    print(words)
    for word in words:
        word = word.strip().lower()
        if word in unique_word_set:
            word_index = unique_word_set[word]
            term_doc_matrix[file_count-1][word_index-1] = 1 

print('\nDictionary of Unique Words', unique_word_set)
print("TERM DOC MATRIX after Filling:")
print(term_doc_matrix)

['This', 'is', 'my', 'book']
['This', 'is', 'my', 'pen']
['My', 'book', 'is', 'intresting']

Dictionary of Unique Words {'intresting': 1, 'is': 2, 'pen': 3, 'book': 4, 'this': 5, 'my': 6}
TERM DOC MATRIX after Filling:
[[0. 1. 0. 1. 1. 1.]
 [0. 1. 1. 0. 1. 1.]
 [1. 1. 0. 1. 0. 1.]]


# Step 6 Ask for a user Query

In [7]:
print('\nDictionary of Unique Words', unique_word_set)
col_vector = np.zeros( (len(unique_word_set) , 1 ) )
print('\nColumn Vector Initially: \n', col_vector)
query = input("\nWrite something for searching:  ")
for word in query.split():
    if word in unique_word_set:
        word_index = unique_word_set[word]
        col_vector[word_index-1] += 1
print(col_vector)


Dictionary of Unique Words {'intresting': 1, 'is': 2, 'pen': 3, 'book': 4, 'this': 5, 'my': 6}

Column Vector Initially: 
 [[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]

Write something for searching:  thi is
[[0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]]


# Step 7 Display Resultant Vector

In [60]:
max = 0
j = 0
index = 0
for i in col_vector:
    j+=1
    if i>max:
        max = i
        index = j

print(col_vector)
print("Maximum of resultant is ", max)
print("Index for maximum in resultant is ", index)

[[1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]]
Maximum of resultant is  [1.]
Index for maximum in resultant is  1


# Step 8 Display the contents of file 