In [1]:
import math 
import pandas as pd
from util.util import removeTags, cleanText, textStemming, textLemmatization, list_directories


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Taj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
class Search():
    '''
    Keyword search over a directory of files, by file name and by content.

    The query and every document are turned into term-frequency vectors over
    the query words only; a document matches when the cosine similarity
    between its vector and the query vector is strictly greater than a
    threshold. Can be thought of as a topic relevance assessment measure.

    Attributes:
        keywords: whitespace-separated query string.
        path: directory that holds the files to search.
    '''

    # Default (exclusive) lower bound on cosine similarity for a match.
    SIMILARITY_THRESHOLD = 0.7

    def __init__(self, keywords, path):
        self.keywords = keywords
        self.path = path

    def prepare_query(self, keywords):
        '''
        Split ``keywords`` on whitespace and index the resulting words.

        Returns:
            ``[query, keys]`` where ``keys`` is the list of words and
            ``query`` maps each word to its position in ``keys`` (for a
            repeated word, the position of its last occurrence).
        '''
        keys = keywords.split()
        query = {word: position for position, word in enumerate(keys)}
        return [query, keys]

    def words2vector(self, query, keys, text):
        '''
        Count how often each query word occurs in ``text``.

        Args:
            query: word -> vector position mapping from ``prepare_query``.
            keys: list of query words; only its length is used.
            text: iterable of tokens.

        Returns:
            A list of ``len(keys)`` counts.
        '''
        vector = [0] * len(keys)
        for word in text:
            position = query.get(word)
            # Position 0 is a valid slot, so test against None explicitly.
            if position is not None:
                vector[position] += 1
        return vector

    def length(self, vector):
        '''
        Euclidean norm of ``vector``.

        Returns 1 instead of 0 for an all-zero (or empty) vector so the norm
        can safely be used as a divisor.
        '''
        norm = math.sqrt(sum(component * component for component in vector))
        return norm if norm else 1

    def dot_product(self, vector1, vector2):
        '''
        Dot product of two equally long vectors.

        Raises:
            ValueError: if the vectors differ in length.
        '''
        if len(vector1) != len(vector2):
            raise ValueError("Unmatching dimensionality")
        return sum(a * b for a, b in zip(vector1, vector2))

    def _cosine_similarity(self, doc1, doc2):
        '''Cosine similarity of two vectors, using ``length`` as the norm.'''
        product = self.dot_product(doc1, doc2)
        leng = self.length(doc1) * self.length(doc2)
        return product / leng

    def _read_text_files(self, path, transform=None):
        '''
        Read every entry reported by ``list_directories(path)``.

        ``list_directories`` is a project helper; it is presumably returning
        the file names found inside ``path`` -- confirm against util.util.

        Args:
            path: directory to read from.
            transform: optional callable applied to each file's text.

        Returns:
            dict mapping file name -> (optionally transformed) text.
        '''
        dictionary = {}  # key: file name, value: text
        for f in list_directories(path):
            if f in dictionary:
                continue  # keep the first occurrence of a name
            # Read-only mode and a context manager: no write permission is
            # required and the handle is closed deterministically.
            with open(f'{path}/{f}', 'r') as handle:
                text = handle.read()
            dictionary[f] = transform(text) if transform is not None else text
        return dictionary

    def read_txt_files(self, path):
        '''Return a dict of file name -> raw text for the files in ``path``.'''
        return self._read_text_files(path)

    def read_txt_files_stem_lemm(self, path):
        '''
        Like ``read_txt_files`` but each text is passed through
        ``textStemming`` and then ``textLemmatization``.
        '''
        return self._read_text_files(
            path, transform=lambda text: textLemmatization(textStemming(text))
        )

    def read_csv_files(self, path):
        '''
        Load every CSV reported by ``list_directories(path)``.

        For each row, the second column (converted to ``str``) becomes the
        key and the first column becomes the value; a later row with the
        same key overwrites an earlier one.
        '''
        dictionary = {}
        for f in list_directories(path):
            df = pd.read_csv(f"{path}/{f}")
            for _, row in df.iterrows():
                # .iloc makes the positional access explicit; plain row[1]
                # relies on a deprecated fallback of Series.__getitem__.
                dictionary[str(row.iloc[1])] = row.iloc[0]
        return dictionary

    def search_by_filename(self, query, keys, doc1, dictionary,
                           threshold=SIMILARITY_THRESHOLD):
        '''
        Score documents by the words in their names.

        The name is lower-cased, cut at the first ``.``, split on ``_`` and
        the first piece is dropped (``1122_business.txt`` -> ``['business']``).

        Args:
            query, keys: as returned by ``prepare_query``.
            doc1: query vector.
            dictionary: mapping whose keys are the document names.
            threshold: exclusive lower bound on the cosine similarity.

        Returns:
            dict of name -> cosine similarity for the matching documents.
        '''
        results = {}
        for name in dictionary:
            name_tokens = name.lower().split('.')[0].split('_')[1:]
            doc2 = self.words2vector(query, keys, name_tokens)
            cosine_sim = self._cosine_similarity(doc1, doc2)
            if cosine_sim > threshold:
                results[name] = cosine_sim
        return results

    def search_by_content(self, query, keys, doc1, dictionary,
                          threshold=SIMILARITY_THRESHOLD):
        '''
        Score documents by their lower-cased, whitespace-split content.

        Args:
            query, keys: as returned by ``prepare_query``.
            doc1: query vector.
            dictionary: mapping of document name -> content.
            threshold: exclusive lower bound on the cosine similarity.

        Returns:
            dict of name -> cosine similarity for the matching documents.
        '''
        results = {}
        for name, content in dictionary.items():
            # str() guards against non-string values (e.g. numeric CSV cells).
            tokens = str(content).lower().split()
            doc2 = self.words2vector(query, keys, tokens)
            cosine_sim = self._cosine_similarity(doc1, doc2)
            if cosine_sim > threshold:
                results[name] = cosine_sim
        return results

    def search(self, stem_lemm=False, txt=True, csv=False,
               threshold=SIMILARITY_THRESHOLD):
        '''
        Load the documents under ``self.path`` and match them against
        ``self.keywords``.

        Args:
            stem_lemm: stem and lemmatize the text files before matching
                (text files only).
            txt: read the directory as plain-text files; takes precedence
                over ``csv``.
            csv: read the directory as CSV files (used when ``txt`` is false).
            threshold: exclusive lower bound on the cosine similarity.

        Returns:
            ``[content_matches, filename_matches]``, two dicts of
            name -> cosine similarity.

        Raises:
            ValueError: if the flag combination selects no loader.
        '''
        if txt:
            if stem_lemm:
                dictionary = self.read_txt_files_stem_lemm(self.path)
            else:
                dictionary = self.read_txt_files(self.path)
        elif csv and not stem_lemm:
            dictionary = self.read_csv_files(self.path)
        else:
            raise ValueError(
                "Unsupported combination of options: "
                f"stem_lemm={stem_lemm!r}, txt={txt!r}, csv={csv!r}"
            )

        print(f'Total Files Number: {len(dictionary)}')
        # Use the instance's own keywords (not a notebook global) and
        # lower-case them, because documents and file names are lower-cased
        # before matching.
        query, keys = self.prepare_query(self.keywords.lower())
        doc1 = self.words2vector(query, keys, keys)

        r1 = self.search_by_content(query, keys, doc1, dictionary, threshold)
        r2 = self.search_by_filename(query, keys, doc1, dictionary, threshold)

        return [r1, r2]
        
        

            

In [14]:
# Demo: look for documents about an airline in the plain-text corpus.
keywords = 'turkish airlines'

search = Search(keywords, 'datasets/search_files')
result = search.search()
# Alternative runs (uncomment one):
#   result = search.search(stem_lemm = True)      # stemmed + lemmatized text
#   search = Search(keywords,'datasets/csv')      # CSV corpus ...
#   result = search.search(txt=False, csv=True)   # ... read with the CSV loader

# result is [content_matches, filename_matches]; report both the same way.
report_sections = (
    ('\nSearched By content:\n', result[0], '\n------------------------------\n'),
    ('Searched By Filename:\n', result[1], '\n\n'),
)
for heading, matches, trailer in report_sections:
    print(heading)
    print(f'Number Of Matches: {len(matches)}')
    print(matches, trailer)

Total Files Number: 2233

Searched By content:

Number Of Matches: 39
{'1122_business.txt': 0.7071067811865475, '1123_business.txt': 0.7071067811865475, '1235_business.txt': 0.7071067811865475, '1250_business.txt': 0.7071067811865475, '1286_politics.txt': 0.7071067811865475, '1384_business.txt': 0.7071067811865475, '1460_tech.txt': 0.7071067811865475, '1495_business.txt': 0.7071067811865475, '1502_business.txt': 0.7071067811865475, '1584_politics.txt': 0.7071067811865475, '1662_business.txt': 0.7071067811865475, '1758_business.txt': 0.7071067811865475, '1769_business.txt': 0.7071067811865475, '1801_business.txt': 0.7071067811865475, '181_tech.txt': 0.7071067811865475, '1823_business.txt': 0.7071067811865475, '1896_business.txt': 0.7071067811865475, '1947_business.txt': 0.7071067811865475, '1949_business.txt': 0.7071067811865475, '1963_business.txt': 0.7071067811865475, '1965_sport.txt': 0.7071067811865475, '2057_business.txt': 0.7071067811865475, '2061_politics.txt': 0.7071067811865475

In [21]:
# Demo: a multi-word query that includes a very common word ("of").
keywords = 'call of duty'

search = Search(keywords, 'datasets/search_files')
result = search.search()
# Alternative runs (uncomment one):
#   result = search.search(stem_lemm = True)      # stemmed + lemmatized text
#   search = Search(keywords,'datasets/csv')      # CSV corpus ...
#   result = search.search(txt=False, csv=True)   # ... read with the CSV loader

# result is [content_matches, filename_matches]; report both the same way.
report_sections = (
    ('\nSearched By content:\n', result[0], '\n------------------------------\n'),
    ('Searched By Filename:\n', result[1], '\n\n'),
)
for heading, matches, trailer in report_sections:
    print(heading)
    print(f'Number Of Matches: {len(matches)}')
    print(matches, trailer)

Total Files Number: 2233

Searched By content:

Number Of Matches: 38
{'1049_tech.txt': 0.7433111162394346, '1118_sport.txt': 0.7001400420140049, '1223_sport.txt': 0.7745966692414834, '1277_sport.txt': 0.7745966692414834, '1312_politics.txt': 0.7001400420140049, '1337_tech.txt': 0.7328457798629513, '1360_business.txt': 0.8154778698873298, '1425_sport.txt': 0.8164965809277261, '1443_politics.txt': 0.7745966692414834, '1545_politics.txt': 0.7921180343813395, '1588_politics.txt': 0.7003009752518536, '1633_sport.txt': 0.7745966692414834, '1636_sport.txt': 0.7745966692414834, '1702_business.txt': 0.7001400420140049, '1754_tech.txt': 0.762073496288714, '177_sport.txt': 0.8164965809277259, '1829_business.txt': 0.7001400420140049, '1965_sport.txt': 0.7745966692414834, '2024_politics.txt': 0.7302967433402215, '2096_sport.txt': 0.8164965809277259, '2134_sport.txt': 0.7745966692414834, '21_tech.txt': 0.7504787743864565, '265_sport.txt': 0.7745966692414834, '286_business.txt': 0.7001400420140049, 

In [None]:
# # Search data Generation

# # search files generation
# df = pd.read_csv("datasets/bbc-text.csv") 
# for index,row in df.iterrows():
#     file = open(f'datasets/search_files/{index}_{row[0]}.txt','w+')
#     file.write(f'{row[1]}')
#     file.close()

# # secret files generation
# t = ['this is a secret','Hi','No this is def not the secret','welcome to my secret','secret secret secret secret','my secret is that im in love']
# for i in range(0,6,1):
#     file = open(f'datasets/search_files/{i}_secret.txt','w+')
#     file.write(f'{t[i]}')
    
    
# # we will try to search for secret files hiding among hundreds of other files; also, not all secret files are the real one