Search.py

In [None]:
import math
import numpy as np

"""
These algorithms have been taken from the paper:
Trotman et al., Improvements to BM25 and Language Models Examined
"""
class search:
    """Base class for bag-of-words ranking over a tokenized corpus.

    ``corpus`` is a sequence of documents, each document being a sequence of
    tokens. The constructor records per-document term frequencies and lengths,
    then asks the subclass to derive idf weights through ``_calc_idf``.

    Subclasses must implement ``_calc_idf`` and ``get_scores``.
    """

    def __init__(self, corpus):
        self.corpus_size = len(corpus)
        self.avgdl = 0        # average document length, in tokens
        self.doc_freqs = []   # one {token: count} dict per document
        self.idf = {}         # token -> idf weight, filled by _calc_idf
        self.doc_len = []     # token count of each document

        nd = self._initialize(corpus)
        self._calc_idf(nd)

    def _initialize(self, corpus):
        """Collect term statistics for ``corpus``.

        Fills ``doc_len``, ``doc_freqs`` and ``avgdl``.

        :return: dict mapping each token to the number of documents containing it
        """
        nd = {}
        total_tokens = 0
        for document in corpus:
            self.doc_len.append(len(document))
            total_tokens += len(document)

            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.doc_freqs.append(frequencies)

            for word in frequencies:
                nd[word] = nd.get(word, 0) + 1

        # An empty corpus has no meaningful average length; keep it at 0
        # instead of dividing by zero.
        self.avgdl = total_tokens / self.corpus_size if self.corpus_size else 0
        return nd

    def _calc_idf(self, nd):
        """Populate ``self.idf`` from the document frequencies ``nd``."""
        raise NotImplementedError()

    def get_scores(self, query):
        """Return one relevance score per indexed document for ``query``."""
        raise NotImplementedError()

    def get_top_n(self, query, documents, n=5):
        """Return the ``n`` best matches for ``query``.

        :param query: iterable of query tokens
        :param documents: sequence aligned with the indexed corpus (same length)
        :param n: maximum number of results
        :return: ``(indexes, docs)`` where ``indexes`` is a numpy array of corpus
                 positions ordered by descending score and ``docs`` holds the
                 matching entries of ``documents``
        :raises AssertionError: if ``documents`` and the corpus differ in length
        """
        # Raised explicitly (instead of ``assert``) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if self.corpus_size != len(documents):
            raise AssertionError("The documents given don't match the index corpus!")

        scores = self.get_scores(query)
        top_n = np.argsort(scores)[::-1][:n]
        return top_n, [documents[i] for i in top_n]


class search_by_BM25(search):
    """BM25 ranking with an epsilon floor for negative idf values.

    :param corpus: sequence of tokenized documents
    :param k1: term-frequency saturation parameter
    :param b: document-length normalisation strength
    :param epsilon: multiplier applied to the average idf to obtain the value
                    assigned to terms whose raw idf is negative
    """

    def __init__(self, corpus, k1=1.5, b=0.75, epsilon=0.25):
        # Parameters must be set before the base constructor runs because it
        # calls _calc_idf, which reads self.epsilon.
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        super().__init__(corpus)

    def _calc_idf(self, nd):
        """
        Calculates frequencies of terms in documents and in corpus.
        This algorithm sets a floor on the idf values to eps * average_idf
        """
        # collect idf sum to calculate an average idf for epsilon value
        idf_sum = 0
        # collect words with negative idf to set them a special epsilon value.
        # idf can be negative if word is contained in more than half of documents
        negative_idfs = []
        for word, freq in nd.items():
            idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)

        # No vocabulary (empty corpus or only empty documents): avoid 0/0.
        self.average_idf = idf_sum / len(self.idf) if self.idf else 0

        eps = self.epsilon * self.average_idf
        for word in negative_idfs:
            self.idf[word] = eps

    def get_scores(self, query):
        """
        The ATIRE BM25 variant uses an idf function which uses a log(idf) score. To prevent negative idf scores,
        this algorithm also adds a floor to the idf value of epsilon.
        See [Trotman, A., X. Jia, M. Crane, Towards an Efficient and Effective Search Engine] for more info
        :param query: iterable of query tokens
        :return: numpy array with one score per indexed document
        """
        score = np.zeros(self.corpus_size)
        if not self.avgdl:
            # Nothing was indexed, so no term can match.
            return score

        doc_len = np.array(self.doc_len)
        # The length normalisation does not depend on the query term.
        length_norm = self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
        for q in query:
            idf = self.idf.get(q)
            if not idf:
                # Unknown term or zero weight: contributes nothing.
                continue
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += idf * (q_freq * (self.k1 + 1) / (q_freq + length_norm))
        return score

Auto-tagging-Script.py

In [None]:
# This file includes source code from this repository --> https://github.com/acrosson/nlp/tree/master/subject_extraction

import warnings
warnings.filterwarnings('ignore')
import os
import sqlite3
import re
import numpy as np
from gensim.parsing.preprocessing import STOPWORDS
from nltk.tokenize import word_tokenize, sent_tokenize

import nltk
from nltk.stem import WordNetLemmatizer

from nltk import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
import pickle

class SubjectTrigramTagger(object):
    """Trigram part-of-speech tagger with a chain of backoff taggers.

    Lookup order is trigram -> bigram -> unigram -> a default tagger that
    labels every word as a noun (NN).
    """

    def __init__(self, train_sents):
        """
        train_sents: sentences that have already been tagged, used to train
                the unigram, bigram and trigram taggers.
        """
        # Build the chain from the least to the most specific tagger; each
        # stage falls back to the one built before it.
        chain = DefaultTagger('NN')
        for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
            chain = tagger_cls(train_sents, backoff=chain)
        self.tagger = chain

    def tag(self, tokens):
        """Return the tagging produced by the trigram tagger for ``tokens``."""
        tagged = self.tagger.tag(tokens)
        return tagged

class AutoTags:
    """Derives search tags (entities and subject-verb-object phrases) from text."""

    def __init__(self):
        # Penn Treebank tags treated as nouns / verbs by get_svo and extract_subject.
        self.NOUNS = ['NN', 'NNS', 'NNP', 'NNPS']
        self.VERBS = ['VB', 'VBG', 'VBD', 'VBN', 'VBP', 'VBZ']

    def clean_document(self, document):
        """Remove erroneous characters, extra whitespace and stop words."""
        document = re.sub('[^A-Za-z .-]+', ' ', document)
        # split() already collapses runs of whitespace.
        return ' '.join(word for word in document.split() if word not in STOPWORDS)

    def tokenize_sentences(self, document):
        """Split ``document`` into sentences and each sentence into word tokens."""
        return [word_tokenize(sent) for sent in sent_tokenize(document)]

    def get_entities(self, document):
        """Returns Named Entities using NLTK Chunking"""
        entities = []
        sentences = self.tokenize_sentences(document)

        # Part of Speech Tagging
        sentences = [nltk.pos_tag(sent) for sent in sentences]
        for tagged_sentence in sentences:
            for chunk in nltk.ne_chunk(tagged_sentence):
                # Sub-trees are the recognised entities; plain tuples are
                # ordinary (token, tag) leaves.
                if isinstance(chunk, nltk.tree.Tree):
                    entities.append(' '.join(c[0] for c in chunk).lower())
        return entities

    def word_freq_dist(self, document):
        """Returns a word count frequency distribution"""
        words = [word.lower() for word in nltk.tokenize.word_tokenize(document)]
        # Lower-case before the stop-word test so capitalised stop words
        # ("The", "And") are dropped too.
        words = [word for word in words if word not in STOPWORDS]
        return nltk.FreqDist(words)

    def extract_subject(self, document):
        """Return the entity most likely to be the document's subject, or ""."""
        # Get most frequent Nouns
        fdist = self.word_freq_dist(document)
        most_freq_nouns = [w for w, c in fdist.most_common(10)
                           if nltk.pos_tag([w])[0][1] in self.NOUNS]

        # Get Top 10 entities. Count on the raw list: de-duplicating first
        # would give every entity a count of 1 and make "top 10" arbitrary.
        entities = self.get_entities(document)
        top_10_entities = [w for w, c in nltk.FreqDist(entities).most_common(10)]

        # Get the subject noun by looking at the intersection of top 10 entities
        # and most frequent nouns. It takes the first element in the list
        for entity in top_10_entities:
            if entity.split()[0] in most_freq_nouns:
                return entity
        return ""

    # def trained_tagger(self,existing=False):
    #     """Returns a trained trigram tagger
    #     existing : set to True if already trained tagger has been pickled
    #     """
    #     if existing:
    #         trigram_tagger = pickle.load(
    #             open(r'DataBase/trained_tagger.pkl', 'rb'))
    #         return trigram_tagger

    #     # Aggregate trained sentences for N-Gram Taggers
    #     train_sents = nltk.corpus.brown.tagged_sents()
    #     train_sents += nltk.corpus.conll2000.tagged_sents()
    #     train_sents += nltk.corpus.treebank.tagged_sents()

    #     # Create instance of SubjectTrigramTagger and persist instance of it
    #     trigram_tagger = SubjectTrigramTagger(train_sents)
    #     pickle.dump(trigram_tagger, open(r'DataBase/trained_tagger.pkl', 'wb'))

    #     return trigram_tagger

    def merge_multi_word_subject(self, sentences, subject):
        """Merges multi word subjects into one single token
        ex. ['steve', 'jobs', 'spoke'] -> ['steve jobs', 'spoke']

        The sentence lists are modified in place (callers rely on that) and
        the same outer list is returned. Matching is case-insensitive on the
        sentence side; every occurrence in a sentence is merged.
        """
        subject_lst = subject.split()
        width = len(subject_lst)
        if width <= 1:
            return sentences

        for sentence in sentences:
            lowered = [word.lower() for word in sentence]
            merged = []
            j = 0
            while j < len(sentence):
                if lowered[j:j + width] == subject_lst:
                    merged.append(subject)
                    j += width
                else:
                    merged.append(sentence[j])
                    j += 1
            # Rebuilding the sentence keeps indices consistent even when the
            # subject appears several times.
            sentence[:] = merged
        return sentences

    def tag_sentences(self, subject, document):
        """Returns tagged sentences using POS tagging"""
        # NOTE(security): pickle executes arbitrary code on load; this file
        # must only ever come from a trusted source.
        with open(r'DataBase/trained_tagger.pkl', 'rb') as handle:
            trigram_tagger = pickle.load(handle)

        # Tokenize Sentences and words
        sentences = self.tokenize_sentences(document)
        self.merge_multi_word_subject(sentences, subject)

        # Filter out sentences where subject is not present
        sentences = [sentence for sentence in sentences if subject in
                     [word.lower() for word in sentence]]

        # Tag each sentence
        return [trigram_tagger.tag(sent) for sent in sentences]

    def get_svo(self, sentence, subject):
        """Returns a dictionary containing:
        subject : the subject determined earlier
        action : the action verb of particular related to the subject
        object : the object the action is referring to
        phrase : list of token, tag pairs for that lie within the indexes of
                    the variables above

        Returns an empty dict when the subject does not occur in ``sentence``
        or no verb followed by a noun is found after it.
        """
        subject_idx = next((i for i, v in enumerate(sentence)
                            if v[0].lower() == subject), None)
        if subject_idx is None:
            return {}

        data = {'subject': subject}
        for i in range(subject_idx, len(sentence)):
            found_action = False
            for j, (token, tag) in enumerate(sentence[i + 1:]):
                if tag in self.VERBS:
                    data['action'] = token
                    found_action = True
                if tag in self.NOUNS and found_action:
                    data['object'] = token
                    data['phrase'] = sentence[i: i + j + 2]
                    return data
        return {}

    def get_auto_tags_from_document(self, text, doc_id):
        """Build automatic tags for ``text``.

        ``doc_id`` is accepted for interface compatibility; it is not used by
        the current implementation.

        :return: ``(auto_tags, svos)`` where both items are ``str()`` of a
                 list (``svos`` is "" when no subject phrase was extracted)
        """
        document = self.clean_document(text)
        entities = self.get_entities(document)

        if not entities:
            # No named entities: fall back to the distinct words of the text.
            auto_tags = list(set(word_tokenize(document.lower())))
            auto_tags = [word for word in auto_tags if word not in STOPWORDS and len(word) > 2]
            return str(auto_tags), ""

        entities = list(set(entities))

        # final list for storing automatic generated tags (auto_tags).
        # NOTE: auto_tags is the same list object as entities, so the
        # membership test in the loop below also sees tags appended there.
        auto_tags = entities

        subject = self.extract_subject(document)
        if subject == "":
            # TODO: when there is no subject, tokenizing the stored summary
            # (looked up by doc_id) may give better tags than the entities alone.
            return str(auto_tags), ""

        tagged_sents = self.tag_sentences(subject, document)

        svos_list = []
        for sentence in tagged_sents:
            svo = self.get_svo(sentence, subject)
            if not svo:
                continue
            svo_word = svo["subject"] + " " + svo["action"] + " " + svo["object"]
            svos_list.append(svo_word)

            auto_tags.append(svo_word)
            for token, _tag in svo["phrase"]:
                temp_word = token.lower()
                if temp_word not in entities and temp_word not in STOPWORDS:
                    auto_tags.append(temp_word)

        auto_tags = [word for word in auto_tags if word not in STOPWORDS and len(word) > 2]
        return str(auto_tags), str(svos_list)

Final fulldb scrapit.py

In [None]:
import warnings
warnings.filterwarnings('ignore')

import re
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity


import PyPDF2
from docx import Document

import textract


def writeTofile(data, filename):
    """Write the bytes ``data`` to ``filename`` and report the path on stdout."""
    # Binary mode: the payload is stored exactly as given.
    with open(filename, mode='wb') as sink:
        sink.write(data)
    print("Stored blob data into: ", filename, "\n")

# File extensions (without the dot, lower case) that PreProcess can read.
valid_extensions = {'docx', 'pptx', 'txt', 'pdf'}


class PreProcess:
    """Extracts plain text from a docx, pdf, txt or pptx file given its path."""

    def __init__(self, file):
        # Path of the document to read.
        self.file = file

    def check_extension(self):
        """Return True when the file's extension is one of ``valid_extensions``."""
        return self.get_extension() in valid_extensions

    def get_extension(self):
        """Return the text after the last '.' of the path (case is preserved)."""
        return self.file.split('.')[-1]

    def get_text_from_docx_document(self):
        """Return the text of all paragraphs of a .docx file, concatenated.

        Any failure is logged to stdout and the original exception is
        re-raised so the caller can see what actually went wrong.
        """
        try:
            doc = Document(self.file)
            return ''.join(para.text for para in doc.paragraphs)
        except Exception:
            print('Raising......')
            raise

    #     text = textract.process(file)
    #     text = str(text)[2:]
    #     return text

    def get_text_from_pdf_document(self):
        """Return the extracted text of every page of a PDF file, concatenated."""
        # The context manager closes the file even if a page fails to parse.
        with open(self.file, "rb") as file_obj:
            pdf_reader = PyPDF2.PdfFileReader(file_obj)
            return ''.join(pdf_reader.getPage(i).extractText()
                           for i in range(pdf_reader.numPages))

    def get_text_from_txt_document(self):
        """Return the content of a text file.

        The file is first read with the platform default encoding; if that
        fails to decode, UTF-8 is tried.

        :raises Exception: when the second attempt fails as well
        """
        try:
            with open(self.file, "r") as f:
                return f.read()
        except UnicodeDecodeError:
            pass  # fall through to the explicit UTF-8 attempt

        try:
            with open(self.file, "r", encoding="utf-8") as f:
                return f.read()
        except Exception as err:
            print("Sorry! can't decode encodings!")
            raise Exception("Sorry! can't decode bytes") from err

    def get_text_from_pptx_document(self):
        """Return the text textract extracts from a .pptx file.

        NOTE(review): the result is the ``str()`` of the returned value with
        the first two characters removed, so escape sequences stay literal
        (see remove_escape_sequences) and a trailing quote may remain —
        confirm whether decoding the bytes would be preferable.
        """
        text = textract.process(self.file)
        text = str(text)[2:]
        return text

    def remove_escape_sequences(self, text):
        """Replace each literal backslash + lowercase letter pair with a space."""
        pattern = r"\\[a-z]"
        return re.sub(pattern, " ", text)


def load_word_embeddings(path=r'DataBase/glove.6B.100d.txt'):
    """Load word vectors from a whitespace-separated text file.

    Each non-empty line is expected to hold a word followed by its vector
    components. The result is stored in the module-level ``word_embeddings``
    and also returned.

    :param path: location of the embeddings file
    :return: dict mapping word -> float32 numpy vector
    """
    global word_embeddings
    embeddings = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    # Publish only after the whole file was read successfully.
    word_embeddings = embeddings
    return embeddings


def cleaning_for_summarization(text):
    """Strip links and e-mail-like tokens, then tokenize into sentences of words.

    Returns a list with one entry per sentence, each entry being the list of
    word tokens of that sentence.
    """
    # Matches whitespace-delimited tokens containing "http(s)", "www" or "@".
    link_or_mail = r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)"
    stripped = re.sub(link_or_mail, " ", text)
    return [word_tokenize(sentence) for sentence in sent_tokenize(stripped)]


def get_summary(text, word_embeddings):
    """Return an extractive summary made of the (up to) two top-ranked sentences.

    Sentences are embedded as the averaged vectors of their words, compared
    pairwise with cosine similarity, and ranked with PageRank over the
    resulting similarity graph.

    :param text: raw document text
    :param word_embeddings: dict mapping word -> numpy vector
    :return: the summary string ("" when the text contains no sentence)
    """
    tokenized_sent = cleaning_for_summarization(text)
    if not tokenized_sent:
        return ""

    # Take the vector size from the embeddings; 100 is the fallback used
    # when no embeddings are supplied.
    dim = len(next(iter(word_embeddings.values()))) if word_embeddings else 100
    unknown = np.zeros((dim,))

    sentence_vectors = []
    for tokens in tokenized_sent:
        if tokens:
            vec = sum(word_embeddings.get(w, unknown) for w in tokens) / (len(tokens) + 0.001)
        else:
            vec = unknown
        sentence_vectors.append(vec)

    # One vectorised call computes the whole similarity matrix instead of
    # one library call per sentence pair; self-similarity is zeroed.
    sim_mat = cosine_similarity(np.vstack(sentence_vectors))
    np.fill_diagonal(sim_mat, 0.0)

    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)

    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(tokenized_sent)), reverse=True)
    summarize_text = [" ".join(tokens) for _score, tokens in ranked_sentence[:2]]

    joined = ". ".join(summarize_text)
    # Over-long summaries are cut down to the single best sentence.
    if len(joined) > 1400:
        return summarize_text[0]
    return joined

Ready for search.py

In [None]:
import sqlite3
import pickle

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from gensim.parsing.preprocessing import STOPWORDS
import re

class MakeDataForSearch:
    """Loads texts, titles, summaries, documents and tag phrases from the SQLite DB.

    After construction the instance exposes ``data``, ``titles``,
    ``summaries``, ``documents`` and ``svos``.
    """

    DB_PATH = r"DataBase/Document_finder_db2.db"

    def __init__(self, data=None, titles=None, summaries=None, documents=None, svos=None):
        # The arguments are kept for backward compatibility only: every
        # attribute is (re)loaded from the database.
        (self.data, self.titles, self.summaries,
         self.documents, self.svos) = self.get_all_texts_summary_titles_documents()

    def _fetch_all(self, query):
        """Run ``query`` and return all rows; the connection is always closed."""
        conn = sqlite3.connect(self.DB_PATH)
        try:
            return conn.execute(query).fetchall()
        finally:
            conn.close()

    def fetch_all_texts(self):
        """Return one ``(text,)`` row per entry of document_info."""
        return self._fetch_all("SELECT text from document_info")

    def fetch_all_titles(self):
        """Return one ``(title,)`` row per entry of document_info."""
        return self._fetch_all("SELECT title from document_info")

    def fetch_all_summary(self):
        """Return one ``(summary,)`` row per entry of document_summary."""
        return self._fetch_all("SELECT summary from document_summary")

    def fetch_all_svos(self):
        """Return one ``(auto_tags,)`` row per entry of document_tags."""
        return self._fetch_all("SELECT auto_tags from document_tags")

    def fetch_all_documentsWithExtensions(self):
        """Return one ``(document, extension)`` row per entry of document_info."""
        return self._fetch_all("SELECT document,extension from document_info")

    def get_all_texts_summary_titles_documents(self):
        """Collect everything the search front-end needs.

        :return: ``(texts, titles, summaries, documents, svos)`` where
                 ``documents`` maps the row position to
                 ``{"document": ..., "extension": ...}`` and ``svos`` holds,
                 per row, the multi-word tags parsed from the stored string
        """
        texts = self.fetch_all_texts()
        titles = self.fetch_all_titles()
        summaries = self.fetch_all_summary()
        svos = self.fetch_all_svos()
        tup = self.fetch_all_documentsWithExtensions()

        data_file = [text[0] for text in texts]
        title_file = [title[0] for title in titles]
        summary_file = [summary[0] for summary in summaries]

        svos_file = []
        for row in svos:
            # Tags are stored as the str() of a list; strip the brackets and
            # split on the quote characters. NULL rows yield no tags.
            parts = (row[0] or "")[1:-1].split("'")
            parts = [word for word in parts if word not in STOPWORDS and len(word) > 2]
            # Only multi-word entries are kept.
            svos_file.append([word for word in parts if len(word.split()) > 1])

        dictionary = {index: {"document": blob, "extension": extension}
                      for index, (blob, extension) in enumerate(tup)}

        return data_file, title_file, summary_file, dictionary, svos_file


def get_corpus(text):
    """
    Turn raw text into a list of lemmatized tokens.

    Web addresses, e-mail-like tokens and every non-letter character are
    replaced by spaces, the text is lower-cased and tokenized, stop words and
    tokens of one or two characters are dropped, and the remaining tokens are
    lemmatized first with the default part of speech and then as verbs.
    """
    link_or_mail = r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)"
    cleaned = re.sub(link_or_mail, " ", str(text))
    cleaned = re.sub("[^a-zA-Z]", " ", cleaned).lower()

    lemmatizer = WordNetLemmatizer()
    noun_lemmas = []
    for token in word_tokenize(cleaned):
        if token in STOPWORDS or len(token) <= 2:
            continue
        noun_lemmas.append(lemmatizer.lemmatize(token))
    return [lemmatizer.lemmatize(token, pos='v') for token in noun_lemmas]


def get_latest_text_title():
    """Return ``[(title, text)]`` for the row of document_info with the highest rowid.

    The list is empty when the table has no rows.
    """
    conn = sqlite3.connect(r"DataBase/Document_finder_db2.db")
    try:
        c = conn.cursor()
        c.execute("SELECT title,text from document_info where rowid = (SELECT MAX(rowid) FROM document_info)")
        return c.fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()

def get_latest_tags():
    """Return ``(manual_tags, auto_tags)`` of the document_tags row with the highest rowid.

    Returns None when the table has no rows.
    """
    conn = sqlite3.connect(r"DataBase/Document_finder_db2.db")
    try:
        c = conn.cursor()
        c.execute("SELECT manual_tags,auto_tags from document_tags where rowid = (SELECT MAX(rowid) FROM document_tags)")
        return c.fetchone()
    finally:
        # Close the connection even when the query fails.
        conn.close()


# corpus = []
# for i in range(len(data)):
#     corpus.append(apply_all(titles[i]) + apply_all(data[i]))

# pickle.dump(corpus, open("corpus_file.pkl","wb"))

# if __name__ == '__main__':
def _load_pickle_file(path):
    """Unpickle and return the object stored at ``path``, closing the file."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _dump_pickle_file(obj, path):
    """Pickle ``obj`` to ``path``, closing (and thereby flushing) the file."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def maintaining_all_files():
    """Refresh the pickled search files after a document was added.

    Rewrites the data/title/summary/document/svos pickles from the database,
    then appends the most recent database row to the three search corpora
    (full text, tags, titles).
    """
    obj = MakeDataForSearch([], [], [], {}, [])

    _dump_pickle_file(obj.data, r"DataBase/data_file.pkl")
    _dump_pickle_file(obj.titles, r"DataBase/title_file.pkl")
    _dump_pickle_file(obj.summaries, r"DataBase/summary_file.pkl")
    _dump_pickle_file(obj.documents, r"DataBase/document_file.pkl")
    _dump_pickle_file(obj.svos, r"DataBase/svos_file.pkl")

    print("Files is updated")

    # appending to the main corpus
    corpus = _load_pickle_file(r"DataBase/corpus_file.pkl")

    title, text = get_latest_text_title()[0]
    corpus.append(get_corpus(text) + get_corpus(title))

    _dump_pickle_file(corpus, r"DataBase/corpus_file.pkl")

    # appending to the tags corpus
    final_auto_tags = _load_pickle_file(r"DataBase/tags_pickle.pkl")

    manual_raw, auto_raw = get_latest_tags()
    # Tags are stored as the str() of a list: strip the brackets and split
    # on the quote characters.
    temp_auto_tags = auto_raw[1:-1].split("'")
    temp_manual_tags = manual_raw[1:-1].split("'")

    auto_tags = temp_manual_tags + temp_auto_tags
    auto_tags = [word for word in auto_tags if word not in STOPWORDS and len(word) > 2]

    final_auto_tags.append(auto_tags)

    _dump_pickle_file(final_auto_tags, r"DataBase/tags_pickle.pkl")

    # appending to title corpus
    title_corpus = _load_pickle_file(r"DataBase/title_corpus.pkl")
    title_corpus.append(get_corpus(title))

    _dump_pickle_file(title_corpus, r"DataBase/title_corpus.pkl")

    print("corpus is updated")


Search by default feature

In [None]:
from Search import search_by_BM25

import nltk
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist

from gensim.parsing.preprocessing import STOPWORDS

from spellchecker import SpellChecker

def clean_query(query):
    '''
    Function to perform lemmatization and cleaning on query

    Stop words are removed, every remaining token is lemmatized (default
    part of speech, then as a verb) and duplicates are dropped. Tokens the
    spell checker does not know are replaced by its best suggestion; when it
    has no suggestion the token is kept unchanged.

    Returns the list of query tokens to search for.
    '''
    lemmatizer = WordNetLemmatizer()
    lemmed = [lemmatizer.lemmatize(word) for word in word_tokenize(query) if word not in STOPWORDS]
    lemmed = [lemmatizer.lemmatize(word, pos='v') for word in lemmed]
    lemmed = list(set(lemmed))

    # applying spell checker on tags
    spell = SpellChecker()
    misspelled = spell.unknown(lemmed)
    if not misspelled:
        return lemmed

    correct_words = [word for word in lemmed if word not in misspelled]
    correction = []
    new_query = query
    for word in misspelled:
        # correction() may return None when it finds no candidate; fall back
        # to the word itself so str.replace and the token list stay valid.
        fixed = spell.correction(word) or word
        correction.append(fixed)
        new_query = new_query.replace(word, fixed)

    print(f"Searching for {new_query} instead of {query}")
    return correct_words + correction

if __name__ == '__main__':
    # Command-line search: choose a corpus, enter a query, print up to ten hits.

    def _read_pickle(path):
        """Unpickle ``path`` and close the file."""
        with open(path, "rb") as handle:
            return pickle.load(handle)

    data = _read_pickle(r"DataBase/data_file.pkl")
    titles = _read_pickle(r"DataBase/title_file.pkl")

    corpus_files = {
        'default search': r"DataBase/corpus_file.pkl",
        'tag search': r"DataBase/tags_pickle.pkl",
        'title search': r"DataBase/title_corpus.pkl",
    }

    option = input("Enter option of search")
    if option not in corpus_files:
        print('Not valid option')
        # Without a corpus there is nothing to search.
        raise SystemExit(1)
    corpus = _read_pickle(corpus_files[option])

    bm25 = search_by_BM25(corpus)
    query = input("Enter query")
    tokenized_query = clean_query(query.lower())

    indexes, results = bm25.get_top_n(tokenized_query, data, n=20)
    results_titles = [titles[i] for i in indexes]

    # Show at most ten hits; small corpora may return fewer.
    for i, (title, text) in enumerate(zip(results_titles[:10], results[:10])):
        print(f"Title_{i}: {title}")
        print(f"\nText_{i}: {text}")
        print('\n\n')

app.py

In [None]:
from flask import Flask, flash, request, redirect, render_template
from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField, FileField
import os
import urllib.request
import sqlite3

from werkzeug.utils import secure_filename
import re
from Search import search_by_BM25

import pickle
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer

from gensim.parsing.preprocessing import STOPWORDS

from spellchecker import SpellChecker
import random
from docx import Document
from auto_tagging_script import AutoTags

from final_script_fulldb import load_word_embeddings, cleaning_for_summarization, get_summary, writeTofile
from final_script_fulldb import PreProcess, valid_extensions
from main import *
from ready_for_search import *
def get_text_from_docx_document(file):
    """Return the text of all paragraphs of the .docx ``file``, concatenated.

    Any failure is logged to stdout and the original exception is re-raised
    so the caller can see what actually went wrong.
    """
    try:
        doc = Document(file)
        return ''.join(para.text for para in doc.paragraphs)
    except Exception:
        print('Raising......')
        raise
def clean_query(query):
    '''
    Function to perform lemmatization and cleaning on query

    Possessive suffixes are removed and "n't" is expanded to " not" before
    tokenizing. Stop words are dropped, tokens are lemmatized (default part
    of speech, then as a verb) and de-duplicated. Tokens unknown to the
    spell checker are replaced by its best suggestion; when it has none the
    token is kept unchanged.

    Returns ``(tokens, query, new_query)``: the tokens to search for, the
    normalised query and the query with spelling corrections applied.
    '''
    query = re.sub("'s", "", query)
    query = re.sub("s'", "", query)
    query = re.sub("n't", " not", query)

    lemmatizer = WordNetLemmatizer()
    lemmed = [lemmatizer.lemmatize(word) for word in word_tokenize(query) if word not in STOPWORDS]
    lemmed = [lemmatizer.lemmatize(word, pos='v') for word in lemmed]
    lemmed = list(set(lemmed))

    # applying spell checker on tags
    spell = SpellChecker()
    misspelled = spell.unknown(lemmed)
    new_query = query
    if not misspelled:
        return lemmed, query, new_query

    correct_words = [word for word in lemmed if word not in misspelled]
    correction = []
    for word in misspelled:
        # correction() may return None when it finds no candidate; fall back
        # to the word itself so str.replace and the token list stay valid.
        fixed = spell.correction(word) or word
        correction.append(fixed)
        new_query = new_query.replace(word, fixed)

    lemmed = correct_words + correction
    print(f"Searching for {new_query} instead of {query}")
    return lemmed, query, new_query


app = Flask(__name__)
# NOTE(security): the secret key is hard-coded in source; it should be
# supplied from configuration or the environment for any real deployment.
app.config['SECRET_KEY'] = 'Hard to guess string'


# Upper bound for request bodies (1 GiB). The key must be spelled exactly
# 'MAX_CONTENT_LENGTH' (no trailing whitespace) or Flask ignores it.
app.config['MAX_CONTENT_LENGTH'] = 1024 * 1024 * 1024
# Upload extensions accepted by allowed_file (lower case, without the dot).
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'docx', 'pptx'}


def allowed_file(filename):
    """Return True when ``filename`` has a dot and its extension is allowed.

    The extension is the text after the last dot, compared case-insensitively.
    """
    _stem, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS



@app.route('/')
def index():
    """Serve the landing page by rendering the 'index.html' template."""
    return render_template('index.html')

@app.route('/searchByTag', methods = ['POST', 'GET'])
def viewSearchbyTag():
    if request.method == 'POST':
        mystring = "Tag"
        query = request.form['namesearchbytag']

        data = pickle.load(open(r"DataBase/data_file.pkl", "rb"))
        titles = pickle.load(open(r"DataBase/title_file.pkl", "rb"))
        auto_tag = pickle.load(open(r"DataBase/svos_file.pkl", "rb"))
        summary = pickle.load(open(r"DataBase/summary_file.pkl", "rb"))

        corpus = pickle.load(open(r"DataBase/tags_pickle.pkl", "rb"))
        bm25 = search_by_BM25(corpus)

        tokenized_query, old_query, new_query = clean_query(query.lower())

        indexes, results = bm25.get_top_n(tokenized_query, data, n=5)
        results_titles = []
        results_summaries = []
        results_tags = []

        for i in indexes:
            results_titles.append(titles[i])
            results_summaries.append(summary[i])
            if auto_tag[i] != []:
                results_tags.append(list(set(random.choices(auto_tag[i], k=3))))
            else:
                results_tags.append(['No Auto tags'])
        text = []
        for i in results:
            text_to_show = " ".join(sent_tokenize(i)[:2])
            if text_to_show != '':
                text.append(text_to_show + '....')
            else:
                text.append(i)
        # text = results
        title = results_titles
        summaries = results_summaries
        tags = results_tags

        title_len = len(title)

        document_file = pickle.load(open(r"DataBase/document_file.pkl", "rb"))
        extension_list = []
        for i in indexes:
            extension_list.append(document_file[i]["extension"])


        return render_template('searchbyText.html', text=text, tag=query, title=title, summaries=summaries, tags=tags,
                                   type=mystring ,title_len = title_len, old_query=old_query, new_query=new_query,extension_list=extension_list)



@app.route('/searchByText', methods = ['POST', 'GET'])
def viewSearchbyText():
    if request.method == 'POST':
        mystring = "Text"
        query = request.form['namesearchbytext']

        data = pickle.load(open(r"DataBase/data_file.pkl", "rb"))
        titles = pickle.load(open(r"DataBase/title_file.pkl", "rb"))
        auto_tag = pickle.load(open(r"DataBase/svos_file.pkl", "rb"))
        summary = pickle.load(open(r"DataBase/summary_file.pkl", "rb"))

        corpus = pickle.load(open(r"DataBase/corpus_file.pkl", "rb"))
        bm25 = search_by_BM25(corpus)

        tokenized_query, old_query, new_query = clean_query(query.lower())

        indexes, results = bm25.get_top_n(tokenized_query, data, n=5)
        results_titles = []
        results_summaries = []
        results_tags = []

        for i in indexes:
            results_titles.append(titles[i])
            results_summaries.append(summary[i])
            if auto_tag[i] != []:
                results_tags.append(list(set(random.choices(auto_tag[i], k=3))))
            else:
                results_tags.append(['No Auto tags'])
        text = []
        for i in results:
            text_to_show = " ".join(sent_tokenize(i)[:2])
            if text_to_show != '':
                text.append(text_to_show + '....')
            else:
                text.append(i)
        # text = results
        title = results_titles
        summaries = results_summaries
        tags = results_tags

        title_len = len(title)

        document_file = pickle.load(open(r"DataBase/document_file.pkl", "rb"))
        extension_list = []
        for i in indexes:
            extension_list.append(document_file[i]["extension"])

        # return render_template('searchbyText.html', text=text, tag=query, title=title, summaries=summaries, tags=tags, title_len = title_len)
        return render_template('searchbyText.html', text=text, tag=query, title=title, summaries=summaries, tags=tags,
                           type=mystring,title_len=title_len, old_query=old_query, new_query=new_query,extension_list=extension_list)


@app.route('/searchByTitle', methods=['POST', 'GET'])
def viewSearchbyTitle():
    """Run a BM25 search over the title corpus and render the top five hits.

    On POST the query is read from the ``namesearchbytitle`` form field,
    cleaned with ``clean_query`` and scored against the pickled title corpus.
    The matching documents, their titles, summaries, sampled auto tags and
    file extensions are passed to ``searchbyText.html``.

    Any other method is redirected to ``/`` because there is no query to run.
    """
    if request.method != 'POST':
        # The route also accepts GET; without this branch the view returned
        # None, which Flask rejects as an invalid response.
        return redirect('/')

    def load_pickle(path):
        # Close the handle deterministically instead of leaking it.
        # NOTE(review): pickle is only safe on files this application wrote.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    mystring = "Title"
    query = request.form['namesearchbytitle']
    data = load_pickle(r"DataBase/data_file.pkl")
    titles = load_pickle(r"DataBase/title_file.pkl")
    auto_tag = load_pickle(r"DataBase/svos_file.pkl")
    summary = load_pickle(r"DataBase/summary_file.pkl")

    corpus = load_pickle(r"DataBase/title_corpus.pkl")
    bm25 = search_by_BM25(corpus)

    tokenized_query, old_query, new_query = clean_query(query.lower())

    indexes, results = bm25.get_top_n(tokenized_query, data, n=5)

    title = []
    summaries = []
    tags = []
    for i in indexes:
        title.append(titles[i])
        summaries.append(summary[i])
        if auto_tag[i] != []:
            # choices() samples with replacement; set() drops repeats, so
            # between one and three tags are shown per document.
            tags.append(list(set(random.choices(auto_tag[i], k=3))))
        else:
            tags.append(['No Auto tags'])

    # Show the first two sentences of each hit, or the raw text when the
    # sentence tokenizer yields nothing.
    text = []
    for body in results:
        text_to_show = " ".join(sent_tokenize(body)[:2])
        if text_to_show != '':
            text.append(text_to_show + '....')
        else:
            text.append(body)

    document_file = load_pickle(r"DataBase/document_file.pkl")
    extension_list = [document_file[i]["extension"] for i in indexes]

    return render_template('searchbyText.html', text=text, tag=query, title=title, summaries=summaries, tags=tags,
                           type=mystring, title_len=len(title), old_query=old_query, new_query=new_query,
                           extension_list=extension_list)


@app.route('/', methods=['POST'])
def upload_file():
    """Save each uploaded file, index it with ``main`` and delete the copy.

    Every entry of the ``files[]`` form field that passes ``allowed_file`` is
    stored in ``app.config['UPLOAD_FOLDER']``, processed under a title derived
    from its own filename, and then removed from the upload folder.

    Previously only the last saved file was processed (with the first file's
    title), and a request without any allowed file hit an unbound ``filename``.

    Returns:
        A redirect to the request URL when the form field is missing,
        otherwise a redirect to ``/``.
    """
    if request.method == 'POST':
        if 'files[]' not in request.files:
            return redirect(request.url)
        files = request.files.getlist(r'files[]')

        for file in files:
            if not (file and allowed_file(file.filename)):
                continue

            # taking the filename (without its extension) as the title
            title = " ".join(file.filename.split('.')[:-1])
            filename = secure_filename(file.filename)
            saved_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)

            try:
                file.save(saved_path)
                print(saved_path)
                # go to that file and read it
                main(saved_path, title)
            except Exception as error:
                # Top-level request boundary: report the failure and carry on
                # with the remaining files instead of aborting the request.
                print("Failed to process uploaded file", filename, repr(error))
            finally:
                # after processing (successful or not) delete the stored copy
                if os.path.exists(saved_path):
                    os.remove(saved_path)

        return redirect('/')




var_path = ""
@app.route('/path', methods=['POST'])
def choose():
    """Store the working directory posted in ``folder_path``.

    The value is kept in the module-level ``var_path`` and also becomes the
    app's ``UPLOAD_FOLDER``; the client is then redirected to ``/``.
    """
    global var_path

    # Reset first, exactly as before, so the config never keeps a stale value.
    app.config['UPLOAD_FOLDER'] = ""

    chosen_folder = request.form.get('folder_path')
    var_path = chosen_folder
    print(var_path)

    app.config['UPLOAD_FOLDER'] = var_path
    return redirect('/')


@app.route('/nopage')
def noaccountpagefunction():
    """Render the static ``nopage.html`` template."""
    page = render_template('nopage.html')
    return page


# tempdiv = ""
myclassname = ""


@app.route('/filenameonclick', methods=['GET', 'POST'])
def filenameonclick():
    """Write the stored document whose title was clicked into ``var_path``.

    On POST the title comes from the ``myclassname`` form field. The first
    entry of the pickled title list equal to it selects the record in
    ``document_file.pkl``; its blob is written to
    ``<var_path>/<sanitised title>.<extension>`` via ``writeTofile``.

    Returns:
        ``redirect.html`` on success, ``checkworking.html`` with a message
        when the working directory is missing or the title is unknown, and a
        redirect to ``/`` for non-POST requests.
    """
    if request.method != 'POST':
        # GET is allowed by the route but previously produced no response.
        return redirect('/')

    if not os.path.exists(var_path):
        mymessage = "Please enter the working directory for current session."
        return render_template('checkworking.html' , mymessage=mymessage)

    myclassname = request.form['myclassname']
    print(myclassname)

    with open(r"DataBase/title_file.pkl", "rb") as handle:
        titles = pickle.load(handle)

    # First position whose title matches; None when nothing matches
    # (the original left ``index`` unbound and crashed with NameError).
    index = next((i for i, known in enumerate(titles) if known == myclassname), None)
    if index is None:
        mymessage = "The requested document could not be found."
        return render_template('checkworking.html', mymessage=mymessage)

    with open(r"DataBase/document_file.pkl", "rb") as handle:
        document_file = pickle.load(handle)

    blob_data = document_file[index]["document"]
    extension = document_file[index]["extension"]

    # Keep the output file name short and free of the listed punctuation.
    if len(myclassname) > 80:
        myclassname = myclassname[:80]

    punctuations = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'
    myclassname = "".join(ch for ch in myclassname if ch not in punctuations)

    print(myclassname)
    file_name = myclassname + '.' + extension
    file_name = os.path.join(var_path, file_name)

    writeTofile(blob_data, file_name)

    return render_template('redirect.html', myclassname=myclassname)


# Start the app's server only when this file is executed as a script,
# not when it is imported by another module.
if __name__ == '__main__':
    app.run()

main.py

In [None]:
import sqlite3
import os
import pickle
from auto_tagging_script import SubjectTrigramTagger
from auto_tagging_script import AutoTags

from final_script_fulldb import load_word_embeddings, cleaning_for_summarization, get_summary
from final_script_fulldb import PreProcess, valid_extensions
from ready_for_search import *
print('imported')

def convertToBinaryData(file):
    """Return the complete contents of the file at path ``file`` as bytes."""
    with open(file, 'rb') as handle:
        return handle.read()


def insert_data_to_database(doc_id, title, text, file, extension, summary, auto_tags, manual_tags, svos):
    """Insert one document into the three SQLite tables, then refresh the search files.

    Rows are written to ``document_info``, ``document_summary`` and
    ``document_tags`` within a single transaction. After a successful commit
    ``maintaining_all_files()`` is called to update the files used for search.

    Args:
        doc_id: identifier stored in all three tables.
        title: document title.
        text: extracted document text.
        file: path of the file whose bytes are stored in the ``document`` column.
        extension: file extension stored next to the blob.
        summary: summary text.
        auto_tags, manual_tags, svos: values for the ``document_tags`` row.

    Raises:
        Exception: when SQLite reports an error; the original
            ``sqlite3.Error`` is attached as the cause.
    """
    # Bound before the try so the handlers below never see an unbound name
    # when sqlite3.connect itself fails.
    conn = None
    try:
        conn = sqlite3.connect(r"DataBase/Document_finder_db2.db")
        c = conn.cursor()

        sqlite_insert_blob_query1 = """ INSERT INTO document_info
                                              (doc_id, title, text, document,extension) VALUES (?, ?, ?, ?, ?)"""

        document = convertToBinaryData(file)
        # Convert data into tuple format
        data_tuple1 = (doc_id, title, text, document, extension)
        c.execute(sqlite_insert_blob_query1, data_tuple1)

        sqlite_insert_blob_query2 = """ INSERT INTO document_summary
                                              (doc_id, summary) VALUES (?, ?)"""

        # Convert data into tuple format
        data_tuple2 = (doc_id, summary)
        c.execute(sqlite_insert_blob_query2, data_tuple2)

        sqlite_insert_blob_query3 = """ INSERT INTO document_tags
                                          (doc_id, title, auto_tags, manual_tags,svos) VALUES (?, ?, ?, ?, ?)"""

        # Convert data into tuple format
        data_tuple3 = (doc_id, title, auto_tags, manual_tags, svos)
        c.execute(sqlite_insert_blob_query3, data_tuple3)

        conn.commit()
        print("file and data inserted successfully into a table")

    except sqlite3.Error as error:
        if conn is not None:
            conn.rollback()
        print("Failed to insert data into sqlite table", error)
        raise Exception("Failed to insert data into sqlite table") from error
    finally:
        if conn is not None:
            conn.close()

    # Runs only after a successful commit and with the connection closed, so
    # an error raised here is no longer answered with a rollback on a closed
    # connection.
    # call maintaining_all_files() fn for updating all files for search.
    maintaining_all_files()

def get_last_inserted_rowid():
    """Return the largest ``rowid`` in ``document_info``.

    Returns:
        The maximum rowid, or ``None`` when the table is empty or the
        database cannot be queried (a message is printed in the latter case).
    """
    conn = None
    try:
        conn = sqlite3.connect(r"DataBase/Document_finder_db2.db")
        row = conn.execute('''SELECT MAX(rowid) FROM document_info''').fetchone()
        return row[0]
    except sqlite3.Error:
        print('Cannot access the database right now')
        return None
    finally:
        # Close on every path; the original leaked the connection whenever
        # the query raised after a successful connect.
        if conn is not None:
            conn.close()

def main(file_upload, title):
    """Extract, summarise, tag and store the document at ``file_upload``.

    The pickled word embeddings are loaded into the module-level
    ``word_embeddings``. If ``PreProcess`` accepts the file extension, the
    text is extracted with the reader matching that extension, cleaned of
    escape sequences, summarised and auto-tagged, and everything is handed to
    ``insert_data_to_database`` under ``title``. Otherwise
    ``'Invalid Extension'`` is printed and nothing is stored.
    """
    global word_embeddings
    word_embeddings = pickle.load(open(r"word_embeddings.json", "rb"))

    preprocess_obj = PreProcess(file_upload)

    if not preprocess_obj.check_extension():
        print('Invalid Extension')
        return

    extension = preprocess_obj.get_extension()

    # Pick the reader for this extension; anything else is read as plain text.
    if extension == 'docx':
        raw_text = preprocess_obj.get_text_from_docx_document()
    elif extension == 'pptx':
        raw_text = preprocess_obj.get_text_from_pptx_document()
    elif extension == 'pdf':
        raw_text = preprocess_obj.get_text_from_pdf_document()
    else:
        raw_text = preprocess_obj.get_text_from_txt_document()
    text = preprocess_obj.remove_escape_sequences(raw_text)

    # NOTE(review): get_last_inserted_rowid() returns None for an empty or
    # unreachable database, in which case the addition below raises TypeError.
    doc_id = f'news_{get_last_inserted_rowid()+15}'                # doc_id = 'news_5782'
    print(doc_id)
    print(text)

    summary = get_summary(text, word_embeddings)
    print(summary)

    manual_tags = ""
    tagger = AutoTags()
    auto_tags, svos = tagger.get_auto_tags_from_document(text, doc_id)

    assert type(auto_tags) == type(svos) == str, r"tags cannot be inserted into table as its data type doesn't match the database's data type"

    print(auto_tags)
    print(manual_tags)

    insert_data_to_database(doc_id, title, text, file_upload, extension, summary, auto_tags, manual_tags, svos)

# main()

Document Similarity

In [None]:
from gensim.models import LdaModel
from gensim import models, similarities
import numpy as np
from gensim.corpora import Dictionary
import time
import pickle
from scipy.stats import entropy
from scipy.spatial.distance import jensenshannon

print("Imported")
# Tokenised corpus loaded at import time and used by the training code below.
# NOTE(review): this path is spelled "Database\..." with a backslash, while the
# rest of the file reads "DataBase/..." — confirm it resolves on the target
# platform.
corpus = pickle.load(open(r"Database\corpus_file.pkl", "rb"))


def train_lda(corpus):
    """Fit a 100-topic LDA model on ``corpus`` and report the training time.

    Args:
        corpus: iterable of tokenised documents passed to ``Dictionary``.

    Returns:
        A ``(dictionary, lda_corpus, lda)`` tuple: the token dictionary, the
        bag-of-words form of every document, and the trained model.
    """
    dictionary = Dictionary(corpus)
    lda_corpus = []
    for doc in corpus:
        lda_corpus.append(dictionary.doc2bow(doc))

    started = time.time()

    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    lda = LdaModel(
        corpus=lda_corpus,
        num_topics=100,
        id2word=dictionary,
        alpha=1e-2,
        eta=5e-2,
        chunksize=1000,
        minimum_probability=0.0,
        passes=2,
    )

    finished = time.time()
    print("Time to train LDA model on ", len(corpus), "articles", (finished - started) / 60, "min")
    return dictionary, lda_corpus, lda


def jensen_shannon(query, matrix):
    """Return the Jensen-Shannon distance from ``query`` to each row of ``matrix``.

    ``query`` is turned into a column and ``matrix`` is transposed so that
    ``scipy.spatial.distance.jensenshannon`` broadcasts the query against
    every row; both intermediate shapes are printed.
    """
    query_column = query[:, np.newaxis]  # (topics,) -> (topics, 1)
    print(query_column.shape)

    documents_by_column = matrix.T  # (topics, documents)
    print(documents_by_column.shape)

    distances = jensenshannon(query_column, documents_by_column)
    return distances


def execute_training_of_lda(corpus):
    """Train the LDA model on ``corpus`` and persist its artefacts.

    Writes the dictionary to ``LdaModel/dictionary.pkl``, the per-document
    topic distribution to ``LdaModel/doc_topic_distribution.pkl`` and the
    model to ``LdaModel/model``. Timing information is printed.
    """
    dictionary, lda_corpus, lda = train_lda(corpus)

    t3 = time.time()
    # One row per document, one column per (topic id, probability) pair
    # returned by the model.
    doc_topic_dist = np.array([[tup[1] for tup in lst] for lst in lda[lda_corpus]])
    t4 = time.time()

    print("Time to get topic distribution", (t4 - t3) / 60, "min")

    # Context managers flush and close the files; the bare open() calls used
    # before left that to garbage collection.
    with open(r'LdaModel/dictionary.pkl', "wb") as handle:
        pickle.dump(dictionary, handle)
    with open(r'LdaModel/doc_topic_distribution.pkl', "wb") as handle:
        pickle.dump(doc_topic_dist, handle)
    lda.save(r'LdaModel/model')
    print("All files are ready----")

def get_most_similar_documents(query,matrix,k=10):
    """
    Compute the Jensen-Shannon distances between ``query`` and ``matrix``
    and return the positional indices of the ``k`` smallest ones.
    The largest distance and the ten largest distances are printed.
    """
    distances = jensen_shannon(query, matrix)    # array of Jensen-Shannon distances
    print(max(distances))
    largest_first = sorted(distances, reverse=True)
    print(largest_first[:10])
    ranking = distances.argsort()   # ascending: closest documents first
    return ranking[:k]

def get_similar_documents(doc_corpus):
    """Return the indices of the stored documents most similar to ``doc_corpus``.

    The saved dictionary, document/topic matrix and LDA model are loaded from
    ``LdaModel/``; the tokens in ``doc_corpus`` are converted to a topic
    distribution and ranked with ``get_most_similar_documents``.

    Args:
        doc_corpus: list of tokens of the query document.

    Returns:
        Whatever ``get_most_similar_documents`` returns for the query's topic
        distribution (its default ``k`` is used).
    """
    # Files are opened with context managers so the handles are closed; the
    # unused load of the title list was dropped.
    with open(r"LdaModel/dictionary.pkl", "rb") as handle:
        dictionary = pickle.load(handle)
    with open(r"LdaModel/doc_topic_distribution.pkl", "rb") as handle:
        doc_topic_dist = pickle.load(handle)
    lda = LdaModel.load(r"LdaModel/model")

    bow = dictionary.doc2bow(doc_corpus)
    doc_distribution = np.array([tup[1] for tup in lda.get_document_topics(bow=bow)])

    most_sim_ids = get_most_similar_documents(doc_distribution, doc_topic_dist)
    return most_sim_ids


# if __name__ == '__main__':
#
#     # t1 = time.time()
#     # print("Starting training...")
#     # execute_training_of_lda(corpus)
#     # t2 = time.time()
#     # print("Total time taken: ", (t2-t1)/60, "min")
#
#     doc_corpus = corpus[78]
#     get_similar_documents(doc_corpus)

check.py

In [None]:
# Read the element count, then the whitespace-separated integers, from stdin.
n = int(input())
arr = list(map(int, input().split()))


def func(n, arr):
    """Try to sort ``arr`` in place with adjacent swaps, counting them.

    Working from the last position down to index 2, the value ``i + 1`` must
    already sit at index ``i``, ``i - 1`` or ``i - 2``; it is moved into place
    with zero, one or two adjacent swaps. The first two positions are then
    settled with at most one more swap. The number of swaps performed is left
    in the module-level ``swaps``.

    Args:
        n: number of elements to consider.
        arr: list of integers, modified in place.

    Returns:
        True when every value could be placed this way, False otherwise.
    """
    global swaps
    swaps = 0

    # With fewer than two elements there is nothing to swap; the original
    # indexed arr[1] unconditionally and raised IndexError here.
    if n < 2:
        return arr[:n] == list(range(1, n + 1))

    for i in range(n-1, 1, -1):
        if arr[i] == i + 1:
            continue
        elif arr[i - 1] == i + 1:
            arr[i], arr[i - 1] = arr[i - 1], arr[i]
            swaps += 1
        elif arr[i - 2] == i + 1:
            arr[i - 1], arr[i - 2] = arr[i - 2], arr[i - 1]
            arr[i], arr[i - 1] = arr[i - 1], arr[i]
            swaps += 2
        else:
            return False

    if arr[1] == 2:
        return True
    elif arr[0] == 2 and arr[1] == 1:
        swaps += 1
        # Actually exchange the pair; the original assigned each element to
        # itself, so the swap was counted but never performed.
        arr[0], arr[1] = arr[1], arr[0]
        return True
    else:
        return False

# Report whether the input could be arranged and, if so, the swap count
# that func left in the module-level `swaps`.
if func(n, arr):
    print("YES")
    print(swaps)
else:
    print("NO")

Project.py

In [None]:
from tkinter import filedialog
import tkinter as tk
import tkinter.messagebox
from app import *
from main import *
import multiprocessing


def on_click(event):
    """Enable the ``filepath`` entry and clear its contents on the first click."""
    entry = filepath
    entry.configure(state=tk.NORMAL)
    entry.delete(0, tk.END)

    # Unbind this handler so the entry is cleared only once.
    entry.unbind('<Button-1>', on_click_id)


def first_window():
    """Build the landing window and enter the Tk main loop.

    The window shows a heading and an introductory paragraph in a text
    widget, a prompt label, and two buttons: "Upload file" (calls
    ``upload_window``) and "Search file" (calls ``main_func``). The root
    window is stored in the module-level ``window0``. The call blocks in
    ``mainloop()``.
    """
    global window0
    window0 = tk.Tk()
    # Text widget for the heading and description, wired to a scrollbar.
    text2 = tk.Text(window0, height=25, width=150,)
    scroll = tk.Scrollbar(window0, command=text2.yview)
    text2.configure(yscrollcommand=scroll.set)
    # Named text tags; only 'big' and 'color' are applied below.
    text2.tag_configure('bold_italics', font=('Arial', 20, 'bold', 'italic'))
    text2.tag_configure('big', font=('Verdana', 20, 'bold'))
    text2.tag_configure('color',
                        foreground='#476042',
                        font=('Tempus Sans ITC', 12, 'bold'))


    text2.insert(tk.END, '\nDocument Search Manager\n', 'big')
    quote = """
    How easy do you find it to remember the exact location of a document that you created last year?
    Not very easy, right?
    Big Organizations/people deal with hundreds of documents daily and forget about them, most of the time. 
    But what if we want that old documentation again for some work,
    but unfortunately you do not remember the name or the actual content of that document to retrieve it from the large storage of your computer. 
    In such cases, use of a Intelligent document finder can really make a huge difference.
    """
    text2.insert(tk.END, quote, 'color')
    labelfont = ('times', 40, 'bold')
    text2.config(bg='black', fg='yellow')
    text2.config(font=labelfont)
    # Overrides the height/width given when the widget was created.
    text2.config(height=10, width=40)
    text2.pack(expand=True, fill=tk.BOTH)

    # NOTE(review): text2 is packed a second time with different options;
    # presumably the later call adjusts the earlier one — confirm intent.
    text2.pack(fill=tk.BOTH, anchor="e")
    scroll.pack(fill=tk.Y)

    # Prompt shown between the description and the two buttons.
    contentfont = ('times', 20, 'bold')
    contentframe = tk.Label(window0, text="Do you want to upload a file or search for a file?")
    contentframe.config(font=contentfont)
    contentframe.config(bg='black', fg='aqua')
    contentframe.pack(expand=True, fill=tk.BOTH)
    contentframe.pack()

    # Left button: opens the upload window.
    uploadbutton=tk.Button(window0, text="Upload file", relief=tk.GROOVE, borderwidth=5, command=upload_window)
    uploadfont = ('times', 25, 'bold')
    uploadbutton.config(font=uploadfont)
    uploadbutton.config(bg='black', fg='aqua')
    uploadbutton.pack(side= tk.LEFT, expand=True, fill=tk.BOTH, padx=15, pady=15)

    # Right button: calls main_func (defined outside this block).
    searchbutton=tk.Button(window0, text="Search file", relief=tk.GROOVE, borderwidth=5, command=main_func)
    searchfont = ('times', 25, 'bold')
    searchbutton.config(font=searchfont)
    searchbutton.config(bg='black', fg='aqua')
    searchbutton.pack(side=tk.RIGHT, expand=True, fill=tk.BOTH, padx=15, pady=15)

    window0.mainloop()
def print_path():
    """Ask the user for a file and remember its path and title.

    The chosen path is stored in the module-level ``f``; the file name is
    inserted at the start of the ``fileupload`` entry, and the name without
    its last extension is stored in the module-level ``title``. Both values
    are printed.
    """
    global title, f
    selectable_types = [('txt files', '.txt'),
                        ('all files', '*')]
    f = tk.filedialog.askopenfilename(
        parent=window,
        initialdir='C:/Tutorial',
        title='Choose file',
        filetypes=selectable_types,
    )

    filename = os.path.basename(f)

    # The entry is kept read-only, so unlock it just long enough to insert.
    fileupload.config(state=tk.NORMAL)
    fileupload.insert(0, filename)
    fileupload.config(state=tk.DISABLED)
    print(f)

    # Everything before the last dot; empty when the name contains no dot.
    title = filename.rpartition('.')[0]
    print(title)




def delete_path():
    """Empty the read-only ``fileupload`` entry."""
    entry = fileupload
    entry.config(state=tk.NORMAL)      # unlock so the text can be removed
    entry.delete(0, tk.END)
    entry.config(state=tk.DISABLED)    # lock it again



def upload_file():
    """Validate the working directory and chosen file, then index the file.

    An error dialog is shown for a non-existent directory, a missing file, an
    unsupported extension, or when no file has been chosen yet. When every
    check passes, the directory is written to ``path.txt``, the file is
    processed with ``main(f, title)`` and a success dialog is shown.
    """
    var_path = filepath.get()

    ready = os.path.exists(var_path)
    if not ready:
        tk.messagebox.showerror(title='Path Error', message='Entered Directory does not exists!! Please Enter Valid Path')
        filepath.delete(0, tk.END)

    try:
        if not os.path.exists(f):
            print('f: ', f)
            tk.messagebox.showerror(title='File Not Found', message='Please Upload Correct File on System first!! By click on upload file button')
            ready = False
        elif f.split('.')[-1] not in ALLOWED_EXTENSIONS:
            print(f)
            tk.messagebox.showerror(title='Error', message='File Format is not Supported!! Please upload files of only docx,txt,pdf or pptx format')
            ready = False
    except NameError:
        # Raised when the module-level ``f`` has not been assigned yet.
        tk.messagebox.showerror(title='Error',
                                message='You have not choosen any file yet!!')
        ready = False

    if ready:
        with open("path.txt", "w") as file:
            file.write(var_path)

        main(f, title)
        tk.messagebox.showinfo("Successful", "File Successfully uploaded!")


# def main_func2():
#     # p1 = subprocess.run(['python', 'app.py'], shell=True)
#     # p1.terminate()
#     # p1.kill()
#     p2 = multiprocessing.Process(target=main_func())
#     p2.run()
#     # p2.start()
#     # sleep(5)
#     # p2.terminate()
#
#     # p2.join()
#
placeholder = "Enter Working Directory of this System"


def upload_window():
    """Open the "Upload File" dialog as a child of the landing window.

    Creates, in order: an instruction banner, an entry for the working
    directory (pre-filled with ``placeholder``), a "Upload file" button that
    opens the file chooser (``print_path``), a "Remove file" button
    (``delete_path``), a read-only entry showing the chosen file name, and an
    "Upload" button that triggers ``upload_file``. The widgets other callbacks
    need are published as module-level globals (``window``, ``filepath``,
    ``on_click_id``, ``fileupload``).
    """
    global window
    window = tk.Toplevel(window0)
    # window = tk.Tk()
    # window0.withdraw()
    window.title("Upload File")

    # Instruction banner at the top of the dialog.
    titleframe = tk.Label(window, text="Click on the Upload file button and browse the file that you want to upload.",
                          relief=tk.RAISED)
    labelfont = ('times', 20, 'bold')
    titleframe.config(bg='black', fg='yellow')
    titleframe.config(font=labelfont)
    titleframe.config(height=10, width=80)
    titleframe.pack(expand=True, fill=tk.BOTH)
    titleframe.pack()

    fp = tk.Frame(window)
    fp.pack()

    # Working-directory entry: starts disabled with the placeholder text and
    # is enabled and cleared by on_click on the first mouse click.
    global filepath, on_click_id, var_path
    filepath = tk.Entry(fp, font=("Times New Roman", 15, "bold"), justify='center')
    filepath.config(width=120, bd=5, relief=tk.SUNKEN)
    filepath.insert(0, placeholder)
    filepath.config(state=tk.DISABLED)

    # Keep the binding id so on_click can unbind itself.
    on_click_id = filepath.bind('<Button-1>', on_click)
    # path = filepath.get()
    filepath.pack(side=tk.LEFT)

    # var_path = path

    buttons = tk.Frame(window)
    buttons.pack(side=tk.LEFT)

    # Opens the file chooser.
    uploadbutton2 = tk.Button(buttons, text='Upload file', command=print_path, width=20)
    buttonfont = ('times', 25, 'bold')
    uploadbutton2.config(font=buttonfont)
    uploadbutton2.config(bg='black', fg='aqua')
    uploadbutton2.pack(expand=True, fill=tk.BOTH, padx=15, pady=15)

    # Clears the displayed file name.
    removefile = tk.Button(buttons, text="Remove file", command=delete_path, relief=tk.GROOVE, borderwidth=5, width=20)
    removefont = ('times', 25, 'bold')
    removefile.config(font=removefont)
    removefile.config(bg='black', fg='aqua')
    removefile.pack(side=tk.LEFT, expand=True, fill=tk.BOTH, padx=15, pady=15)

    upent = tk.Frame(window)
    upent.pack()
    # Read-only entry that print_path fills with the chosen file name.
    global fileupload
    fileupload = tk.Entry(upent, width=60, bd=5, font=("Times New Roman", 14, "bold"), justify='center',
                          state=tk.DISABLED)
    fileupload.config(fg='brown')
    fileupload.pack(side=tk.LEFT)

    # Validates the inputs and processes the chosen file.
    upbutton = tk.Button(upent, text="Upload", command=upload_file, relief=tk.GROOVE, borderwidth=5)
    uploadfont2 = ('times', 25, 'bold')
    upbutton.config(font=uploadfont2)
    upbutton.config(bg='black', fg='aqua')
    upbutton.pack(expand=True, fill=tk.BOTH, padx=15, pady=15)

    # window.mainloop()




# p1 = multiprocessing.Process(target=first_window())
# # p2 = multiprocessing.Process(target=app.run)
# #
# p1.start()
# #
# # # p2. join()
# p1.join()
# Script entry point: build the landing window; this call blocks in Tk's main loop.
first_window()
# upload_window()

Data preparation.py

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
from gensim.parsing.preprocessing import STOPWORDS

import re
from nltk.stem import WordNetLemmatizer
import time
from nltk import FreqDist
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

In [None]:
from google.colab import drive

In [None]:
table[table.duplicated(subset=['Title'])]

In [None]:
table[table.Title=="Revenue"]

In [None]:
table.drop_duplicates(subset=['Title'],inplace=True)

In [None]:
table = table[table.Text.map(len)>500]

In [None]:
# confirming
table[table.Text.map(len)<500]

In [None]:
table['Table Table'] = table['Table Table'].map(str)

table.Text = table.Text.str.strip()
table['Table Table'] = table['Table Table'].str.strip()

def initial_clean(text):
    """
    Strip bracketed markers, tokens containing links or e-mail addresses,
    and every non-letter character from ``text``; then lower-case it and
    return the list of word tokens.
    """
    link_or_email = r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)"
    # Applied strictly in this order; each match is replaced by one space.
    substitutions = (
        ('[[a-zA-Z0-9]+]|[\n[\nedit\n]+]', ' '),
        (link_or_email, " "),
        ("[^a-zA-Z]", " "),
    )
    for regex, replacement in substitutions:
        text = re.sub(regex, replacement, text)
    return nltk.word_tokenize(text.lower())

stop_words = STOPWORDS

def remove_stop_words(text):
    '''
    Return the tokens of ``text`` that are not in the module-level ``stop_words``
    '''
    kept = []
    for word in text:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

def lemmatize(text):
    '''
    Lemmatize every token longer than two characters, first with the
    default part of speech and then as a verb, and return the results
    in their original order.
    '''
    # One lemmatizer for the whole call; the original built a new
    # WordNetLemmatizer for every single word, twice.
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(lemmatizer.lemmatize(word), pos='v')
        for word in text
        if len(word) > 2
    ]

def apply_all(text):
    """Clean, tokenize, stop-word-filter and lemmatize ``text``, in that order."""
    tokens = initial_clean(text)
    content_tokens = remove_stop_words(tokens)
    return lemmatize(content_tokens)

In [None]:
# Clean the 'Text' and 'Table Table' columns with apply_all and store the
# combined result in a new "tokenized" column (the two per-row token lists
# are presumably concatenated by `+` — confirm), then report the elapsed time.
t1 = time.time()
table['tokenized'] = table['Text'].apply(apply_all) + table['Table Table'].apply(apply_all)
t2 = time.time()
print("Time to clean and tokenize", len(table), "articles:", (t2-t1)/60, "min")

In [None]:
# list of all words
all_words = [word for item in list(table['tokenized']) for word in item]
# use nltk fdist to get a frequency distribution of all words
fdist = FreqDist(all_words)
len(fdist) # number of unique words

In [None]:
import pickle

pickle.dump(fdist,open("fdist_file2.pkl","wb"))

fdist

In [None]:
k = 100000
top_k_words = fdist.most_common(k)
top_k_words[-10:]

In [None]:
# function to keep only top k words
top_k_words,_ = zip(*fdist.most_common(k))
top_k_words = set(top_k_words)

def keep_top_k_words(text):
    """Return the tokens of ``text`` that are in the module-level ``top_k_words``."""
    kept = []
    for word in text:
        if word in top_k_words:
            kept.append(word)
    return kept

table['tokenized'] = table['tokenized'].apply(keep_top_k_words)

# document length
table['doc_len'] = table['tokenized'].apply(lambda x: len(x))
doc_lengths = list(table['doc_len'])
table.drop(labels='doc_len', axis=1, inplace=True)

print("length of list:",len(doc_lengths),
      "\naverage document length", np.average(doc_lengths),
      "\nminimum document length", min(doc_lengths),
      "\nmaximum document length", max(doc_lengths))

In [None]:
corpus = list(table['tokenized'])

pickle.dump(corpus,open("wiki_corpus_file.pkl","wb"))

In [None]:
wiki_title_file = list(table["Title"])
wiki_text_file = list(table["Text"])

wiki_title_file[10]

'Severe acute respiratory syndrome coronavirus 2'

pickle.dump(wiki_title_file,open("wiki_title_file.pkl","wb"))
pickle.dump(wiki_text_file,open("wiki_text_file.pkl","wb"))

In [None]:
table = pd.read_csv("wikipedia_articles2.csv")

table.head()

In [None]:
# table[table.duplicated(subset=['Title','Text'])]
table.drop_duplicates(subset=['Title','Text'],inplace=True)


# table[table.Text.map(len)<400]
# print(table.Text[3642])
print(table.URL[4759])
print(table.Text[8851])

In [None]:
print(table.Text[0].strip())

In [None]:
table['Table Text'] = table['Table Text'].map(str)
table['Table Text'] = table['Table Text'].str.strip()

table.Text = table.Text.str.strip()
table.Text = table.Text.map(str)

In [None]:
def initial_clean(text):
    """
    Strip tokens containing links or e-mail addresses, bracketed markers,
    and every non-letter character from ``text``; then lower-case it and
    return the list of word tokens.
    """
    link_or_email = r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)"
    # Applied strictly in this order; each match is replaced by one space.
    substitutions = (
        (link_or_email, " "),
        ('[[a-zA-Z0-9]+]|[\n[\nedit\n]+]', ' '),
        ("[^a-zA-Z]", " "),
    )
    for regex, replacement in substitutions:
        text = re.sub(regex, replacement, text)
    return nltk.word_tokenize(text.lower())

stop_words = STOPWORDS

def remove_stop_words(text):
    '''
    Return the tokens of ``text`` that are not in the module-level ``stop_words``
    '''
    kept = []
    for word in text:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

def lemmatize(text):
    '''
    Lemmatize every token longer than two characters, first with the
    default part of speech and then as a verb, and return the results
    in their original order.
    '''
    # One lemmatizer for the whole call; the original built a new
    # WordNetLemmatizer for every single word, twice.
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(lemmatizer.lemmatize(word), pos='v')
        for word in text
        if len(word) > 2
    ]

def apply_all(text):
    """Clean, tokenize, stop-word-filter and lemmatize ``text``, in that order."""
    tokens = initial_clean(text)
    content_tokens = remove_stop_words(tokens)
    return lemmatize(content_tokens)

In [None]:
# Clean the 'Text' and 'Table Table' columns with apply_all and store the
# combined result in a new "tokenized" column, then report the elapsed time.
# NOTE(review): the preceding cells for this dataset normalise a column named
# 'Table Text', while this cell reads 'Table Table' — confirm which column the
# CSV actually contains.
t1 = time.time()
table['tokenized'] = table['Text'].apply(apply_all) + table['Table Table'].apply(apply_all)
t2 = time.time()
print("Time to clean and tokenize", len(table), "articles:", (t2-t1)/60, "min")