In [None]:
import pandas as pd
import textdistance
import json
import re
import unicodedata
import stanza
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the English stanza resources and build the English pipeline object.
# Later cells call `nlp(text)` and read `.sentences`, `.words`, `.lemma` and
# `.upos` from its result.
# NOTE(review): the download call runs every time this cell is executed.
stanza.download('en') 
nlp = stanza.Pipeline('en') 

In [None]:
# 0 <= acceptance <=1 
#the higher ts is, the more similar attr and freqPh are
def text_similarity(similarityObject, attributes, frequentPhrases, acceptance):
    """Score every (attribute, phrase) pair and keep the sufficiently similar ones.

    Parameters:
        similarityObject: object exposing ``normalized_similarity(a, b)``.
        attributes: iterable of attribute strings (outer loop).
        frequentPhrases: iterable of candidate phrases (inner loop).
        acceptance: threshold; a pair is kept only when its score is
            strictly greater than this value.

    Returns:
        A list of ``(attribute, phrase, score)`` tuples, ordered by attribute
        first and phrase second.
    """
    scored_pairs = (
        (attribute, phrase, similarityObject.normalized_similarity(attribute, phrase))
        for attribute in attributes
        for phrase in frequentPhrases
    )
    return [triple for triple in scored_pairs if triple[2] > acceptance]
                
def ngrams(input, maxLength):
    """Return all word n-grams of ``input`` for n = 1 .. ``maxLength``.

    The text is split on single spaces. N-grams are emitted grouped by
    length (all 1-grams first, then 2-grams, ...), each group in
    left-to-right order, and every n-gram is re-joined with a single space.
    Lengths larger than the number of tokens contribute nothing.
    """
    tokens = input.split(' ')
    return [
        ' '.join(tokens[start:start + size])
        for size in range(1, maxLength + 1)
        for start in range(len(tokens) - size + 1)
    ]

In [None]:
# Ontology attributes that every security description is compared against.
investment_attr = [
    "availability",
    "expected return",
    "factors affecting returns",
    "form financing",
    "intermediary platform",
    "regulatory environment",
    "risk",
    "portfolio diversification",
    "time",
]
json_file = r'securities.json' #set the file path of the security descriptions

# Patterns used by the pre-processing cell. They are looked up by index there,
# so the order of this list must stay as it is.
regexes = [
    # [0] a run of one or more spaces
    re.compile(r" +"),
    # [1] an opening or closing tag such as <p class="x"> or </p>
    re.compile(r"</?\w[^>]*>"),
    # [2] any other <...> span left over after [1] (e.g. <!-- ... -->).
    # Bounded with [^>]* instead of a greedy .* so that a match stops at the
    # first '>' and cannot swallow the text between two separate fragments.
    re.compile(r"<[^>]*>")
]

#load the data from the json file (it is turned into Investment objects below)
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, so non-ASCII characters in the descriptions
# could be decoded differently (or fail) from one machine to another.
with open(json_file, encoding='utf-8') as json_data:
    data = json.load(json_data)
    
class Investment:
    """Plain record holding a security's title and its description.

    Both values are stored unchanged; ``description`` is reassigned by the
    later pre-processing and lemmatising cells.
    """

    def __init__(self, title, description):
        self.title, self.description = title, description
        
# One Investment per record of the loaded JSON data, in file order.
investments = [
    Investment(record['title'], record['description'])
    for record in data
]

In [None]:
#pre-process the investment descriptions
for investment in investments:
    raw_description = investment.description
    # The description is joined from a sequence of text fragments. A plain
    # string is taken as-is, because ' '.join() on a string would put a space
    # between every single character.
    if isinstance(raw_description, str):
        text = raw_description
    else:
        text = ' '.join(raw_description)
    #remove html
    text = regexes[1].sub(' ', text)
    text = regexes[2].sub(' ', text)
    #filtering control characters
    # Characters of Unicode category "C*" are removed. Those that act as
    # whitespace (newline, tab, carriage return, ...) are replaced by a space
    # instead of being dropped, otherwise the words on either side of them
    # would be fused into one token.
    kept_chars = []
    for ch in text:
        if unicodedata.category(ch)[0] != "C":
            kept_chars.append(ch)
        elif ch.isspace():
            kept_chars.append(' ')
    text = ''.join(kept_chars)
    #remove multiple whitespaces
    text = regexes[0].sub(' ', text)
    investment.description = text

In [None]:
# Replace each description string by a list with one entry per sentence found
# by the pipeline; an entry is the sentence's non-punctuation lemmas joined by
# single spaces.
for investment in investments:
    parsed = nlp(investment.description)
    investment.description = [
        #swap token.lemma for token.text here to keep the surface forms, and vice versa.
        " ".join(token.lemma for token in parsed_sentence.words if token.upos != "PUNCT")
        for parsed_sentence in parsed.sentences
    ]

In [None]:
#create lemmas from investment attributes from the ontology
# Each attribute is run through the pipeline; every sentence it yields becomes
# one entry made of that sentence's lemmas joined by single spaces.
investment_attr_lemmas = [
    " ".join(token.lemma for token in parsed_sentence.words)
    for attribute in investment_attr
    for parsed_sentence in nlp(attribute).sentences
]
print(investment_attr_lemmas)
    

In [None]:
#for token-based methods, pass qval=None to the constructor to split sentences into words instead of q-grams, e.g. Cosine(qval=None)
#read the manual: https://github.com/life4/textdistance
# Threshold handed to text_similarity (pairs must score strictly above it).
# The output file name below encodes the same value ("059"); keep them in sync.
ACCEPTANCE_THRESHOLD = 0.59
# Longest n-gram (in words) taken from a sentence as a candidate phrase.
MAX_NGRAM_LENGTH = 4
# The algorithm object does not change between sentences, so build it once
# instead of once per sentence.
similarity_algorithm = textdistance.RatcliffObershelp()

result = []
for investment in investments:
    for sentence in investment.description:
        # every ontology attribute against every n-gram of this sentence
        matches = text_similarity(
            similarity_algorithm,
            investment_attr_lemmas,
            ngrams(sentence, MAX_NGRAM_LENGTH),
            ACCEPTANCE_THRESHOLD,
        )
        for attribute, phrase, score in matches:
            result.append([investment.title, sentence, attribute, phrase, score])

# One row per accepted (attribute, phrase) pair, with the sentence it came from.
similarities = pd.DataFrame(result, columns=['investment title', 'surrounding sentence', 'ontology attribute', 'similar pair', 'RatcliffObershelp'])
similarities.to_csv('attr_instrument_similarities_RatcliffObershelp_059.csv')

In [None]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))

# One document per investment: its sentences joined back into a single string.
tfidf_corpus = [' '.join(investment.description) for investment in investments]

vectors = vectorizer.fit_transform(tfidf_corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
# toarray() gives a plain ndarray that DataFrame accepts directly, avoiding
# the np.matrix and nested Python list intermediates.
dense = vectors.toarray()
# Rows are investments (indexed by title), columns are the vectorizer's terms.
df = pd.DataFrame(dense, columns=feature_names, index=[investment.title for investment in investments])
# Keep only the terms whose weight reaches 0.25 for at least one investment.
dfFiltered = df[df.columns[(df >= 0.25).any()]]
dfFiltered.to_csv('instruments_tfidf_vectors_025threshold.csv', sep='|')