In [71]:
class Tokenizer:
    """Lower-case a text, strip punctuation and split it into word tokens.

    Args:
        text: Raw document to tokenize.
        RemoveStopWords: When true, every word found in ``stop_words`` is
            dropped from the resulting token list.

    Attributes:
        punctuations: Characters that are replaced by a single space.
        stop_words: Words removed when ``RemoveStopWords`` is true.
        tokens: Result of :meth:`preprocess`, computed at construction time.
    """

    def __init__(self, text, RemoveStopWords=True):
        self.text = text
        self.RemoveStopWords = RemoveStopWords
        self.punctuations = ['.', '!', '?', '\'', '\"', ',', ':', ';',
                             '(', ')', '[', ']', '<', '>', '\n']
        # Includes contraction fragments ('s', 't', 'll', 'd', 'm', 're', 'don')
        # because apostrophes are turned into spaces during preprocessing.
        self.stop_words = [
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
            'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
            'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
            'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
            'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
            'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
            'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
            'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
            'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
            'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
            'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should',
            'now', 'm', 're', 'would', 'd', 'll',
        ]
        self.tokens = self.preprocess()

    def preprocess(self):
        """Return the list of tokens extracted from ``self.text``.

        Returns:
            Lower-cased words in their original order, without punctuation and,
            if ``RemoveStopWords`` is set, without stop words.
        """
        text = self.text.lower()
        # Hyphens are deleted (not replaced by a space), so "pre-select"
        # becomes the single token "preselect".
        text = text.replace('-', '')
        for p in self.punctuations:
            text = text.replace(p, ' ')

        # split() without an argument splits on any run of whitespace (tabs,
        # carriage returns, ...) and never yields empty strings, unlike
        # split(' ') which only recognises the space character.
        words = text.split()

        if self.RemoveStopWords:
            # Set lookup instead of scanning the stop-word list for every word.
            stop_words = set(self.stop_words)
            return [w for w in words if w not in stop_words]
        return words
    

class Vectorizer:
    """Bag-of-words vectorizer built on top of ``Tokenizer``.

    Args:
        RemoveStopWords: Forwarded to ``Tokenizer`` for every document.

    Attributes:
        vocabulary_: Maps each word to its total number of occurrences
            across all fitted documents.
        dictionary_: Maps each word to its column index in the vectors.
        cv: Count vectors produced by the last ``get_CountVectorizer`` call.
        tfidf: Weighted vectors produced by the last ``get_TFIDF`` call.
    """

    def __init__(self, RemoveStopWords=True):
        self.RemoveStopWords = RemoveStopWords
        self.vocabulary_ = {}
        self.dictionary_ = {}
        self.cv = []
        self.tfidf = []
        # Defined up front so the get_* methods return [] rather than raising
        # AttributeError when called before fit().
        self.preprocess_docs = []
        self.total_words = 0
        self.total_docs = 0

    def fit(self, docs):
        """Tokenize ``docs`` and build the vocabulary and column index.

        Args:
            docs: Iterable of raw text documents.

        Returns:
            ``self``, so calls can be chained.
        """
        self.preprocess_docs = [
            Tokenizer(doc, RemoveStopWords=self.RemoveStopWords).tokens
            for doc in docs
        ]

        # Start from a clean slate so that fitting again does not add the new
        # counts on top of those from a previous fit.
        self.vocabulary_ = {}
        for doc in self.preprocess_docs:
            for word in doc:
                self.vocabulary_[word] = self.vocabulary_.get(word, 0) + 1

        self.dictionary_ = {w: i for i, w in enumerate(self.vocabulary_)}
        self.total_words = len(self.vocabulary_)
        self.total_docs = len(self.preprocess_docs)
        self.cv = []
        self.tfidf = []
        return self

    def get_CountVectorizer(self):
        """Return one term-count vector per fitted document.

        The result is rebuilt on every call, so repeated calls (including the
        one made internally by ``get_TFIDF``) do not duplicate rows.

        Returns:
            List of lists; row ``i`` holds the word counts of document ``i``
            in ``dictionary_`` column order.
        """
        vectors = []
        for doc in self.preprocess_docs:
            row = [0] * self.total_words
            for w in doc:
                row[self.dictionary_[w]] += 1
            vectors.append(row)
        self.cv = vectors
        return self.cv

    def get_TFIDF(self):
        """Return one corpus-weighted vector per fitted document.

        NOTE(review): this is not the textbook tf-idf. Every occurrence of a
        word adds ``count_in_doc / count_in_corpus``, so a word occurring
        ``c`` times in a document ends up with ``c * c / count_in_corpus``.
        The formula is kept as is so existing scores stay reproducible.

        Returns:
            List of lists with the same shape as ``get_CountVectorizer()``.
        """
        counts = self.get_CountVectorizer()
        weighted = []
        for doc, row in zip(self.preprocess_docs, counts):
            weights = [0] * self.total_words
            for word in doc:
                j = self.dictionary_[word]
                weights[j] += row[j] / self.vocabulary_[word]
            weighted.append(weights)
        self.tfidf = weighted
        return self.tfidf

    
class TextSimilarity:
    """Compute pairwise similarity scores between a list of documents.

    Args:
        docs: Sequence of raw text documents.
        RemoveStopWords: Forwarded to ``Vectorizer``.
    """

    def __init__(self, docs, RemoveStopWords=True):
        self.docs = docs
        self.RemoveStopWords = RemoveStopWords

    def get_cosin_similarity(self, v1, v2):
        """Return the cosine similarity ``dot(v1, v2) / (|v1| * |v2|)``.

        The vectors are expected to have the same length; components are
        paired positionally and unpaired trailing components are ignored.

        Returns:
            The cosine of the angle between the vectors, or ``0.0`` when
            either vector has zero norm (e.g. an empty document), for which
            the cosine is undefined.
        """
        v1_dot_v2 = 0
        norm_v1 = 0
        norm_v2 = 0

        for a, b in zip(v1, v2):
            v1_dot_v2 += a * b
            norm_v1 += a ** 2
            norm_v2 += b ** 2

        denominator = (norm_v1 * norm_v2) ** 0.5
        if denominator == 0:
            # Avoid ZeroDivisionError for all-zero vectors.
            return 0.0
        return v1_dot_v2 / denominator

    def get_pairwise_similarity(self, v1, v2):
        """Return the plain dot product of ``v1`` and ``v2``."""
        return sum(a * b for a, b in zip(v1, v2))

    def get_similarities(self):
        """Score every unordered pair of documents.

        Returns:
            A tuple ``(cosin_sims, pairwise_sims)`` of dicts keyed by
            ``'Text{i}-Text{j}'`` with ``i < j``. The first holds cosine
            similarities of the count vectors, the second holds dot products
            of the weighted vectors from ``Vectorizer.get_TFIDF``.
        """
        countVectorizer = Vectorizer(RemoveStopWords=self.RemoveStopWords).fit(self.docs)
        count_vec = countVectorizer.get_CountVectorizer()
        tfidf_vec = countVectorizer.get_TFIDF()

        cosin_sims = {}
        pairwise_sims = {}
        for i in range(countVectorizer.total_docs):
            for j in range(i + 1, countVectorizer.total_docs):
                key = f'Text{i}-Text{j}'
                cosin_sims[key] = self.get_cosin_similarity(count_vec[i], count_vec[j])
                pairwise_sims[key] = self.get_pairwise_similarity(tfidf_vec[i], tfidf_vec[j])

        return cosin_sims, pairwise_sims
    
                
        
    
    
        

In [76]:
# Sample documents for the similarity experiments. They are defined in
# reverse order (text3 first) but collected as [text1, text2, text3] below,
# so result keys 'Text0', 'Text1', 'Text2' refer to text1, text2, text3.
text3 = """We are always looking for opportunities for you to earn more points, 
which is why we also give you a selection of Special Offers. These Special Offers are 
opportunities to earn bonus points on top of the regular points you earn every time you purchase a 
participating brand. No need to pre-select these offers, we'll give you the points whether or not 
you knew about the offer. We just think it is easier that way.

"""

# text2 and text1 are near-duplicates that differ only in a few words.
text2 = """The easiest way to earn points with Fetch Rewards is to just shop for the items you already buy. 
If you have any eligible brands on your receipt, you will get points based on the total cost of the products. 
You do not need to cut out any coupons or scan individual UPCs. Just scan your receipt after you check out 
and we will find the savings for you.

"""

text1 = """The easiest way to earn points with Fetch Rewards is to just shop for the products you already love. 
If you have any participating brands on your receipt, you'll get points based on the cost of the products. 
You don't need to clip any coupons or scan individual barcodes. Just scan each grocery receipt after you shop 
and we'll find the savings for you.

"""

# Index in this list determines the 'Text{i}' label used in the result dicts.
docs = [text1, text2, text3]

In [77]:
# Uses the two-result TextSimilarity defined above: cos_sim holds cosine
# similarities of the count vectors, pair_sim holds dot products of the
# weighted vectors, both keyed by 'Text{i}-Text{j}'.
cos_sim, pair_sim = TextSimilarity(docs, RemoveStopWords=True).get_similarities()

In [79]:
# Show the dot-product scores; these are unnormalised, so values above 1 are expected.
pair_sim

{'Text0-Text1': 6.401111111111111,
 'Text0-Text2': 1.8322222222222222,
 'Text1-Text2': 1.5822222222222222}

In [58]:
# Reference example: tf-idf vectors and pairwise similarities computed with
# scikit-learn on a small five-sentence corpus.
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ["I'd like an apple", 
          "An apple a day keeps the doctor away", 
          "Never compare an apple to an orange", 
          "I prefer scikit-learn to Orange", 
          "The scikit-learn docs are Orange and Blue"]
# stop_words="english" selects scikit-learn's built-in English stop-word list.
vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(corpus)
# Product of the document-term matrix with its transpose gives a
# document-by-document score matrix. The printed diagonal is ~1.0, so the rows
# behave as unit vectors and the entries as cosine similarities.
# NOTE(review): relies on `*` meaning matrix multiplication for the sparse
# type returned by fit_transform - confirm for the installed scipy version.
pairwise_similarity = tfidf*tfidf.T

In [61]:
# (number of documents, number of distinct terms kept by the vectorizer)
tfidf.shape

(5, 13)

In [60]:
# Prints only the non-zero entries as "(document index, term index)  weight".
print(tfidf)

  (0, 9)	0.830880748357988
  (0, 0)	0.5564505207186616
  (1, 0)	0.31752680284846835
  (1, 4)	0.4741246485558491
  (1, 7)	0.4741246485558491
  (1, 6)	0.4741246485558491
  (1, 1)	0.4741246485558491
  (2, 0)	0.48624041659157047
  (2, 3)	0.7260444301457811
  (2, 10)	0.48624041659157047
  (3, 10)	0.40382592962643526
  (3, 11)	0.6029847724484662
  (3, 12)	0.4864843177105593
  (3, 8)	0.4864843177105593
  (4, 10)	0.345821664219199
  (4, 12)	0.4166072657167828
  (4, 8)	0.4166072657167828
  (4, 5)	0.5163739676148649
  (4, 2)	0.5163739676148649


In [59]:
# Prints the non-zero document-pair scores as "(doc i, doc j)  similarity".
print(pairwise_similarity)


  (0, 2)	0.27056873300683837
  (0, 1)	0.17668795478716204
  (0, 0)	0.9999999999999998
  (1, 2)	0.1543943648960287
  (1, 1)	0.9999999999999999
  (1, 0)	0.17668795478716204
  (2, 4)	0.16815247007633352
  (2, 3)	0.1963564882520361
  (2, 2)	1.0
  (2, 1)	0.1543943648960287
  (2, 0)	0.27056873300683837
  (3, 4)	0.5449975578692605
  (3, 3)	0.9999999999999999
  (3, 2)	0.1963564882520361
  (4, 4)	0.9999999999999996
  (4, 3)	0.5449975578692605
  (4, 2)	0.16815247007633352


In [38]:
# NOTE: this call passes use_tfidf and expects a single return value, so it
# only works with the TextSimilarity variant whose __init__ accepts use_tfidf
# (the later definition in this file); the earlier two-result variant does
# not take that keyword.
sims = TextSimilarity(docs, use_tfidf=False, RemoveStopWords=False).get_similarities()

In [82]:
# Quick check that the empty string is falsy.
bool('')

False

In [109]:
class Tokenizer:
    """Lower-case a text, strip punctuation and split it into word tokens.

    Args:
        text: Raw document to tokenize.
        RemoveStopWords: When true, every word found in ``stop_words`` is
            dropped from the resulting token list.

    Attributes:
        punctuations: Characters that are replaced by a single space.
        stop_words: Words removed when ``RemoveStopWords`` is true.
        tokens: Result of :meth:`preprocess`, computed at construction time.
        total_words: Number of words in the text before stop-word removal.
        stopWords: Number of those words that are stop words.
    """

    def __init__(self, text, RemoveStopWords=True):
        self.text = text
        self.RemoveStopWords = RemoveStopWords
        self.punctuations = ['.', '!', '?', '\'', '\"',
                             ',', ':', ';', '(', ')', '[', ']', '<', '>', '\n']
        # Includes contraction fragments ('s', 't', 'll', 'd', 'm', 're', 'don')
        # because apostrophes are turned into spaces during preprocessing.
        self.stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
                           'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
                           'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
                           'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
                           'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
                           'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
                           'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
                           'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
                           'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
                           'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
                           'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should',
                           'now', 'm', 're', 'would', 'd', 'll']
        self.tokens = self.preprocess()

    def preprocess(self):
        """Tokenize ``self.text`` and record word statistics.

        Sets ``self.total_words`` and ``self.stopWords`` as a side effect.

        Returns:
            Lower-cased words in their original order, without punctuation and,
            if ``RemoveStopWords`` is set, without stop words.
        """
        text = self.text.lower()
        # Hyphens are deleted (not replaced by a space), so "pre-select"
        # becomes the single token "preselect".
        text = text.replace('-', '')
        for p in self.punctuations:
            text = text.replace(p, ' ')

        # split() without an argument splits on any run of whitespace (tabs,
        # carriage returns, ...) and never yields empty strings, unlike
        # split(' ') which only recognises the space character.
        words = text.split()
        # Set lookup instead of scanning the stop-word list for every word.
        stop_words = set(self.stop_words)

        self.total_words = len(words)
        self.stopWords = sum(1 for w in words if w in stop_words)

        if self.RemoveStopWords:
            return [w for w in words if w not in stop_words]
        return words


class Vectorizer:
    """Bag-of-words vectorizer built on top of ``Tokenizer``.

    Args:
        RemoveStopWords: Forwarded to ``Tokenizer`` for every document.

    Attributes:
        vocabulary_: Maps each word to its total number of occurrences
            across all fitted documents.
        dictionary_: Maps each word to its column index in the vectors.
        cv: Count vectors produced by the last ``get_CountVectorizer`` call.
        tfidf: Weighted vectors produced by the last ``get_TFIDF`` call.
    """

    def __init__(self, RemoveStopWords=True):
        self.RemoveStopWords = RemoveStopWords
        self.vocabulary_ = {}
        self.dictionary_ = {}
        self.cv = []
        self.tfidf = []
        # Defined up front so the get_* methods return [] rather than raising
        # AttributeError when called before fit().
        self.preprocess_docs = []
        self.total_words = 0
        self.total_docs = 0

    def fit(self, docs):
        """Tokenize ``docs`` and build the vocabulary and column index.

        Args:
            docs: Iterable of raw text documents.

        Returns:
            ``self``, so calls can be chained.
        """
        self.preprocess_docs = [
            Tokenizer(doc, RemoveStopWords=self.RemoveStopWords).tokens for doc in docs]

        # Rebuild from scratch so that a second fit() does not add its counts
        # on top of those left over from the previous one.
        word_totals = {}
        for doc in self.preprocess_docs:
            for word in doc:
                word_totals[word] = word_totals.get(word, 0) + 1
        self.vocabulary_ = word_totals

        self.dictionary_ = {w: i for i, w in enumerate(self.vocabulary_)}
        self.total_words = len(self.vocabulary_)
        self.total_docs = len(self.preprocess_docs)
        self.cv = []
        self.tfidf = []
        return self

    def get_CountVectorizer(self):
        """Return one term-count vector per fitted document.

        The result is rebuilt on every call, so repeated calls (including the
        one made internally by ``get_TFIDF``) do not duplicate rows.

        Returns:
            List of lists; row ``i`` holds the word counts of document ``i``
            in ``dictionary_`` column order.
        """
        vectors = []
        for doc in self.preprocess_docs:
            row = [0] * self.total_words
            for w in doc:
                row[self.dictionary_[w]] += 1
            vectors.append(row)
        self.cv = vectors
        return self.cv

    def get_TFIDF(self):
        """Return one corpus-weighted vector per fitted document.

        NOTE(review): this is not the textbook tf-idf. Every occurrence of a
        word adds ``count_in_doc / count_in_corpus``, so a word occurring
        ``c`` times in a document ends up with ``c * c / count_in_corpus``.
        The formula is kept as is so existing scores stay reproducible.

        Returns:
            List of lists with the same shape as ``get_CountVectorizer()``.
        """
        counts = self.get_CountVectorizer()
        weighted = []
        for doc, row in zip(self.preprocess_docs, counts):
            weights = [0] * self.total_words
            for word in doc:
                j = self.dictionary_[word]
                weights[j] += row[j] / self.vocabulary_[word]
            weighted.append(weights)
        self.tfidf = weighted
        return self.tfidf


class TextSimilarity:
    """Compute pairwise cosine similarities between a list of documents.

    Args:
        docs: Sequence of raw text documents.
        RemoveStopWords: Forwarded to ``Vectorizer``.
        use_tfidf: When true, compare the weighted vectors from
            ``Vectorizer.get_TFIDF``; otherwise compare raw count vectors.
    """

    def __init__(self, docs, RemoveStopWords=True, use_tfidf=False):
        self.docs = docs
        self.RemoveStopWords = RemoveStopWords
        self.use_tfidf = use_tfidf

    def get_cosin_similarity(self, v1, v2):
        """Return the cosine similarity ``dot(v1, v2) / (|v1| * |v2|)``.

        The vectors are expected to have the same length; components are
        paired positionally and unpaired trailing components are ignored.

        Returns:
            The cosine of the angle between the vectors, or ``0.0`` when
            either vector has zero norm (e.g. an empty document), for which
            the cosine is undefined.
        """
        v1_dot_v2 = 0
        norm_v1 = 0
        norm_v2 = 0

        for a, b in zip(v1, v2):
            v1_dot_v2 += a * b
            norm_v1 += a ** 2
            norm_v2 += b ** 2

        denominator = (norm_v1 * norm_v2) ** 0.5
        if denominator == 0:
            # Avoid ZeroDivisionError for all-zero vectors.
            return 0.0
        return v1_dot_v2 / denominator

    def get_similarities(self):
        """Score every unordered pair of documents.

        Returns:
            Dict keyed by ``'Text{i}-Text{j}'`` (``i < j``, indices into
            ``docs``) whose values are cosine similarities.
        """
        countVectorizer = Vectorizer(
            RemoveStopWords=self.RemoveStopWords).fit(self.docs)
        if self.use_tfidf:
            vec = countVectorizer.get_TFIDF()
        else:
            vec = countVectorizer.get_CountVectorizer()

        similarities = {}
        for i in range(countVectorizer.total_docs):
            for j in range(i + 1, countVectorizer.total_docs):
                similarities[f'Text{i}-Text{j}'] = self.get_cosin_similarity(
                    vec[i], vec[j])
        return similarities


In [110]:
# Compare only the two near-duplicate texts, using raw count vectors
# (use_tfidf=False) with stop words removed. The result is a dict with the
# single key 'Text0-Text1'.
docs = [text1, text2]
sim = TextSimilarity(docs, RemoveStopWords=True,
                     use_tfidf=False).get_similarities()
print(sim)


{'Text0-Text1': 0.8169217288768313}
