In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/text-similarity/Phrases 2.txt
/kaggle/input/phrases-for-calculating-similarity/Phrases.txt


# Measuring similarity between phrases 
Dataset used: as group of 15 phrases in a txt file

In [4]:
# importing needed libraries 
import nltk

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


False

## Step 1: loading the data

In [5]:
def load_data(path):
    with open(path, 'r') as file:
        text = file.read()
        file.close()
    return text

Now we use the method `load_data` to read the txt file containing the data

In [6]:
data = load_data('/kaggle/input/text-similarity/Phrases 2.txt')
print(data)

The cat sat on the mat.
A small cat was sitting on the mat.
The dog barked at the mailman.
The quick brown fox jumps over the lazy dog.
A swift red fox leapt over the sleeping dog.
Birds are flying high in the sky.
Fish swim gracefully in the water.
The sun rises in the east every morning.
Every morning, the sun rises from the east.
The programmer wrote code all night long.


## Step 2: Preprocessing
we first need to split the text into phrases, then apply preprocessing steps to make the similarity calculations more accurate: 
- Removing punctuation 
- Stemming 
- Removing stop words

In [7]:
# split into lines/ phrases
phrases = data.split('\n')
phrases

['The cat sat on the mat.',
 'A small cat was sitting on the mat.',
 'The dog barked at the mailman.',
 'The quick brown fox jumps over the lazy dog.',
 'A swift red fox leapt over the sleeping dog.',
 'Birds are flying high in the sky.',
 'Fish swim gracefully in the water.',
 'The sun rises in the east every morning.',
 'Every morning, the sun rises from the east.',
 'The programmer wrote code all night long.']

In [10]:
import string
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
def preprocessing(phrase):
    # remove punctuation 
    clean = phrase.translate(str.maketrans('', '', string.punctuation))
    # stemming the words and Removing stopwords
    # define ste of stopwords
    stop_words = set(stopwords.words('english'))
    # initialize stemmer
    stemmer = PorterStemmer()
    clean = [stemmer.stem(w) for w in clean.split() if w not in stop_words]
    
    return " ".join(clean)

using the `preprocess()` method on the data

In [11]:
clean_phrases = [preprocessing(phrase) for phrase in phrases]
print(clean_phrases)

['the cat sat mat', 'A small cat sit mat', 'the dog bark mailman', 'the quick brown fox jump lazi dog', 'A swift red fox leapt sleep dog', 'bird fli high sky', 'fish swim grace water', 'the sun rise east everi morn', 'everi morn sun rise east', 'the programm wrote code night long']


## Step 3: Calculating sentence similarity
We will use the **cosine similarity** and pass it **TF-IDF** vectors  
### First: 
we vectorize the text using `TfidVectorizer` in the scikit learn library 

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer 

vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(phrases)
print(features)

  (0, 23)	0.4410638910316001
  (0, 26)	0.4410638910316001
  (0, 32)	0.5188429288856043
  (0, 6)	0.4410638910316001
  (0, 40)	0.3836534207192764
  (1, 33)	0.4341814502339401
  (1, 41)	0.4341814502339401
  (1, 36)	0.4341814502339401
  (1, 23)	0.36909389950681465
  (1, 26)	0.36909389950681465
  (1, 6)	0.36909389950681465
  (1, 40)	0.16052565171591096
  (2, 22)	0.4938704809297742
  (2, 2)	0.4938704809297742
  (2, 3)	0.4938704809297742
  (2, 8)	0.36730603953111063
  (2, 40)	0.3651877839174674
  (3, 19)	0.3908750940103214
  (3, 27)	0.33227954024896456
  (3, 18)	0.3908750940103214
  (3, 13)	0.33227954024896456
  (3, 5)	0.3908750940103214
  (3, 29)	0.3908750940103214
  (3, 8)	0.29070533323229075
  (3, 40)	0.28902883424299725
  :	:
  (6, 39)	0.4617656341185426
  (6, 11)	0.4617656341185426
  (6, 17)	0.3434287182751693
  (6, 40)	0.17072408164132932
  (7, 24)	0.39156916541155534
  (7, 10)	0.39156916541155534
  (7, 9)	0.39156916541155534
  (7, 31)	0.39156916541155534
  (7, 37)	0.39156916541155534
 

### Second we calculate the cosine similarity 
by passing the tf-idf vectors to the `cosine_similarity` method in the scikit learn library 

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
# calculate  the cosine similarity
similarity_matrix = cosine_similarity(features)
# normalize the similarity matrix (min-max normalization) 
normalized_similarities = (similarity_matrix - np.min(similarity_matrix))/(np.max(similarity_matrix)-np.min(similarity_matrix))

## Step 4: Mapping the similarities to colors
we will use the `matplotlib` colormaps to map each similarity score (between 0 and 1), using the map `YlGn`

In [14]:
import matplotlib.pyplot as plt
def map_color(score):
    cmap = plt.get_cmap("viridis")
    return cmap(score)

now we use the created method to convert the similarity scores to **RGB** colors

In [15]:
colors = [map_color(score) for score in normalized_similarities.flatten()]
colors

[(0.993248, 0.906157, 0.143936, 1.0),
 (0.119738, 0.603785, 0.5414, 1.0),
 (0.280255, 0.165693, 0.476498, 1.0),
 (0.283072, 0.130895, 0.449241, 1.0),
 (0.277941, 0.056324, 0.381191, 1.0),
 (0.277941, 0.056324, 0.381191, 1.0),
 (0.279566, 0.067836, 0.391917, 1.0),
 (0.281412, 0.155834, 0.469201, 1.0),
 (0.28229, 0.145912, 0.46151, 1.0),
 (0.277941, 0.056324, 0.381191, 1.0),
 (0.119738, 0.603785, 0.5414, 1.0),
 (0.993248, 0.906157, 0.143936, 1.0),
 (0.277941, 0.056324, 0.381191, 1.0),
 (0.274952, 0.037752, 0.364543, 1.0),
 (0.267004, 0.004874, 0.329415, 1.0),
 (0.267004, 0.004874, 0.329415, 1.0),
 (0.26851, 0.009605, 0.335427, 1.0),
 (0.277018, 0.050344, 0.375715, 1.0),
 (0.276022, 0.044167, 0.370164, 1.0),
 (0.267004, 0.004874, 0.329415, 1.0),
 (0.280255, 0.165693, 0.476498, 1.0),
 (0.277941, 0.056324, 0.381191, 1.0),
 (0.993248, 0.906157, 0.143936, 1.0),
 (0.257322, 0.25613, 0.526563, 1.0),
 (0.274128, 0.199721, 0.498911, 1.0),
 (0.277018, 0.050344, 0.375715, 1.0),
 (0.278791, 0.062145

## Step 5: Generate html code 
using the **BeautifulSoup** library

In [16]:
from bs4 import BeautifulSoup 
import matplotlib 

# instantiate a BeautifulSoup object to create html code 
bsoup = BeautifulSoup('<html><body></body></html>', "html.parser")

now we generate html code to style the phrases inside the body

In [17]:
for i,  sentence in enumerate(phrases[:len(phrases)-1]):
    # create new <span> tag inside body
    span = bsoup.new_tag('span')
    # convert rgb color to hexacolor 
    color = matplotlib.colors.rgb2hex(colors[i])
    # style the span element with the converted color 
    span['style'] = f'background-color: {color}'  # CSS code
    # set the span element to the sentence itself 
    span.string = sentence
    # add the span element to the body of the html
    bsoup.body.append(span)
    # add a break line tag to write the next sentence
    bsoup.body.append(bsoup.new_tag("br"))

now we prettify the html code to make it indented and humanly readable

In [18]:
output = bsoup.prettify()
print(output)

<html>
 <body>
  <span style="background-color: #fde725">
   The cat sat on the mat.
  </span>
  <br/>
  <span style="background-color: #1f9a8a">
   A small cat was sitting on the mat.
  </span>
  <br/>
  <span style="background-color: #472a7a">
   The dog barked at the mailman.
  </span>
  <br/>
  <span style="background-color: #482173">
   The quick brown fox jumps over the lazy dog.
  </span>
  <br/>
  <span style="background-color: #470e61">
   A swift red fox leapt over the sleeping dog.
  </span>
  <br/>
  <span style="background-color: #470e61">
   Birds are flying high in the sky.
  </span>
  <br/>
  <span style="background-color: #471164">
   Fish swim gracefully in the water.
  </span>
  <br/>
  <span style="background-color: #482878">
   The sun rises in the east every morning.
  </span>
  <br/>
  <span style="background-color: #482576">
   Every morning, the sun rises from the east.
  </span>
  <br/>
 </body>
</html>



Finally we write the generated html code to an **.html** file

In [19]:
with open('viridis 2 output.html', "w", encoding = "utf-8") as file:
    file.write(output)