# Text Similarity Techniques

In [1]:
# Import the IMDB movie-review dataset loader from Keras
from keras.datasets import imdb

In [2]:
# Load the IMDB reviews. The loader hands back two (inputs, labels) pairs,
# one for training and one for testing, so no manual split is done here.
train_split, test_split = imdb.load_data()
x_trg, y_trg = train_split
x_test, y_test = test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [3]:
# Import the Keras sequence utilities (used below to pad the reviews)
from keras.preprocessing import sequence

In [4]:
# Bring every review to the same length so each split becomes a
# rectangular 2-D array (one row per review).
MAX_REVIEW_LEN = 200

x_trg = sequence.pad_sequences(x_trg, maxlen=MAX_REVIEW_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_REVIEW_LEN)

# Report the resulting (n_reviews, MAX_REVIEW_LEN) shapes.
for caption, padded in (("Training data: ", x_trg), ("Test data: ", x_test)):
    print(caption, padded.shape)

Training data:  (25000, 200)
Test data:  (25000, 200)


#### Cosine Similarity

In [5]:
# Imports for the cosine-similarity based recommendation example
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Cosine similarity between every pair of padded training reviews.
# The result is square (one row and one column per review), so its size
# grows quadratically with the number of reviews.
# NOTE(review): x_trg presumably holds padded token-id sequences rather than
# term-frequency vectors — confirm that comparing raw ids is intended.
cosine_sim = cosine_similarity(X=x_trg)
cosine_df = pd.DataFrame(data=cosine_sim)
cosine_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24990,24991,24992,24993,24994,24995,24996,24997,24998,24999
0,1.0,0.07267,0.154264,0.043826,0.065885,0.046102,0.019121,0.040435,0.060737,0.085905,...,0.019745,0.016488,0.423707,0.076214,0.021493,0.119296,0.048075,0.04267,0.028654,0.048103
1,0.07267,1.0,0.029624,0.036726,0.045839,0.018293,0.116955,0.033856,0.077837,0.176688,...,0.070657,0.029367,0.108633,0.093391,0.086931,0.032916,0.034615,0.110197,0.09666,0.185486
2,0.154264,0.029624,1.0,0.042385,0.031119,0.655557,0.066579,0.02485,0.014051,0.042929,...,0.064521,0.091506,0.094412,0.012577,0.016174,0.032766,0.016008,0.024737,0.047395,0.021324
3,0.043826,0.036726,0.042385,1.0,0.030001,0.009758,0.165981,0.097744,0.055563,0.048086,...,0.101616,0.00902,0.079001,0.149964,0.038746,0.118559,0.120469,0.056657,0.107109,0.060854
4,0.065885,0.045839,0.031119,0.030001,1.0,0.041035,0.043895,0.051612,0.04058,0.090429,...,0.037689,0.060047,0.080045,0.095757,0.030274,0.083611,0.015712,0.047592,0.057869,0.039025


In [8]:
# Similarity scores of training review 1 against every training review,
# extracted from the pairwise matrix as a plain NumPy array.
cosine_row = cosine_df.iloc[1]
cosine_sim_movies = cosine_row.to_numpy()
print("Cosine - Movie Similarities: \n", cosine_sim_movies)

Cosine - Movie Similarities: 
 [0.07266999 1.         0.02962361 ... 0.11019704 0.09665961 0.18548562]


In [9]:
# Rank all reviews by descending cosine similarity to review 1 (negating the
# scores turns argsort's ascending order into descending) and keep the five
# best positions. The query review scores 1.0 against itself, so it is
# part of this list.
cosine_ranking = np.argsort(-cosine_sim_movies)
cosine_sim_index = cosine_ranking[:5]
print("Cosine - Index of similar movies: \n", cosine_sim_index)

Cosine - Index of similar movies: 
 [    1 22259 19517  7491 18171]


#### Euclidean Distance

In [10]:
# Import necessary packages
from sklearn.metrics.pairwise import euclidean_distances

In [11]:
# Euclidean distance between every pair of padded training reviews.
# Despite the "_sim" suffix these are distances: smaller means closer,
# and each review is at distance 0 from itself.
euclidean_sim = euclidean_distances(X=x_trg)
euclidean_df = pd.DataFrame(data=euclidean_sim)
euclidean_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24990,24991,24992,24993,24994,24995,24996,24997,24998,24999
0,0.0,90393.905198,80616.01496,124616.442603,57948.740409,109862.495862,152179.936802,85126.064957,77638.409811,57531.174853,...,55842.033434,137153.877189,74563.655584,54079.994952,117772.660329,86932.868019,125525.033687,140375.841786,81000.662084,133648.562742
1,90393.905198,0.0,101780.97081,135844.363888,79949.045535,123744.012429,154621.905201,101095.68704,93468.346626,76645.149338,...,77484.536593,146613.302967,104253.692102,76679.776793,125425.507509,105693.110755,137004.831736,145280.549428,94128.765949,134074.210041
2,80616.01496,101780.97081,0.0,131531.536914,73037.192108,74466.971544,155040.537218,95982.417301,90578.004085,72801.615106,...,70057.816409,139042.939562,100146.881279,70077.11461,125627.921065,100410.483566,134173.332388,147520.824994,90824.375208,141571.890278
3,124616.442603,135844.363888,131531.536914,0.0,117026.988725,150559.35312,168265.881931,127827.663102,126442.713531,116595.796824,...,114420.388065,170602.524205,134359.132101,113687.452962,153308.069155,129626.245386,153208.055177,170066.395323,124705.252027,164592.080277
4,57948.740409,79949.045535,73037.192108,117026.988725,0.0,100756.131208,144676.731495,71868.286149,63995.141464,35117.477287,...,30441.249005,128238.48769,81457.868883,27761.681775,108413.786134,77441.548635,118597.865086,133373.084953,66112.315335,126722.440799


In [12]:
# Distances from training review 10 to every training review,
# extracted from the pairwise matrix as a plain NumPy array.
euclidean_row = euclidean_df.iloc[10]
euclidean_sim_movies = euclidean_row.to_numpy()
print("Euclidean - Movie Similarities: \n", euclidean_sim_movies)

Euclidean - Movie Similarities: 
 [ 66609.50129674  85070.30586521  80360.113371   ... 134812.86823594
  74133.41518506 130115.1228797 ]


In [13]:
# Rank all reviews by ascending Euclidean distance to review 10 and keep the
# five nearest positions. The query review is at distance 0 from itself,
# so it is part of this list.
euclidean_ranking = np.argsort(euclidean_sim_movies)
euclidean_sim_index = euclidean_ranking[:5]
print("Euclidean - Index of similar movies: \n", euclidean_sim_index)

Euclidean - Index of similar movies: 
 [   10 24142 19348  6910    51]


#### Manhattan Distance

In [14]:
# Import the necessary package
from sklearn.metrics.pairwise import manhattan_distances

In [15]:
# Manhattan (L1) distance between every pair of padded training reviews.
# Despite the "_sim" suffix these are distances: smaller means closer,
# and each review is at distance 0 from itself.
manhattan_sim = manhattan_distances(X=x_trg)
manhattan_df = pd.DataFrame(data=manhattan_sim)
manhattan_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24990,24991,24992,24993,24994,24995,24996,24997,24998,24999
0,0.0,473053.0,312022.0,751257.0,266809.0,354001.0,646067.0,423373.0,360344.0,260728.0,...,238832.0,525220.0,448514.0,244612.0,604617.0,443122.0,556214.0,672255.0,378240.0,573529.0
1,473053.0,0.0,441293.0,864882.0,371808.0,470592.0,671178.0,532436.0,450475.0,344707.0,...,348909.0,596943.0,586919.0,333023.0,674670.0,586061.0,684365.0,717524.0,462227.0,588994.0
2,312022.0,441293.0,0.0,708417.0,226587.0,216275.0,573251.0,400905.0,347946.0,232100.0,...,188442.0,450088.0,463544.0,201370.0,563277.0,443686.0,536402.0,641917.0,333104.0,550897.0
3,751257.0,864882.0,708417.0,0.0,670924.0,758790.0,892974.0,760708.0,748611.0,665727.0,...,600217.0,919827.0,832933.0,615791.0,963240.0,790467.0,861695.0,1004304.0,732759.0,953638.0
4,266809.0,371808.0,226587.0,670924.0,0.0,247320.0,521456.0,321150.0,266615.0,159941.0,...,129613.0,399783.0,412291.0,121619.0,499596.0,375501.0,487219.0,568768.0,263601.0,473014.0


In [16]:
# Distances from training review 305 to every training review,
# extracted from the pairwise matrix as a plain NumPy array.
manhattan_row = manhattan_df.iloc[305]
manhattan_sim_movies = manhattan_row.to_numpy()
print("Manhattan - Movie Similarities: \n", manhattan_sim_movies)

Manhattan - Movie Similarities: 
 [578862. 659769. 547694. ... 853833. 581902. 755761.]


In [17]:
# Rank all reviews by ascending Manhattan distance to review 305 and keep
# the five nearest positions. The query review is at distance 0 from
# itself, so it is part of this list.
manhattan_ranking = np.argsort(manhattan_sim_movies)
manhattan_sim_index = manhattan_ranking[:5]
print("Manhattan - Index of similar movies: \n", manhattan_sim_index)

Manhattan - Index of similar movies: 
 [  305 11156 13051   430  8934]
