In [1]:
import pandas as pd
import numpy as np

In [29]:
data = pd.read_csv('Precily_Text_Similarity.csv')
data.shape

(3000, 2)

In [30]:
data.sample(5)

Unnamed: 0,text1,text2
123,eu referendum could cost £80m it could cost £...,2d metal slug offers retro fun like some drill...
2296,stars pay tribute to actor davis hollywood sta...,ethnic producers face barriers minority ethn...
2710,howl helps boost japan s cinemas japan s box o...,jugnot tops french actor league actor gerard...
2506,murray returns to scotland fold euan murray ha...,show over for mtv s the osbournes rock star oz...
1659,fit-again betsen in france squad france have b...,o sullivan quick to hail italians ireland coac...


In [32]:
data['text2'][0]

'gardener wins double in glasgow britain s jason gardener enjoyed a double 60m success in glasgow in his first competitive outing since he won 100m relay gold at the athens olympics.  gardener cruised home ahead of scot nick smith to win the invitational race at the norwich union international. he then recovered from a poor start in the second race to beat swede daniel persson and italy s luca verdecchia. his times of 6.61 and 6.62 seconds were well short of american maurice greene s 60m world record of 6.39secs from 1998.  it s a very hard record to break  but i believe i ve trained very well   said the world indoor champion  who hopes to get closer to the mark this season.  it was important to come out and make sure i got maximum points. my last race was the olympic final and there was a lot of expectation.  this was just what i needed to sharpen up and get some race fitness. i m very excited about the next couple of months.   double olympic champion  marked her first appearance on h

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import spacy
nlp = spacy.load("en_core_web_sm")

def text_processing(sentence):
    """
    Lemmatize, lowercase, remove numbers and stop words
    
    Args:
      sentence: The sentence we want to process.
    
    Returns:
      A list of processed words
    """
    sentence = [token.lemma_.lower()
                for token in nlp(sentence) 
                if token.is_alpha and not token.is_stop]
    
    return sentence


def cos_sim(sentence1_emb, sentence2_emb):
    """
    Cosine similarity between two columns of sentence embeddings
    
    Args:
      sentence1_emb: sentence1 embedding column
      sentence2_emb: sentence2 embedding column
    
    Returns:
      The row-wise cosine similarity between the two columns.
      For instance is sentence1_emb=[a,b,c] and sentence2_emb=[x,y,z]
      Then the result is [cosine_similarity(a,x), cosine_similarity(b,y), cosine_similarity(c,z)]
    """
    cos_sim = cosine_similarity(sentence1_emb, sentence2_emb)
    return np.diag(cos_sim)

In [5]:
import textdistance

def jac_sim(rows):
    #Text Processing
    sen1 = text_processing(rows['text1'])
    sen2 = text_processing(rows['text2'])
    # jaccard similarity
    return textdistance.jaccard.normalized_similarity(sen1, sen2)

In [6]:
jac_sim(data.loc[0])

0.045454545454545414

In [7]:
data['Jaccard_Similarity'] = data.apply(jac_sim, axis=1)
data

KeyboardInterrupt: 

In [None]:
data.loc[2998]['text1']

In [None]:
data.loc[2998]['text2']

In [10]:
# TFIDF
data_1 = data.drop('Jaccard_Similarity', axis=1)

In [22]:
from sklearn.model_selection import train_test_split

data_1_train, data_1_test = train_test_split(data_1, test_size=0.2)

In [12]:
data_1_train.shape, data_1_test.shape

((2400, 2), (600, 2))

In [23]:
data_1_train.apply(lambda col: col.drop_duplicates().reset_index(drop=True))
data_1_test.apply(lambda col: col.drop_duplicates().reset_index(drop=True))
data_1_test

Unnamed: 0,text1,text2
2528,double win for sea inside spanish movie the se...,strike threat over pension plans millions of p...
2800,day-lewis set for berlin honour actor daniel d...,mandelson warns bbc on campbell the bbc should...
1563,moody joins up with england lewis moody has fl...,us trade gap hits record in 2004 the gap betwe...
1516,playstation 3 chip to be unveiled details of t...,germany calls for eu reform german chancellor ...
1314,o sullivan quick to hail italians ireland coac...,mansfield 0-1 leyton orient an second-half goa...
...,...,...
1710,healey targets england comeback leicester wing...,film star fox behind theatre bid leading actor...
1483,radcliffe proves doubters wrong this won t go ...,thanou bullish over drugs hearing katerina tha...
1125,musical treatment for capra film the classic f...,games firms face tough future uk video game ...
47,japan bank shares up on link talk shares of su...,satellite mapping aids darfur relief aid worke...


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
model = TfidfVectorizer(lowercase=True, stop_words='english')

# Train the model
X_train = pd.concat([data_1_train['text1'], data_1_train['text2']]).unique()
model.fit(X_train)

# Generate Embeddings on Test
sentence1_emb = model.transform(data_1_test['text1'])
sentence2_emb = model.transform(data_1_test['text2'])

# Cosine Similarity
data_1_test['TFIDF_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
model = TfidfVectorizer(lowercase=True, stop_words='english')

X_train = pd.concat([data_1['text1'], data_1['text2']]).unique()
model.fit(X_train)

sentence1_emb = model.transform(data_1['text1'])
sentence2_emb = model.transform(data_1['text2'])

data_1['TFIDF_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)

In [28]:
data_1

Unnamed: 0,text1,text2,TFIDF_cosine_score
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...,0.058171
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...,0.006190
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...,0.006456
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...,0.013339
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...,0.018365
...,...,...,...
2995,uk directors guild nominees named martin scors...,steel firm to cut 45 000 jobs mittal steel ...,0.002250
2996,u2 to play at grammy awards show irish rock ba...,israel looks to us for bank chief israel has a...,0.017754
2997,pountney handed ban and fine northampton coach...,india and iran in gas export deal india has si...,0.004470
2998,belle named best scottish band belle & sebas...,mido makes third apology ahmed mido hossam h...,0.012517


In [26]:
data_1_test

Unnamed: 0,text1,text2,TFIDF_cosine_score
2528,double win for sea inside spanish movie the se...,strike threat over pension plans millions of p...,0.009779
2800,day-lewis set for berlin honour actor daniel d...,mandelson warns bbc on campbell the bbc should...,0.004350
1563,moody joins up with england lewis moody has fl...,us trade gap hits record in 2004 the gap betwe...,0.008041
1516,playstation 3 chip to be unveiled details of t...,germany calls for eu reform german chancellor ...,0.014892
1314,o sullivan quick to hail italians ireland coac...,mansfield 0-1 leyton orient an second-half goa...,0.069606
...,...,...,...
1710,healey targets england comeback leicester wing...,film star fox behind theatre bid leading actor...,0.014793
1483,radcliffe proves doubters wrong this won t go ...,thanou bullish over drugs hearing katerina tha...,0.050090
1125,musical treatment for capra film the classic f...,games firms face tough future uk video game ...,0.016638
47,japan bank shares up on link talk shares of su...,satellite mapping aids darfur relief aid worke...,0.025065


In [8]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt') # if necessary...


stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1]


print(cosine_sim('a little bird', 'a little bird'))
print(cosine_sim('a little bird', 'a little bird chirps'))
print(cosine_sim('a little bird', 'a big dog barks'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ajmer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0.9999999999999998
0.7092972666062738
0.0




In [9]:
for i in range(10):
    print(cosine_sim(data.loc[i]['text1'], data.loc[i]['text2']))

0.08894445518721168
0.06509250367292911
0.05645131454403604
0.04902980192520687
0.09583909566940436
0.09962980334371284
0.033419765407882496
0.09145585435809597
0.0677872982691842
0.0566522737931877


In [10]:
data_2 = data.iloc[:, :2]
data_2

Unnamed: 0,text1,text2
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...
...,...,...
2995,uk directors guild nominees named martin scors...,steel firm to cut 45 000 jobs mittal steel ...
2996,u2 to play at grammy awards show irish rock ba...,israel looks to us for bank chief israel has a...
2997,pountney handed ban and fine northampton coach...,india and iran in gas export deal india has si...
2998,belle named best scottish band belle & sebas...,mido makes third apology ahmed mido hossam h...


In [11]:
similarities = []
for i in range(data_2.shape[0]):
    similarity_score = cosine_sim(data['text1'][i], data['text2'][i])
    similarities.append(similarity_score)

data_2['Cosine_sim'] = similarities
data_2.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2['Cosine_sim'] = similarities


Unnamed: 0,text1,text2,Cosine_sim
917,eminem beats elvis to number one rapper eminem...,sayeed to stand down as tory mp tory mp jonath...,0.065849
1405,vera drake scoops film award oscar hopefuls mi...,firefox browser takes on microsoft microsoft s...,0.048046
864,e-university disgraceful waste a failed gove...,south bank awards honour hit soap coronation s...,0.018558
2062,wenger steps up row arsene wenger has stepped ...,labour accused of broken pledge labour has alr...,0.093862
2225,chancellor rallies labour voters gordon brown ...,federer claims dubai crown world number one ro...,0.077264


In [28]:
cosine_sim('I am Siddharth Agarwal', 'My name is Siddharth Agarwal')

0.9999999999999998

In [12]:
similarity = []
for i in similarities:
    if i >= 0.05:
        i = 1
        similarity.append(i)
    else:
        i = 0
        similarity.append(i)
print(similarity)

[1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 

In [None]:
cosine_sim(data['text1'][i], data['text2'][i])

In [13]:
data_2['sim_score'] = similarity
data_2.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2['sim_score'] = similarity


Unnamed: 0,text1,text2,Cosine_sim,sim_score
653,labour attacked on howard poster labour has be...,radcliffe tackles marathon tasks paula radclif...,0.073785,1
366,apple unveils low-cost mac mini apple has un...,celebrities get to stay in jungle all four con...,0.019766,0
2613,sainsbury s labour election gift science minis...,amex shares up on spin-off news shares in amer...,0.094634,1
51,spam e-mails tempt net shoppers computer users...,call to save manufacturing jobs the trades uni...,0.08592,1
1344,ten-year tragedy of missing manic richey edwar...,ebbers denies worldcom fraud former worldcom c...,0.046223,0


In [14]:
data_2.drop('Cosine_sim', axis=1, inplace=True)
data_2.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2.drop('Cosine_sim', axis=1, inplace=True)


Unnamed: 0,text1,text2,sim_score
349,tarantino to direct csi episode film director ...,mcilroy wins 800m indoor title james mcilroy m...,1
1882,liberian economy starts to grow the liberian e...,chancellor rallies labour voters gordon brown ...,1
546,saab to build cadillacs in sweden general moto...,fiat chief takes steering wheel the chief exec...,1
345,bank set to leave rates on hold uk interest ra...,france v wales (sat) stade de france paris s...,1
950,navratilova hits out at critics martina navrat...,bargain calls widen softbank loss japanese com...,0


In [15]:
data_2.loc[590]['text2']



In [16]:
X = data_2.iloc[:,:-1]
y = data_2['sim_score']

In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)

(2400, 2) (600, 2)


In [18]:
y_train.shape, y_test.shape

((2400,), (600,))

## Checkpoint 1

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [20]:
text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None)

In [21]:
data_2.sim_score.value_counts()

1    2030
0     970
Name: sim_score, dtype: int64

In [22]:
max_vocab_length = 10000
max_length = 15 
print(round(sum([len(i.split()) for i in X_train['text1']])/len(X_train['text1'])))
print(round(sum([len(i.split()) for i in X_train['text2']])/len(X_train['text2'])))

392
399


In [23]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=399, # how long is each input
                             name="embedding_1") 

In [80]:
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)
outputs = layers.Dense(1, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

ValueError: Exception encountered when calling layer "text_vectorization" (type TextVectorization).

When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, 1, 2) with rank=3

Call arguments received by layer "text_vectorization" (type TextVectorization):
  • inputs=tf.Tensor(shape=(None, 1, 2), dtype=string)

In [78]:
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [79]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding_1 (Embedding)     (None, None, 128)         1280000   
                                                                 
 global_average_pooling1d_1   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [75]:
model_1_history = model_1.fit(X_train, y_train,epochs=5)

Epoch 1/5


ValueError: in user code:

    File "C:\Users\Ajmer\anaconda3\envs\semantic_similarity\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Ajmer\anaconda3\envs\semantic_similarity\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Ajmer\anaconda3\envs\semantic_similarity\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Ajmer\anaconda3\envs\semantic_similarity\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\Ajmer\anaconda3\envs\semantic_similarity\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Ajmer\anaconda3\envs\semantic_similarity\lib\site-packages\keras\layers\preprocessing\text_vectorization.py", line 564, in _preprocess
        raise ValueError(

    ValueError: Exception encountered when calling layer "text_vectorization" "                 f"(type TextVectorization).
    
    When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(32, 2) with rank=2
    
    Call arguments received by layer "text_vectorization" "                 f"(type TextVectorization):
      • inputs=tf.Tensor(shape=(32, 2), dtype=string)
