In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
### Load the datasets
df_source1 = pd.read_csv('source_1.csv')
df_source2 = pd.read_csv('source_2.csv')
df_matched_data = pd.read_csv('matched_data.csv')
df_predicted_matches = pd.read_csv('predicted_matches.csv')

In [4]:
### Check the shape of the datasets
print(df_source1.shape)
print(df_source2.shape)

(13238, 2)
(48943, 2)


In [5]:
### Check the datatype of the datasets
print(df_source1.dtypes)
print(df_source2.dtypes)

id       int64
name    object
dtype: object
id       int64
name    object
dtype: object


In [6]:
### Check columns names
print(df_source1.columns)
print(df_source2.columns)

Index(['id', 'name'], dtype='object')
Index(['id', 'name'], dtype='object')


In [7]:
### Check rows of the dataset
print(df_source1.index)
print(df_source2.index)

RangeIndex(start=0, stop=13238, step=1)
RangeIndex(start=0, stop=48943, step=1)


In [8]:
### Set to display all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [9]:
### Display the datasets
df_source1.head(10)

Unnamed: 0,id,name
0,0,"Horses, asses, mules and hinnies; live, pure-b..."
1,1,"Horses; live, pure-bred breeding animals"
2,2,"Horses; live, other than pure-bred breeding an..."
3,4,"Horses, asses, mules and hinnies; live, other ..."
4,5,"Bovine animals; live, pure-bred breeding animals"
5,6,"Cattle; live, pure-bred breeding animals"
6,7,"Cattle; live, other than pure-bred breeding an..."
7,8,"Buffalo; live, pure-bred breeding animals"
8,9,"Buffalo; live, other than pure-bred breeding a..."
9,10,"Bovine animals; live, other than pure-bred bre..."


In [10]:
df_source2.head(10)

Unnamed: 0,id,name
0,0,leveillula lactucae-serriolae
1,1,podosphaera aphanis
2,2,lathyrus czeczottianus
3,3,crocus biflorus subsp. caricus
4,4,hordeum brevisubulatum
5,5,vinca major subsp. major
6,6,geranium psilostemon
7,7,cantharellaceae
8,8,liatris spicata
9,9,potato pulp


#### For comparison purposes

In [11]:
df_matched_data.head(10)

Unnamed: 0,source_1,source_2
0,"Cereals, barley",barley
1,Other fresh or chilled potatoes,potatoes
2,"Fruit, edible; cherries, fresh",cherries
3,"Bran, sharps and other residues, of maize",maize bran
4,"Cereals, millet",millets
5,"Oil seeds; sesamum seeds, whether or not broken",sesame seed
6,Cigarettes containing tobacco,cigarettes
7,"Tomato, fresh or chilled",tomatoes
8,"Pulp, bagasse and other waste of sugar manufac...",beet pulp
9,"Oil seeds; sunflower seeds, whether or not broken",sunflower seed


In [12]:
df_predicted_matches.head(15)

Unnamed: 0,source_1,source_2
0,101,201
1,102,202
2,103,203
3,104,204
4,105,205
5,106,206
6,107,207
7,108,208
8,109,209
9,110,210


In [13]:
df_predicted_matches.shape

(15, 2)

In [14]:
df_source1.head()

Unnamed: 0,id,name
0,0,"Horses, asses, mules and hinnies; live, pure-b..."
1,1,"Horses; live, pure-bred breeding animals"
2,2,"Horses; live, other than pure-bred breeding an..."
3,4,"Horses, asses, mules and hinnies; live, other ..."
4,5,"Bovine animals; live, pure-bred breeding animals"


In [15]:
df_source2.head()

Unnamed: 0,id,name
0,0,leveillula lactucae-serriolae
1,1,podosphaera aphanis
2,2,lathyrus czeczottianus
3,3,crocus biflorus subsp. caricus
4,4,hordeum brevisubulatum


In [16]:
### merge the datasets to form columns for comparison
df_merged = df_source1.merge(df_source2, on=["id"], how="right")

In [17]:
#### Display the datasets
df_merged.head()

Unnamed: 0,id,name_x,name_y
0,0,"Horses, asses, mules and hinnies; live, pure-b...",leveillula lactucae-serriolae
1,0,"Wood in the rough, even peeled, or roughly squ...",leveillula lactucae-serriolae
2,1,"Horses; live, pure-bred breeding animals",podosphaera aphanis
3,1,"Wood in the rough, even peeled, or roughly squ...",podosphaera aphanis
4,2,"Horses; live, other than pure-bred breeding an...",lathyrus czeczottianus


In [18]:
### Shape of the marged dataset
print(df_merged.name_x.shape)
print(df_merged.name_y.shape)

(49884,)
(49884,)


In [19]:
### Check for total nulls in the marged datased

df_merged.isna().sum()

id            0
name_x    36730
name_y        0
dtype: int64

In [20]:
### Drop nulls
df_merged = df_merged.dropna()

In [21]:
### Check if there's nulls in new datdrame
df_merged.isna().sum()

id        0
name_x    0
name_y    0
dtype: int64

In [22]:
### Check the shape of the merged datframe with no nulls
print(df_merged.name_x.shape)
print(df_merged.name_y.shape)

(13154,)
(13154,)


In [23]:
#### Check the values of index 0 of columns name_x before dropping nulls
sent = df_merged.name_x[0]
print(sent)

Horses, asses, mules and hinnies; live, pure-bred breeding animals


In [24]:
#### Check the values of index 10 of columns name_y before dropping nulls
sent2 = df_merged.name_y[10]
sent2

'vinca major subsp. major'

In [25]:
### Display the dataset for checking
df_merged.head(15)

Unnamed: 0,id,name_x,name_y
0,0,"Horses, asses, mules and hinnies; live, pure-b...",leveillula lactucae-serriolae
1,0,"Wood in the rough, even peeled, or roughly squ...",leveillula lactucae-serriolae
2,1,"Horses; live, pure-bred breeding animals",podosphaera aphanis
3,1,"Wood in the rough, even peeled, or roughly squ...",podosphaera aphanis
4,2,"Horses; live, other than pure-bred breeding an...",lathyrus czeczottianus
5,2,"Wood in the rough, even peeled, or roughly squ...",lathyrus czeczottianus
6,3,"Wood in the rough, even peeled, or roughly squ...",crocus biflorus subsp. caricus
7,4,"Horses, asses, mules and hinnies; live, other ...",hordeum brevisubulatum
8,4,"Wood in the rough, even peeled, or roughly squ...",hordeum brevisubulatum
9,5,"Bovine animals; live, pure-bred breeding animals",vinca major subsp. major


In [26]:
### Define x and y as  empty nd  loop to loop through x and y and append to them
x = []
y = []
for x1 in df_merged["name_x"]:
  x.append(x1)

for y1 in df_merged["name_y"]:
  y.append(y1)

In [27]:
#### import TfidVectorizer to change words into numbers since deals with numbers and linear kernel cosine similirities to colect similar numbers and their indexes 
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_merged.name_x)
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [28]:
### Define x and y as input data and their vlues
### For the first 100 
y = y[:100]
x = x[:100]

In [29]:
simis = [] ### empty list for where similar words will be appended to
indexess = [] ### empty list for appending indexes for similar words

def similar(keyword): ### function for similar keywords
    from gensim import corpora, models, similarities ### import gensim for creating models for similarities
    import jieba
    texts = x
    texts = [jieba.lcut(text) for text in texts] ### check the text of words
    dictionary = corpora.Dictionary(texts) ### create dictionary of words
    feature_cnt = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    #### change the text of words into numbers
    tfidf = models.TfidfModel(corpus) 
    kw_vector = dictionary.doc2bow(jieba.lcut(keyword))
    ### create the index of similar words to append into the empty list
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features = feature_cnt)
    sim = index[tfidf[kw_vector]]
    
### loop for checking similar keywords with similarities and indexes nd create a new list
    similarities = []
    my_list = []
    for i in range(len(sim)):
        similarities.append(sim[i])
        my_list.append((i+1, sim[i]))

    max_value = None

    for num in similarities:
        if (max_value is None or num > max_value):
            max_value = num
### loop through a new list to compare their similar index max value and append them
    for value in my_list:
        if max_value == value[1]:
            global simi, indexes
            indexes = value[0]
            indexess.append(indexes)
            indexk = y.index(keyword)
            simis.append(indexk)
           
### loop for keyword in y column for similar keyword in x column
for word in y:
    keyword = word
    similar(keyword)



Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.661 seconds.
Prefix dict has been built successfully.


In [30]:
#### Print indexes
print(len(indexess))

991


In [31]:
### print the lengths of the similar words
print(len(simis))

991


In [32]:
### display the indexes and their similarities as a dataframe
df = pd.DataFrame({'source_1': indexess,
                   'source_2': simis,
                  })

In [34]:
### Display the dtaframe and print output into  csv file
print(df.head(50))
df.to_csv('rovinewnjala99@gmail.com.csv')

    source_1  source_2
0          3         0
1          3         0
2         39         2
3         39         2
4         39         4
5         39         4
6         84         6
7         39         7
8         39         7
9         84         9
10        84         9
11        39        11
12        39        11
13         1        13
14         2        13
15         3        13
16         4        13
17         5        13
18         6        13
19         7        13
20         8        13
21         9        13
22        10        13
23        11        13
24        12        13
25        13        13
26        14        13
27        15        13
28        16        13
29        17        13
30        18        13
31        19        13
32        20        13
33        21        13
34        22        13
35        23        13
36        24        13
37        25        13
38        26        13
39        27        13
40        28        13
41        29        13
42        3