# Compare multiple string columns to return similarity scores and longest partial string matches

In [None]:
import numpy as np
import pandas as pd
from difflib import SequenceMatcher

In [None]:
# Load the first input table from CSV.
file = 'data1.csv'
data1 = pd.read_csv(filepath_or_buffer=file)

In [None]:
# Load the second input table from CSV (rebinds `file` to the new path).
file = 'data2.csv'
data2 = pd.read_csv(filepath_or_buffer=file)

In [None]:
# Stack both inputs into one table.
# ignore_index=True gives the result a fresh 0..n-1 index directly, instead of
# materialising the old index as an 'index' column and deleting it afterwards
# (that detour raises if an input already has a column named 'index').
frames = [data1, data2]
res_dt = pd.concat(frames, ignore_index=True)

In [None]:
# Replace missing values (NaN) with empty strings.
data = res_dt.replace(to_replace=np.nan, value='', regex=True)

In [None]:
# Keep only the first row for each distinct 'name'; `data` is modified in place.
data.drop_duplicates(subset=['name'], keep='first', inplace=True)

Use `SequenceMatcher` to compute the similarity between two string columns. The score is out of 100 and is rounded to 2 decimal places.

In [None]:
from difflib import SequenceMatcher

def sim_score(df, col1, col2):
    """Return the similarity of two fields of a record as a percentage.

    Args:
        df: A record indexable by ``col1`` and ``col2`` -- e.g. the row that
            ``DataFrame.apply(..., axis=1)`` passes in, or a plain dict.
        col1: Key of the first field to compare.
        col2: Key of the second field to compare.

    Returns:
        ``SequenceMatcher.ratio()`` of the two values scaled to 0-100 and
        rounded to 2 decimal places.
    """
    # Coerce to str: a non-string cell (e.g. a number parsed by read_csv) has
    # no len(), which would make SequenceMatcher raise TypeError.
    left, right = str(df[col1]), str(df[col2])
    return round(SequenceMatcher(None, left, right).ratio() * 100, 2)

Compute the similarity score for several pairs of string columns, then compare them and keep the maximum score for each row.

In [None]:
# Score col1 against each candidate column, one row at a time (axis=1).
# The first column is named 'score1' so that it matches the lookup below;
# it was previously written as 'score', which made the max() line raise
# KeyError: 'score1'.
data['score1'] = data.apply(sim_score, args=('col1', 'col2'), axis=1)
data['score2'] = data.apply(sim_score, args=('col1', 'col3'), axis=1)

# Best of the two scores for each row.
data['max_score'] = data[['score1', 'score2']].max(axis=1)
data

In [None]:
# Write the scored table to disk.
data.to_csv(path_or_buf="match_score.csv")

Compare multiple string columns and find the longest matching keyword (the longest substring shared by the two strings).

In [None]:
from difflib import SequenceMatcher

def sim_keyword(df, col1, col2):
    """Return the longest contiguous run of characters shared by two fields.

    Args:
        df: A record indexable by ``col1`` and ``col2`` -- e.g. the row that
            ``DataFrame.apply(..., axis=1)`` passes in, or a plain dict.
        col1: Key of the field the returned substring is taken from.
        col2: Key of the field to compare against.

    Returns:
        The longest common substring as found by
        ``SequenceMatcher.find_longest_match``, or ``''`` when the two values
        have nothing in common.
    """
    # Coerce to str: a non-string cell (e.g. a number parsed by read_csv) has
    # no len(), which would raise TypeError below.
    left, right = str(df[col1]), str(df[col2])
    match = SequenceMatcher(None, left, right).find_longest_match(
        0, len(left), 0, len(right))
    # A zero-size match slices to '', so no separate "no match" branch is needed.
    return left[match.a:match.a + match.size]

In [None]:
# Longest substring that col1 shares with each of the other two columns,
# evaluated one row at a time.
data['matched_keyword'] = data.apply(
    lambda row: sim_keyword(row, 'col1', 'col2'), axis=1)
data['matched_keyword2'] = data.apply(
    lambda row: sim_keyword(row, 'col1', 'col3'), axis=1)

data

In [None]:
# Clean up the matched keywords: drop rows whose keyword contains a digit,
# then strip single-character words and hyphens from what is left.

# .copy() makes data2 an independent frame, so the assignment below does not
# write into a slice of `data` (which triggers SettingWithCopyWarning).
data2 = data[~data['matched_keyword'].str.contains(r'[0-9]', regex=True)].copy()
data2['matched_keyword'] = (
    data2['matched_keyword']
    # regex=True is required here: since pandas 2.0, str.replace treats the
    # pattern as a literal string by default, which would turn the first two
    # replacements into silent no-ops.
    .str.replace(r'\b\w\b', '', regex=True)  # single-character words
    .str.replace(r'\s+', ' ', regex=True)    # collapse whitespace runs
    .str.replace('-', '', regex=False)       # hyphens
)
data2

In [None]:
# Write the cleaned keyword table to disk.
data2.to_csv(path_or_buf="match keyword.csv")

This is a modified version of the text clustering and text similarity approaches described in the following post: http://www.lumenai.fr/blog/quick-review-on-text-clustering-and-text-similarity-approaches

Great example of k-means clustering using tf-idf vectors with the scikit-learn implementation

In [None]:
import collections
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint


def word_tokenizer(text):
    """Tokenize *text*, drop English stop words, and stem the remaining tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of Porter-stemmed tokens, in their original order, with every
        token found in NLTK's English stop-word list removed.
    """
    # Fetch the stop-word list once and keep it in a set. It was previously
    # re-fetched with stopwords.words('english') for every single token.
    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    return [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in english_stop_words
    ]


def cluster_sentences(sentences, nb_of_clusters, random_state=None):
    """Group sentences into k-means clusters based on their tf-idf vectors.

    Args:
        sentences: Iterable of strings to cluster.
        nb_of_clusters: Number of clusters to fit.
        random_state: Optional seed forwarded to ``KMeans`` so that a run can
            be reproduced. The default (``None``) leaves the initialisation
            unseeded, as before.

    Returns:
        A dict mapping each cluster label (as a plain ``int``) to the list of
        positions (0-based, in iteration order of ``sentences``) of the
        sentences assigned to that cluster.
    """
    tfidf_vectorizer = TfidfVectorizer(
        tokenizer=word_tokenizer,
        stop_words=stopwords.words('english'),
        max_df=0.9,
        min_df=0.1,
        lowercase=True,
    )
    # Build the tf-idf matrix for the sentences.
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

    kmeans = KMeans(n_clusters=nb_of_clusters, random_state=random_state)
    kmeans.fit(tfidf_matrix)

    clusters = collections.defaultdict(list)
    for position, label in enumerate(kmeans.labels_):
        # int() turns the NumPy integer label into a plain int; it hashes and
        # compares equal to the original, so existing lookups keep working.
        clusters[int(label)].append(position)
    return dict(clusters)

Return the result as a dataframe: the names/sentences in the column are clustered into groups based on their similarity, and each name/sentence is returned together with its cluster group number. This can be used to check sentence similarity and to cluster sentences.

In [None]:
if __name__ == "__main__":
    sentences = data['name']
    nclusters = 10
    clusters = cluster_sentences(sentences, nclusters)
    cols = ['cluster group', 'name']

    # Collect all rows first and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for cluster in range(nclusters):
        # .get() tolerates a label that ended up with no members.
        for position in clusters.get(cluster, []):
            # cluster_sentences returns positions, not index labels, so use
            # .iloc; the index of `data` has gaps after drop_duplicates and a
            # label lookup would hit the wrong row or raise KeyError.
            rows.append({'cluster group': cluster,
                         'name': sentences.iloc[position]})
    dat = pd.DataFrame(rows, columns=cols)
dat

## Elbow method to select number of clusters

The elbow method is used to estimate the optimal number of clusters k. Usually, if k increases, the within-cluster SSE (“distortion”) will decrease. This is because the samples will be closer to the centroids they are assigned to. The idea is to identify the value of k beyond which the distortion stops decreasing rapidly and starts to level off — the “elbow” of the curve.

Read more: https://towardsdatascience.com/k-means-clustering-with-scikit-learn-6b47a369a83c

In [None]:
# pyplot is used below but was never imported anywhere in the notebook.
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Within-cluster sum of squared errors (KMeans.inertia_) for each k tried.
wcsse = []
sentences = data['name']
tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                   stop_words=stopwords.words('english'),
                                   max_df=0.9,
                                   min_df=0.1,
                                   lowercase=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# Fit one model per candidate k; random_state=0 keeps the curve reproducible.
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10,
                    random_state=0)
    kmeans.fit(tfidf_matrix)
    wcsse.append(kmeans.inertia_)

plt.plot(range(1, 11), wcsse)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSSE')
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('elbow.png')
plt.show()