[Prerequisite](https://github.com/SatadruMukherjee/Data-Preprocessing-Models/blob/main/String_similarity_using_Fuzzy.ipynb)

In [0]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Obtaining dependency information for fuzzywuzzy from https://files.pythonhosted.org/packages/43/ff/74f23998ad2f93b945c0309f825be92e04e0348e062026998b5eefef4c33/fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# Restart the Python process so the package installed above can be imported
# (the pip output recommends this step before using updated packages).
dbutils.library.restartPython()

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import FloatType
from fuzzywuzzy import fuzz



In [0]:
# Synthetic customer records covering the matching scenarios exercised below
data = [
    # Case 1: one person, with spelling variations in name and address
    (1, "Cody Johnson", 80, "8 Jefrey Brac", "St. Lisatown", "2636", "South Australia"),
    (2, "Cody Jonson", 80, "8 Jeffrey Brace", "St. Lisatown", "2636", "South Australia"),
    (3, "Kody Johnson", 80, "8 Jeffrey Brace", "St. Lisatown", "2636", "South Australia"),

    # Case 2: one person, with slightly different addresses
    (4, "Angela Watson", 59, "3/752 Bernard Follow", "Janicebrg", "2995", "Australian Capital Territory"),
    (5, "Angela Watson", 59, "752 Bernard Follow", "Janiceberg", "2995", "Australian Capital Territory"),

    # Case 3: two different people living at the same address
    (6, "Michael Hunt", 69, "8 Santana Rest", "St. Jessicamouth", "2964", "Queensland"),
    (7, "Sarah Hunt", 69, "8 Santana Rest", "St. Jessicamouth", "2964", "Queensland"),

    # Case 4: edge case - two records that match exactly
    (8, "Liam Smith", 35, "21 Wallaby Way", "Sydney", "2000", "New South Wales"),
    (9, "Liam Smith", 35, "21 Wallaby Way", "Sydney", "2000", "New South Wales"),

    # Case 5: two different people who share a name
    (10, "Chris Brown", 50, "14 Beach Rd", "Perth", "6000", "Western Australia"),
    (11, "Chris Brown", 27, "45 Park St", "Perth", "6000", "Western Australia"),
]

# Column names, listed in the same order as the tuple fields above
columns = ["id", "name", "age", "address_line1", "city", "postcode", "state"]

df = spark.createDataFrame(data=data, schema=columns)

display(df)

id,name,age,address_line1,city,postcode,state
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia
2,Cody Jonson,80,8 Jeffrey Brace,St. Lisatown,2636,South Australia
3,Kody Johnson,80,8 Jeffrey Brace,St. Lisatown,2636,South Australia
4,Angela Watson,59,3/752 Bernard Follow,Janicebrg,2995,Australian Capital Territory
5,Angela Watson,59,752 Bernard Follow,Janiceberg,2995,Australian Capital Territory
6,Michael Hunt,69,8 Santana Rest,St. Jessicamouth,2964,Queensland
7,Sarah Hunt,69,8 Santana Rest,St. Jessicamouth,2964,Queensland
8,Liam Smith,35,21 Wallaby Way,Sydney,2000,New South Wales
9,Liam Smith,35,21 Wallaby Way,Sydney,2000,New South Wales
10,Chris Brown,50,14 Beach Rd,Perth,6000,Western Australia


### Compare every customer with every other customer

In [0]:
# Pair every record with every *other* record. The inequality only removes
# self-pairs: each unordered pair still appears twice, as (a, b) and (b, a).
# Both orientations are kept on purpose - the later grouping step takes the
# per-id minimum group_id, which needs to see each pair from both sides.
dfA = df.alias("a")
dfB = df.alias("b")
not_same_record = col("a.id") != col("b.id")
pairs = dfA.join(dfB, on=not_same_record)

display(pairs.orderBy("a.id", "b.id"))

id,name,age,address_line1,city,postcode,state,id.1,name.1,age.1,address_line1.1,city.1,postcode.1,state.1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,2,Cody Jonson,80,8 Jeffrey Brace,St. Lisatown,2636,South Australia
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,3,Kody Johnson,80,8 Jeffrey Brace,St. Lisatown,2636,South Australia
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,4,Angela Watson,59,3/752 Bernard Follow,Janicebrg,2995,Australian Capital Territory
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,5,Angela Watson,59,752 Bernard Follow,Janiceberg,2995,Australian Capital Territory
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,6,Michael Hunt,69,8 Santana Rest,St. Jessicamouth,2964,Queensland
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,7,Sarah Hunt,69,8 Santana Rest,St. Jessicamouth,2964,Queensland
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,8,Liam Smith,35,21 Wallaby Way,Sydney,2000,New South Wales
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,9,Liam Smith,35,21 Wallaby Way,Sydney,2000,New South Wales
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,10,Chris Brown,50,14 Beach Rd,Perth,6000,Western Australia
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,11,Chris Brown,27,45 Park St,Perth,6000,Western Australia


### Weighted fuzzy matching

We have more confidence in some columns of our data than others, and therefore want to prioritize their similarity results.

By multiplying each column's fuzzy-match score by a weight and summing the results, we obtain a single weighted similarity score for each pair of records.

In [0]:
# Weighted fuzzy score UDF
def weighted_score(row1, row2, weights=(0.3, 0.5, 0.1, 0.1)):
  """Return the weighted sum of per-field fuzzy partial-ratio scores.

  Fields are compared position by position with ``fuzz.partial_ratio`` and
  each score is multiplied by the weight at the same position.

  Args:
    row1: Sequence of field values for the first record.
    row2: Sequence of field values for the second record, in the same order.
    weights: One weight per compared field. The defaults correspond to the
      (name, address_line1, city, postcode) order in which this function is
      applied to the customer pairs, and sum to 1.

  Returns:
    The weighted score as a float.

  Raises:
    IndexError: If either row has fewer fields than there are weights.
  """
  # Fields beyond len(weights) are ignored; only weighted positions are scored.
  return sum(
    fuzz.partial_ratio(row1[i], row2[i]) * weight
    for i, weight in enumerate(weights)
  )

# Register the scorer as a Spark UDF named "fuzzy_udf" with a float result;
# the returned handle is what the DataFrame code calls.
fuzzy_udf = spark.udf.register(
    "fuzzy_udf",
    weighted_score,
    FloatType(),
)

In [0]:
# Score each candidate pair: bundle the compared fields of both sides into
# arrays (same field order on each side) and pass them to the UDF.
left_fields = array("a.name", "a.address_line1", "a.city", "a.postcode")
right_fields = array("b.name", "b.address_line1", "b.city", "b.postcode")

pairs = (
    pairs
    .withColumn("similarity", fuzzy_udf(left_fields, right_fields))
    .orderBy("a.id", "b.id")
)

display(pairs)

id,name,age,address_line1,city,postcode,state,id.1,name.1,age.1,address_line1.1,city.1,postcode.1,state.1,similarity
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,2,Cody Jonson,80,8 Jeffrey Brace,St. Lisatown,2636,South Australia,93.3
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,3,Kody Johnson,80,8 Jeffrey Brace,St. Lisatown,2636,South Australia,93.6
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,4,Angela Watson,59,3/752 Bernard Follow,Janicebrg,2995,Australian Capital Territory,31.2
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,5,Angela Watson,59,752 Bernard Follow,Janiceberg,2995,Australian Capital Territory,31.0
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,6,Michael Hunt,69,8 Santana Rest,St. Jessicamouth,2964,Queensland,27.2
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,7,Sarah Hunt,69,8 Santana Rest,St. Jessicamouth,2964,Queensland,27.8
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,8,Liam Smith,35,21 Wallaby Way,Sydney,2000,New South Wales,26.5
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,9,Liam Smith,35,21 Wallaby Way,Sydney,2000,New South Wales,26.5
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,10,Chris Brown,50,14 Beach Rd,Perth,6000,Western Australia,40.6
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,11,Chris Brown,27,45 Park St,Perth,6000,Western Australia,33.6


In [0]:
from pyspark.sql import Window

# Pairs scoring strictly above this similarity are treated as a match
similarity_threshold = 80

# A matching pair is labelled with the smaller of its two ids; a non-matching
# pair keeps the left-hand record's own id.
is_match = col("similarity") > similarity_threshold
group_id_expr = when(is_match, least(col("a.id"), col("b.id"))).otherwise(col("a.id"))

pairs = pairs.withColumn("group_id", group_id_expr)
display(pairs)

id,name,age,address_line1,city,postcode,state,id.1,name.1,age.1,address_line1.1,city.1,postcode.1,state.1,similarity,group_id
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,2,Cody Jonson,80,8 Jeffrey Brace,St. Lisatown,2636,South Australia,93.3,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,3,Kody Johnson,80,8 Jeffrey Brace,St. Lisatown,2636,South Australia,93.6,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,4,Angela Watson,59,3/752 Bernard Follow,Janicebrg,2995,Australian Capital Territory,31.2,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,5,Angela Watson,59,752 Bernard Follow,Janiceberg,2995,Australian Capital Territory,31.0,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,6,Michael Hunt,69,8 Santana Rest,St. Jessicamouth,2964,Queensland,27.2,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,7,Sarah Hunt,69,8 Santana Rest,St. Jessicamouth,2964,Queensland,27.8,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,8,Liam Smith,35,21 Wallaby Way,Sydney,2000,New South Wales,26.5,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,9,Liam Smith,35,21 Wallaby Way,Sydney,2000,New South Wales,26.5,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,10,Chris Brown,50,14 Beach Rd,Perth,6000,Western Australia,40.6,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,11,Chris Brown,27,45 Park St,Perth,6000,Western Australia,33.6,1


In [0]:
# Keep only the left-hand ("a") record's fields plus the candidate group_id
left_record_cols = [
    "a.id",
    "a.name",
    "a.age",
    "a.address_line1",
    "a.city",
    "a.postcode",
    "a.state",
]
pairs = pairs.select(*left_record_cols, "group_id").orderBy("a.id")

display(pairs)

id,name,age,address_line1,city,postcode,state,group_id
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,1
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,1


In [0]:
# Collapse to one row per record id, keeping the row with the smallest
# candidate group_id.
# NOTE(review): the result is the smallest id among a record's *direct*
# matches, so chains (A~B and B~C but not A~C) are not merged into one group.
window = Window.partitionBy("id").orderBy(col("group_id"))

ranked = pairs.withColumn("rank", row_number().over(window))
pairs = (
    ranked
    .filter(col("rank") == 1)
    .drop("rank")
    .orderBy("group_id")
)
display(pairs)

id,name,age,address_line1,city,postcode,state,group_id
1,Cody Johnson,80,8 Jefrey Brac,St. Lisatown,2636,South Australia,1
2,Cody Jonson,80,8 Jeffrey Brace,St. Lisatown,2636,South Australia,1
3,Kody Johnson,80,8 Jeffrey Brace,St. Lisatown,2636,South Australia,1
4,Angela Watson,59,3/752 Bernard Follow,Janicebrg,2995,Australian Capital Territory,4
5,Angela Watson,59,752 Bernard Follow,Janiceberg,2995,Australian Capital Territory,4
6,Michael Hunt,69,8 Santana Rest,St. Jessicamouth,2964,Queensland,6
7,Sarah Hunt,69,8 Santana Rest,St. Jessicamouth,2964,Queensland,6
8,Liam Smith,35,21 Wallaby Way,Sydney,2000,New South Wales,8
9,Liam Smith,35,21 Wallaby Way,Sydney,2000,New South Wales,8
10,Chris Brown,50,14 Beach Rd,Perth,6000,Western Australia,10
