In [696]:
import csv
import pandas as pd
import pycountry
import ftfy
import re

# Path of the re-serialised copy of the raw Google dataset produced by the loop below.
cleaned_filename = "cleaned_google_dataset.csv"
# Lookup tables used by the helper functions; every column is read as a string (dtype=str).
countries=pd.read_csv("countryCodes.csv",engine="python",quotechar='"',dtype=str)
# NOTE(review): the escape character is an unusual symbol, presumably chosen so that
# backslashes inside the regex patterns are read literally -- confirm against the CSV.
zipPatterns = pd.read_csv("regexZipCodesForCountries.csv", engine="python", dtype=str, on_bad_lines="warn", quotechar='"', escapechar='௸')
# Read the raw dataset with the csv module and write every parsed row back out,
# so the later pandas read works on a file produced by csv.writer.
with open("google_dataset.csv", "r", encoding="utf-8") as infile, open(cleaned_filename, "w", newline="", encoding="utf-8") as outfile:
    # skipinitialspace=True drops whitespace that directly follows a delimiter.
    reader = csv.reader(infile, quotechar='"', delimiter=',', skipinitialspace=True)
    writer = csv.writer(outfile, quotechar='"', delimiter=',')
    for row in reader:
        try:
            # Attempt to write row to the clean CSV file
            writer.writerow(row)
        except Exception as e:
            # Report the row that could not be written and continue with the next one.
            # NOTE(review): only writer.writerow() is guarded here; an error raised while
            # the reader parses a line surfaces in the `for` statement, outside this try.
            print(f"Skipping bad row: {row}, due to error: {e}")



In [697]:
def transformArrayintoString(array):
    """Join the string elements of ``array`` into one comma-separated string.

    Returns an empty string when ``array`` has no elements.
    """
    return ",".join(array)

def is_region_name(regionName, countryCode):
    """Tell whether ``regionName`` names a subdivision of the country ``countryCode``.

    The comparison is case-insensitive on the subdivision name.

    Args:
        regionName: Candidate subdivision name.
        countryCode: Country code passed to ``pycountry.subdivisions.get``.

    Returns:
        True when a subdivision with that name exists for the country, otherwise
        False -- including when pycountry returns no subdivision list for the
        country (previously this case fell through and returned None).
    """
    subdivisions = pycountry.subdivisions.get(country_code=countryCode)
    if subdivisions is None:
        return False
    wanted = regionName.lower()  # lower-case once instead of on every iteration
    return any(wanted == subdivision.name.lower() for subdivision in subdivisions)
    
def is_region_code(region_code):
    """Return True when some pycountry subdivision has a code equal to ``region_code`` upper-cased."""
    wanted = region_code.upper()
    return any(entry.code == wanted for entry in pycountry.subdivisions)

def country_in_array(array):
    """Return the first element of ``array`` accepted by ``is_country_name``, or False if none is."""
    return next((item for item in array if is_country_name(item)), False)

def countryCode_in_array(array):
    """Return the first element of ``array`` accepted by ``is_country_code``, or False if none is."""
    return next((item for item in array if is_country_code(item)), False)

def get_region_code(region_name, country_code):
    """Look up the code of the subdivision called ``region_name`` in ``country_code``.

    The name comparison is case-insensitive and the country code is upper-cased
    before the pycountry lookup. The value returned is the part of the
    subdivision code that follows the first '-'. Returns None when pycountry has
    no subdivision list for the country or no subdivision name matches.
    """
    wanted = region_name.lower()
    candidates = pycountry.subdivisions.get(country_code=country_code.upper())
    if candidates is None:
        return None
    return next(
        (entry.code.split('-')[1] for entry in candidates if entry.name.lower() == wanted),
        None,
    )

def is_zip_code(zipCode, countryCode):
    """Check ``zipCode`` against the zip-code regex stored for ``countryCode``.

    Both arguments are upper-cased first. The first ``Pattern`` of the
    ``zipPatterns`` rows whose ``Code`` equals the country code is applied with
    ``re.match``. Returns False when no row exists for the country.
    """
    candidate = zipCode.upper()
    wanted_country = countryCode.upper()
    patterns = zipPatterns.loc[zipPatterns['Code'] == wanted_country, 'Pattern']
    if patterns.empty:
        return False
    return re.match(patterns.iloc[0], candidate) is not None

def is_country_code(countryCode):
    """Return True when ``countryCode`` (upper-cased) appears in the ``Code`` column of ``zipPatterns``."""
    wanted = countryCode.upper()
    return bool((zipPatterns['Code'] == wanted).any())

def is_country_name(countryName):
    """Return whether ``countryName``, lower-cased, is one of the values of ``countries["Name"]``."""
    needle = countryName.lower()
    known_names = countries["Name"].values
    return needle in known_names

def phoneNumber(phoneNumberCandidate):
    """Normalise a phone-number candidate to ``"+<digits>"`` or return ``"WRONG"``.

    Dashes, spaces and parentheses are removed first. Two shapes are accepted:

    * the candidate contains both "(" and ")" and the remainder is exactly
      10 digits -> "+" followed by those digits;
    * the candidate does not contain both parentheses and, once every "+" is
      also removed, exactly 11 digits remain -> "+" followed by those digits.

    A candidate that contains both parentheses is only ever checked against
    the 10-digit form. Anything else yields the string "WRONG".
    """
    stripped = phoneNumberCandidate
    for symbol in ('-', ' ', '(', ')'):
        stripped = stripped.replace(symbol, '')

    has_parentheses = '(' in phoneNumberCandidate and ')' in phoneNumberCandidate
    if has_parentheses:
        if re.fullmatch(r"\d{10}", stripped):
            return "+" + stripped
        return "WRONG"

    digits = stripped.replace('+', '')
    if re.fullmatch(r"\d{11}", digits):
        return "+" + digits
    return "WRONG"

def experience_detail_regex(experienceDetailCandidate):
    """Search for a "<N>+ years in <word> " fragment.

    Returns the ``re.Match`` (group 1 holds the number of years) or None when
    the fragment is not present.
    """
    return re.search(r'(\d+)\+ years in \w+ ', experienceDetailCandidate)

def clean_address(address):
    """Remove everything from the first "·" to the end of ``address``.

    Whitespace directly in front of the "·" is removed as well, and the result
    is stripped of leading/trailing whitespace. A string without "·" is only
    stripped.

    The pattern is deliberately not anchored with ``^``: with the anchor the
    substitution only fired when the string *started* with "·", so an input
    such as "city ok · in woodlands" was returned unchanged.
    """
    cleaned_address = re.sub(r'\s*·.*$', '', address)
    return cleaned_address.strip()  # Strip leading/trailing whitespace

def checkPhoneSimple(simplePhone):
    """Return True only when ``simplePhone`` is exactly 12 characters long."""
    return len(simplePhone) == 12

def is_scientific_notation(value):
    """Return True when ``value`` looks like ``<digits>[.<digits>]E[+-]<digits>``.

    Only an upper-case "E" is recognised.
    """
    found = re.match(r'^\d+\.?\d*E[+-]?\d+$', value)
    return found is not None

def format_phone_number(sci_notation):
    """Expand a number written in scientific notation into its plain integer digits.

    The text is parsed with ``float`` and truncated with ``int``. When the
    resulting digit string has fewer than 10 characters the literal message
    "Invalid phone number length" is returned instead.
    """
    digits = str(int(float(sci_notation)))
    if len(digits) < 10:
        return "Invalid phone number length"
    # Re-assembling the three slices of the original code yields the digit string unchanged.
    return digits
    
def find_reviews(text):
    """Locate a parenthesised review count such as "(4)" in ``text``.

    Returns False when the text contains "no reviews"; otherwise the result of
    the regex search (a ``re.Match`` whose group 1 is the count, or None).
    """
    return False if "no reviews" in text else re.search(r'\((\d+)\)', text)
    
def find_rating(text):
    """Return the ``re.Match`` for the first "<digit>.<digit>" occurrence in ``text``, or None."""
    rating_pattern = r'\d\.\d'
    return re.search(rating_pattern, text)
    
    




def _lookup_country_value(match_column, match_value, result_column, default):
    """Return ``result_column`` of the first ``countries`` row whose ``match_column`` equals ``match_value``, else ``default``."""
    matches = countries.loc[countries[match_column] == match_value, result_column]
    return default if matches.empty else matches.values[0]


def process_chunk(chunk, verbose=True):
    """Clean one chunk of the Google dataset and return the cleaned frame.

    Steps, in order:

    1. Missing values become "", whitespace runs in every string cell are
       collapsed to one space, the cell is stripped and passed through
       ``ftfy.fix_text``; ``address``, ``raw_address`` and ``text`` are
       lower-cased.
    2. Per row:
       * ``phone`` is expanded when written in scientific notation and blanked
         unless ``checkPhoneSimple`` accepts it;
       * a missing ``country_name`` / ``country_code`` is filled from the other
         one through the ``countries`` table;
       * comma-separated parts of ``address`` are dropped when they already
         occur in ``text``, ``raw_address``, ``phone``, ``raw_phone``,
         ``region_name``, ``country_name`` or ``city``, or when they equal the
         country/region code or name; the zip code is cut out of the rest;
       * the experience fragment, "·" and the raw phone are removed from
         ``raw_address``; ``raw_phone`` falls back to the cleaned phone when
         ``phoneNumber`` rejects it;
       * review count, rating and years of experience are extracted from
         ``text`` and removed from it.

    Columns written: ``address``, ``phone``, ``phone_country_code``,
    ``raw_address``, ``raw_phone``, ``country_name``, ``country_code``,
    ``number_reviews``, ``customer_rating``, ``years_experience``, ``text``.

    Args:
        chunk: DataFrame holding the columns read above as strings.
        verbose: When True (default) print the per-row trace lines that this
            function has always emitted; pass False to silence them.

    Returns:
        A new DataFrame; the frame passed in is left untouched.

    Note:
        ``find_rating`` must return a match object here (``.group()`` is called
        on its result).
    """
    # Non-mutating fill: the caller's frame is not modified.
    chunk = chunk.fillna('')
    # Whitespace normalisation first, then mojibake repair, on every string cell.
    chunk = chunk.map(
        lambda x: ftfy.fix_text(re.sub(r'\s+', ' ', x).strip()) if isinstance(x, str) else x
    )
    for column in ("address", "raw_address", "text"):
        chunk[column] = chunk[column].apply(str.lower)

    for index, record in chunk.iterrows():
        address = record["address"]
        phone = record["phone"]
        raw_address = record["raw_address"]
        text = record["text"]
        raw_phone = record["raw_phone"]
        country_name = record["country_name"]
        country_code = record["country_code"]
        region_name = record["region_name"]
        region_code = record["region_code"]
        zip_code = record["zip_code"]

        # --- phone -------------------------------------------------------
        if is_scientific_notation(phone):
            phone = format_phone_number(phone)
        if not checkPhoneSimple(phone):
            phone = ""

        # --- country name <-> code ---------------------------------------
        if country_name == "" and country_code != "":
            country_name = _lookup_country_value("ISO", country_code.lower(), "Name", country_name)
            chunk.at[index, "country_name"] = country_name
        if country_code == "" and country_name != "":
            country_code = _lookup_country_value("Name", country_name.lower(), "ISO", country_code)
            chunk.at[index, "country_code"] = country_code

        # --- address -----------------------------------------------------
        if verbose:
            print(index)
            print(address)

        # Values an address part may already be contained in (original row values).
        already_known = (
            record["text"],
            record["raw_address"],
            record["phone"],
            record["raw_phone"],
            region_name,
            country_name,
            record["city"],
        )
        unique = []
        for detail in address.split(","):
            detail = detail.strip()
            if any(detail in known for known in already_known):
                continue
            # str.replace is a no-op when the zip code is absent from the part.
            unique.append(detail.replace(zip_code, ''))

        kept = []
        for detail in unique:
            if verbose:
                # Values are pulled into locals so the f-string does not nest
                # double quotes (a SyntaxError before Python 3.12).
                print(f"este {detail} la fel cu {country_code} sau {region_code}?")
                print("\n")
            detail = detail.strip()
            if detail in (country_code, region_code, country_name, region_name):
                if verbose:
                    print("DAAAA")
            else:
                kept.append(detail)
        address = ",".join(kept)
        if verbose:
            print(address)

        chunk.at[index, "address"] = address
        chunk.at[index, "phone"] = phone
        chunk.at[index, "phone_country_code"] = _lookup_country_value("ISO", country_code, "Code", "")

        # --- raw address / raw phone -------------------------------------
        raw_address = re.sub(r'(\d+)\+ years in \w+ · ', '', raw_address)
        raw_address = raw_address.replace('·', '')
        raw_address = raw_address.replace(raw_phone, '')
        if phoneNumber(raw_phone) == "WRONG" and phone != "":
            raw_phone = phone
        chunk.at[index, "raw_address"] = raw_address
        chunk.at[index, "raw_phone"] = raw_phone

        # --- text: extract, then strip the extracted fragments -----------
        # All three searches run on the text before anything is removed from it.
        reviews_match = find_reviews(text)
        rating_match = find_rating(text)
        experience_match = experience_detail_regex(text)
        if raw_phone in text:
            text = text.replace(raw_phone, '')
        if reviews_match:
            text = text.replace(reviews_match.group(), '')
        if rating_match:
            text = text.replace(rating_match.group(), '')
        if experience_match:
            text = text.replace(experience_match.group(), '')
        if "no review" in text:
            text = text.replace("no review", '')
        # Removing "no review" from "no reviews · ..." leaves a leading "s ·".
        if text.startswith("s ·"):
            text = text[2:]
        text = text.replace("\\", '')
        text = text.replace('"', '')
        text = text.strip()

        chunk.at[index, "number_reviews"] = reviews_match.group(1) if reviews_match else "No reviews"
        chunk.at[index, "customer_rating"] = rating_match.group() if rating_match else "No reviews"
        chunk.at[index, "years_experience"] = experience_match.group(1) if experience_match else "No experience"
        chunk.at[index, "text"] = text

    return chunk


# The Google dataset is harder to clean than the Facebook one; I don't know how to reliably retrieve structured data from the address field.

    


In [698]:

# Clean the re-serialised dataset in chunks of 100 rows; malformed lines are skipped by the parser.
chunk_iter = pd.read_csv("cleaned_google_dataset.csv", dtype=str, chunksize=100, quotechar='"', on_bad_lines="skip", engine="python")

# Collect the processed chunks and concatenate once at the end: calling
# pd.concat inside the loop re-copies everything accumulated so far on each
# iteration.
processed_chunks = []
for chunk in chunk_iter:
    processed_chunks.append(process_chunk(chunk))

# pd.concat raises on an empty list, so keep the empty-frame result for an empty input file.
df_all = pd.concat(processed_chunks, ignore_index=True) if processed_chunks else pd.DataFrame()

df_all.to_csv("google.csv",index=False,encoding="utf-8-sig")
print(df_all.columns)


0
28 central coast hwy, west gosford nsw 2250, australia
este 28 central coast hwy la fel cu au sau nsw?


este west gosford nsw  la fel cu au sau nsw?


28 central coast hwy,west gosford nsw
1
400 scott st, st. catharines, on l2m 3w2, canada
este on  la fel cu ca sau on?


DAAAA

2
191 pleasant st, yarmouth, ns b5a 2j9, canada
este 191 pleasant st la fel cu ca sau ns?


este ns  la fel cu ca sau ns?


DAAAA
191 pleasant st
3
11040 santa monica blvd suite 370, los angeles, ca 90025, united states
este ca  la fel cu us sau ca?


DAAAA

4


5
ferndale, mi, united states

6
321 kent st, sydney nsw 2000, australia
este 321 kent st la fel cu au sau nsw?


este sydney nsw  la fel cu au sau nsw?


321 kent st,sydney nsw
7
55 s cleveland ave, westerville, oh 43081
este 55 s cleveland ave la fel cu us sau oh?


este oh  la fel cu us sau oh?


DAAAA
55 s cleveland ave
8
4050 s torrey pines dr, las vegas, nv 89103, united states
este 4050 s torrey pines dr la fel cu us sau nv?


este nv  la fel c

In [699]:
# import pandas as pd

# # Read the CSV file
# df = pd.read_csv("google.csv", on_bad_lines="warn", dtype=str, engine="python", quotechar='"')
# specialColumns = ["country_name", "country_code", "region_name", "region_code", "zip_code", "city"]

# # Fill missing values with empty strings
# df.fillna("", inplace=True)

# # Iterate over each row
# for index, record in df.iterrows():
#     adressDetails = record["address"].split(',')
#     newAddressDetails=[]
#     for detail in adressDetails:
#         detail=detail.strip()
#         newDetail=""
#         elementsInAdressDetail=detail.split()
#         for elem in elementsInAdressDetail:
#             unique=True
#             for specialColumn in specialColumns:
#                 if elem==record[specialColumn]:
#                     unique=False
#                     break
#             if(unique==True):
#                 newDetail=newDetail+elem
#                 newDetail=newDetail+" "
#         newAddressDetails.append(newDetail[:-1])
#     df.at[index,"address"]=transformArrayintoString(newAddressDetails)
# def groups_of_two(adresa):
    

# for index,record in df.iterrows():
#     adressDetails = record["address"].split(',')
#     newAdress=""
#     for detail in adressDetails:
#         detail=detail.strip()
#         unique=True
#         for specialColumn in specialColumns:
#             if record[specialColumn]==detail:
#                 unique=False
#                 break
#         if unique==True:
#             newAdress=newAdress+detail
#             newAdress=newAdress+","
#     df.at[index,"address"]=newAdress[:-1]



# df.to_csv("google1.csv",encoding="utf-8-sig",index=False)


In [700]:
import re

def clean_address(address):
    """Drop the "·" separator and everything after it, then trim the result.

    Whitespace immediately before the "·" is removed together with it.
    """
    suffix_pattern = re.compile(r'\s*·.*$')
    without_suffix = suffix_pattern.sub('', address)
    return without_suffix.strip()

# Sample inputs: each one is "city ok" followed by a "·"-separated suffix.
addresses = [
    "city ok · in woodlands office park",
    "city ok · in woodlands",
    "city ok · in woodjnijsa hasdhuas dahshi"
]

# Print the cleaned form of every sample; the recorded output shows "city ok" for each.
for addr in addresses:
    print(clean_address(addr))


city ok
city ok
city ok


In [701]:
import re

def find_rating(text):
    """Return the decimal number at the very start of ``text`` as a string, or None.

    NOTE: this redefinition returns a string, whereas the earlier
    ``find_rating`` in this notebook returns a match object and
    ``process_chunk`` calls ``.group()`` on the result.
    """
    leading_number = re.match(r'\d+\.\d+', text)
    if leading_number is None:
        return None
    return leading_number.group()

# Sample listing text that begins with a rating followed by a parenthesised review count.
text = "2.8 (4) · government office clinton, ut closed ⋅ opens 8am mon · (801) 614-0800"

# Extract the leading rating from the sample.
rating = find_rating(text)

# The recorded output for this sample is "2.8".
print(rating)


2.8


Arkona, ON, Canada Â· +1 519-828-3071
