In [1]:
import itertools
import random
import string
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import heapq
import re

# Тестовая выборка

Создание тестовой выборки значений

In [2]:
def generateTestData(num_samples: int = 1000):
    """Generate a sorted list of unique random test identifiers.

    Runs ``num_samples // 2`` iterations. Each iteration produces one
    8-digit string and one licence-plate-like string (letter, three digits,
    two letters, all letters drawn from a fixed 12-letter alphabet). While
    ``iteration * 10 < num_samples`` it additionally produces one 4-digit
    string.

    Args:
        num_samples: Size parameter controlling the number of iterations.

    Returns:
        The generated strings with duplicates removed, in ascending order.
    """
    plate_letters = 'TYOPAHKXCBME'
    digits = string.digits
    unique_values = set()

    for step in range(num_samples // 2):
        # 8-digit identifier
        unique_values.add(''.join(random.choices(digits, k=8)))

        # Licence-plate-like identifier: 1 letter + 3 digits + 2 letters
        head = random.choices(plate_letters, k=1)
        middle = random.choices(digits, k=3)
        tail = random.choices(plate_letters, k=2)
        unique_values.add(''.join(head + middle + tail))

        # 4-digit identifier, only during the early iterations
        if step * 10 < num_samples:
            unique_values.add(''.join(random.choices(digits, k=4)))

    ordered = list(unique_values)
    ordered.sort()
    return ordered



if __name__ == "__main__":

    # Build the test set, requesting 400,000 values
    basicData = generateTestData(400000)

    # Print 10 randomly chosen elements of the test set, one per line,
    # followed by the size of the set
    print(*(random.choice(basicData) for _ in range(10)), sep="\n")
    print(f"объем {len(basicData)}")

37175864
57999541
M697CX
X326TT
98341779
H753MY
E170AM
44589831
A248EH
T559OO
объем 398587


# Сам алгоритм

In [3]:
def import_csv(path):
  """Read a CSV file and use its ``Unnamed: 0`` column as the row index.

  The ``Unnamed: 0`` column is removed from the data columns and becomes
  the index, which is named ``index``. A ``KeyError`` is raised when the
  file has no ``Unnamed: 0`` column.
  """
  raw = pd.read_csv(path)
  # Taking the column first keeps the KeyError for files without it.
  labels = raw['Unnamed: 0'].rename('index')
  return raw.drop(columns=['Unnamed: 0']).set_index(labels)

In [5]:
# Load the table that fuzzy_finding looks characters up in: it reads the first
# three columns as float scores and the remaining columns as substitute characters.
df = import_csv('similarity_csv.csv')

In [14]:
def _combo_score(scores, combo):
    """Return the summed score of ``combo`` (``combo[i]`` is the column picked in row ``i``)."""
    picked = scores[np.arange(len(combo)), np.asarray(combo, dtype=np.intp)]
    return float(picked.sum())


def _compile_pattern(pattern, replacements):
    """Compile the regex for one substitution of the pattern's literal characters.

    ``*`` becomes ``.+`` and ``?`` becomes ``.``; every other position consumes
    the next value of ``replacements``, escaped so it is matched literally.
    """
    pending = iter(replacements)
    pieces = []
    for char in pattern:
        if char == '*':
            pieces.append('.+')
        elif char == '?':
            pieces.append('.')
        else:
            pieces.append(re.escape(next(pending)))
    return re.compile("".join(pieces))


def fuzzy_finding(input_string, df, k=20, time_limit=None, data=None):
    """Search ``data`` for strings matching a wildcard pattern, allowing look-alike characters.

    Every non-wildcard character of ``input_string`` is looked up in ``df``. For
    each such character the first three columns of its row are read as scores and
    the remaining columns as the corresponding substitute characters. Combinations
    of substitutes are explored best-first (highest summed score first); each
    combination is turned into a regular expression and every element of ``data``
    is tested against it. Matches are printed as they are found.

    NOTE: matching uses ``re.search``, so the pattern may match anywhere inside a
    candidate, and ``*`` stands for one or more characters (``.+``).

    Args:
        input_string: Pattern; ``*`` and ``?`` are wildcards, all other
            characters must be present in the index of ``df``.
        df: Similarity table indexed by character (see above).
        k: Maximum number of matches to collect; ``None`` means no limit.
        time_limit: Optional limit in seconds. It is checked once per explored
            combination, i.e. after a full pass over ``data``.
        data: Iterable of candidate strings. Defaults to the notebook-level
            ``basicData`` so existing calls keep working.

    Returns:
        The list of matches in the order they were found (and printed).

    Raises:
        KeyError: If a non-wildcard character is missing from ``df``.
    """
    if data is None:
        # Backward-compatible default: the test set built earlier in the notebook.
        data = basicData
    limit = np.inf if k is None else k

    pattern = list(input_string)
    literals = [char for char in pattern if char not in ('*', '?')]

    rows = df.loc[literals, :]
    scores = np.array(rows.iloc[0:len(literals), 0:3], dtype=float)
    substitutes = np.array(rows.iloc[0:len(literals), 3:], dtype=object)

    n = scores.shape[0]          # number of literal characters
    n_choices = scores.shape[1]  # substitutes available per character
    row_ids = np.arange(n)

    # Max-heap emulated with negated scores; ties are broken by the combo tuple.
    start = (0,) * n
    heap = [(-_combo_score(scores, start), start)]
    queued = {start}             # combinations already pushed, to avoid duplicates

    matches = []
    seen = set()                 # O(1) duplicate check for already reported matches

    started = time.time()

    # The search ends when the heap is exhausted; reaching one combination
    # that has no successors must not stop it, since other queued
    # combinations may still be unexplored.
    while heap and len(matches) < limit:
        _, combo = heapq.heappop(heap)
        picks = substitutes[row_ids, np.asarray(combo, dtype=np.intp)]
        regex = _compile_pattern(pattern, picks)

        for candidate in data:
            if candidate in seen or not regex.search(candidate):
                continue
            print(candidate)
            seen.add(candidate)
            matches.append(candidate)
            if len(matches) >= limit:
                break

        # Generate neighbours: advance one position to its next substitute,
        # as long as that substitute exists and has a non-zero score.
        for pos in range(n):
            nxt = combo[pos] + 1
            if nxt >= n_choices or scores[pos, nxt] == 0:
                continue
            neighbour = combo[:pos] + (nxt,) + combo[pos + 1:]
            if neighbour not in queued:
                queued.add(neighbour)
                heapq.heappush(heap, (-_combo_score(scores, neighbour), neighbour))

        if time_limit is not None and time.time() - started >= time_limit:
            break

    elapsed = time.time() - started
    if not matches:
        print("\nНичего не нашлось!")
    print(f"\nвремя работы: {round(elapsed, 4)} с")
    return matches

In [17]:
try:
  fuzzy_finding(input(), df, k=20, time_limit=2)
except Exception as err:
  # Catch only ordinary errors (e.g. a pattern character that is missing from
  # the similarity table) so that interrupting the kernel still stops the cell,
  # and report the cause instead of hiding it.
  print("Плохо!")
  print(f"{type(err).__name__}: {err}")

O*53*O
O053PO
O053TO
O153KO
O253EO
O253XO
O353HO
O453PO
O453YO
O553AO
O553HO
O553KO
O653CO
O653EO
O653HO
O753TO
O753YO
O853KO
O953KO
O953OO
00153606

время работы: 1.0969 с
