In [27]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
# Input data
df = pd.read_csv('Aliasing_Master.csv')

In [29]:
df.tail()

Unnamed: 0,PID,SPI,PDS
129,JE-CRC301-TT-130064A,C301FE 060009,MH-CRC301-M160001
130,JE-CRC301-TT-200068B,C301FT 060009,ES-CRC301-004-8FB
131,JE-CRC301-PSV-060040,C301FIC 060009,JWCRC301BC00453E1
132,JE-CRC301-PT-150086,C301FV 060009,JWCRC301CE0001FFD
133,JE-CRC301-PV-175010,C301FE 060010,JB-CRC301-AH3008-E


In [30]:
# Set cutoff score for similarity
cutoff_score = 0.8

# Convert data to lowercase
df['PID'] = df['PID'].str.lower()
df['SPI'] = df['SPI'].str.lower()

# Create tokenizer and fit on feature data
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(str(df['PID']) + str(df['SPI']))

# Convert feature data to sequences
sequences_PID = tokenizer.texts_to_sequences(str(df['PID']))
sequences_SPI = tokenizer.texts_to_sequences(str(df['SPI']))

# Pad sequences to have equal length
max_length = max(len(seq) for seq in sequences_PID + sequences_SPI)
padded_sequences_PID = pad_sequences(sequences_PID, maxlen=max_length)
padded_sequences_SPI = pad_sequences(sequences_SPI, maxlen=max_length)

# Create character-level embeddings
embedding_dim = 100  # Adjust the dimension as needed
num_chars = len(tokenizer.word_index) + 1
embeddings = np.zeros((num_chars + 1, embedding_dim))
for char, char_index in tokenizer.word_index.items():
    embeddings[char_index] = np.random.uniform(-1, 1, embedding_dim)

# Reshape arrays for similarity calculation
reshaped_PID = embeddings[padded_sequences_PID.flatten()].reshape(len(df), -1)
reshaped_SPI = embeddings[padded_sequences_SPI.flatten()].reshape(len(df), -1)

# Calculate similarity using cosine similarity
similarity_matrix = cosine_similarity(reshaped_PID, reshaped_SPI)

# Find the best match and all matches for each observation
best_matches = []
all_matches = []
for i in range(len(df)):
    similarity_scores = similarity_matrix[i]
    best_match_index = np.argmax(similarity_scores)
    best_match_score = np.max(similarity_scores)
    best_match = df['SPI'][best_match_index]
    all_matches_indices = np.where(similarity_scores >= cutoff_score)[0]
    all_matches_text = [df['SPI'][index] for index in all_matches_indices]
    best_matches.append(best_match)
    all_matches.append(all_matches_text)

# Create new features in the dataframe
df['BestMatch'] = best



ValueError: cannot reshape array of size 34500 into shape (134,newaxis)