In [None]:
#Calculates the similarity scores for each pair of names present in a lot based on two algorithms 'Levenshtein distance'
#and 'partial ratio' algorithm
!pip install rapidfuzz
import pandas as pd
from rapidfuzz import fuzz

# Load the excel file into a DataFrame
df = pd.read_excel('991_names_980_plus156_minus_145.xlsx')

# Assuming your CSV file has one column named 'Names'
titles = df['Names']

# Initialize an empty list to store the similarity scores
similarity_scores1 = []
similarity_scores2 = []
# Calculate the similarity scores for each title with all other titles
for title1 in titles:
    scores1 = [fuzz.ratio(title1, title2) for title2 in titles]
    similarity_scores1.append(scores1)
    scores2 = [fuzz.partial_ratio(title1, title2) for title2 in titles]
    similarity_scores2.append(scores2)


# Create a DataFrame with the similarity scores
similarity_df1 = pd.DataFrame(similarity_scores1, columns=titles)
similarity_df2 = pd.DataFrame(similarity_scores2, columns=titles)

# Set the index of the DataFrame to match the titles
similarity_df1.index = titles
similarity_df2.index = titles


# Save the DataFrame to a CSV file
similarity_df1.to_excel('similarity_matrix_991_LV.xlsx')
similarity_df2.to_excel('similarity_matrix_991_PR.xlsx')

print("Done")


In [None]:
Explanation of the above code:

This code performs the following tasks:

1. Imports necessary libraries:
   - `pandas` for data manipulation using DataFrames.
   - `fuzz` from `rapidfuzz` for calculating string similarity scores.

2. Loads data from an Excel file ('991_names_980_plus156_minus_145.xlsx') into a pandas DataFrame (`df`).

3. Extracts a column named 'Names' from the DataFrame and assigns it to the variable `titles`.

4. Initializes two empty lists, `similarity_scores1` and `similarity_scores2`, to store the similarity scores between pairs of names (one list per scoring function).

5. Computes the similarity scores for each pair of names using the `fuzz.ratio` and `fuzz.partial_ratio` functions from the `rapidfuzz` library. The similarity scores are calculated for each name (`title1`) against every name (`title2`) in the dataset, including itself.

6. Appends the list of similarity scores for each name to `similarity_scores1` (the `fuzz.ratio` scores) and `similarity_scores2` (the `fuzz.partial_ratio` scores).

7. Creates two new DataFrames (`similarity_df1` and `similarity_df2`) from the lists of similarity scores. The columns and index of each DataFrame are set to the names (`titles`).

8. Saves the DataFrames with similarity scores to two Excel files ('similarity_matrix_991_LV.xlsx' for `fuzz.ratio` and 'similarity_matrix_991_PR.xlsx' for `fuzz.partial_ratio`).

9. Prints "Done" to indicate that the process is complete.

In summary, the code calculates string similarity scores between pairs of names in the input Excel file using two scorers: the Levenshtein-distance-based `fuzz.ratio` function and the `fuzz.partial_ratio` function. The results are then saved to two output Excel files, one per scorer.

In [None]:
#This code is used for retaining only those names with a cut-off score of greater than or equal to 80
import pandas as pd

# Minimum similarity score a name needs in order to be kept
score_cutoff = 80
# Defined once so that the save call and the final message always name the same file
output_path = '1136vs2959_greq80lessthan100_TSR.xlsx'

# Read the similarity score matrix from your file; the 'Names1' column supplies the row labels
df = pd.read_excel('similarity_matrix_1136_2959.xlsx', index_col='Names1')

# Initialize an empty list to store one dictionary per row: the row's name and its high-scoring matches
selected_names_list = []

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    # Keep the column names whose score in this row is greater than or equal to the cut-off
    high_scores_data = {name: score for name, score in row.items() if score >= score_cutoff}
    # Drop the row's own name (if present) so a name is never reported as similar to itself
    high_scores_data.pop(index, None)
    # Add to the list
    selected_names_list.append({'Names': index, 'High_Score_Data': high_scores_data})

# Create a DataFrame from the list of dictionaries
result_df = pd.DataFrame(selected_names_list)

# Save the DataFrame to an Excel file
result_df.to_excel(output_path, index=False)

# Report the file that was actually written
print(f"Selected names with scores between {score_cutoff} and 100 have been saved to {output_path}")


In [None]:
Explanation of the code above:
This code performs the following tasks:

1. Imports the necessary library, `pandas`, for data manipulation.

2. Reads a similarity score matrix from an Excel file ('similarity_matrix_1136_2959.xlsx') into a DataFrame (`df`). The 'Names1' column is set as the index.

3. Initializes an empty list (`selected_names_list`) to store dictionaries containing names and their corresponding similarity scores falling between 80 and 100.

4. Iterates through each row in the DataFrame using `df.iterrows()`.

5. For each row, it creates a dictionary (`high_scores_data`) containing names and scores between 80 and 100 using a dictionary comprehension.

6. Removes the name corresponding to the current row index from the `high_scores_data` dictionary using `high_scores_data.pop(index, None)` to avoid including the similarity score of a name with itself.

7. Appends a dictionary to the `selected_names_list` for each row, containing the original name and the filtered high similarity score data.

8. Creates a new DataFrame (`result_df`) from the list of dictionaries.

9. Saves the resulting DataFrame to an Excel file ('1136vs2959_greq80lessthan100_TSR.xlsx') without including the index.

10. Prints a message indicating that the selected names with scores between 80 and 100 have been saved to the output Excel file.

In summary, the code filters names from the input similarity score matrix that have similarity scores between 80 and 100, excluding the self-similarity score. The selected names and their high similarity scores are saved to a new Excel file.

In [None]:
#This code is used for retrieving only the names, removing the associated similarity scores
import pandas as pd
import ast  # Library for safely evaluating literals or expressions

# Load the workbook without a header row and name its column 'High_Score_Data'
df = pd.read_excel(
    '991_greq80lessthan100_LV_rough.xlsx',
    names=['High_Score_Data'],
    header=None,
)

# Function to process the 'High_Score_Data' cell of each row
def process_row(row):
    """Return the names stored as dictionary keys in a row's 'High_Score_Data' cell.

    The cell value is converted to text and parsed with ``ast.literal_eval``;
    the keys of the parsed dictionary are joined into one string.

    Parameters:
        row: Mapping-like object (for example a DataFrame row) that is indexed
            with the key 'High_Score_Data'.

    Returns:
        str: The keys separated by ', ' (an empty string for an empty
        dictionary). If the cell cannot be parsed or has no keys to read, the
        string "Error: <message>" is returned instead of raising.
    """
    try:
        # Safely evaluate the literal held in the cell (no arbitrary code is executed)
        data = ast.literal_eval(str(row['High_Score_Data']))
        # Convert every key to text first: a key parsed as a number (e.g. a purely
        # numeric name) would otherwise make str.join raise TypeError and the whole
        # row would be reported as an error
        names = ', '.join(str(name) for name in data.keys())
        return names
    except Exception as e:
        # Best effort: report the problem in the output cell rather than aborting the run
        return f"Error: {e}"

# Run process_row on every row and keep the results in a new 'Processed' column
df['Processed'] = df.apply(process_row, axis='columns')

# Write only the 'Processed' column to a new Excel file, without the index
df.loc[:, ['Processed']].to_excel('output_file_LV.xlsx', index=False)
print("Done")
