In [None]:
!pip install fuzzywuzzy


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
import ast
from difflib import SequenceMatcher

import pandas as pd

# Read the CSV file
df = pd.read_csv("/content/12310066_data.csv")

# Tag every row with a 1-based sequential identifier
df['unique_id'] = range(1, len(df) + 1)

# Each 'skills' cell holds the text of a Python list literal. Parse it with
# ast.literal_eval, which accepts literals only, rather than eval, which would
# execute any expression present in the CSV.
df['skills'] = df['skills'].apply(ast.literal_eval)

# Helper that orders the skills of a single row
def sort_skills(skills_list):
    """Return a new list with the items of ``skills_list`` in ascending order.

    The input is left unmodified.
    """
    ordered = list(skills_list)
    ordered.sort()
    return ordered

# Two rows are duplicates when they share a job title and the same sorted skills
columns_to_check_duplicates = ['jobtitle', 'sorted_skills']

# Sort each row's skills and store the result as a tuple in one pass:
# drop_duplicates hashes the compared values, and lists are not hashable.
df['sorted_skills'] = df['skills'].apply(
    lambda row_skills: tuple(sort_skills(row_skills))
)

# Keep the first row of every duplicate set; work on a copy so the column
# writes below do not touch df.
df_no_duplicates = df.drop_duplicates(
    subset=columns_to_check_duplicates,
    keep='first',
).copy()

# In the de-duplicated frame, turn the skill tuples back into lists
df_no_duplicates['sorted_skills'] = df_no_duplicates['sorted_skills'].apply(list)

# Similarity score between two sequences, computed with difflib
def similarity_ratio(s1, s2):
    """Return the ``SequenceMatcher`` ratio of ``s1`` compared against ``s2``.

    No junk-detection function is supplied. Identical inputs score 1.0.
    """
    matcher = SequenceMatcher(isjunk=None, a=s1, b=s2)
    return matcher.ratio()

# Find and group similar job titles: each still-ungrouped title seeds a new
# group and pulls in the other ungrouped titles whose similarity to it exceeds
# the threshold.
# Adjust the threshold based on your needs
threshold = 0.7

groups = []             # groups[i] lists the titles of group i, seed title first
grouped_titles = set()  # titles that already belong to some group
group_mapping = {}      # title -> index of its group in `groups`

# Compare distinct titles only: a repeated title scores identically each time,
# so keeping the repeats would only multiply the pairwise similarity work.
unique_titles = df_no_duplicates['jobtitle'].drop_duplicates().tolist()

for title in unique_titles:
    if title in grouped_titles:
        continue
    # Only titles that are not grouped yet may join. Rescanning every title
    # would let a later group take over titles already assigned to an earlier
    # one, so a title used as a group's representative could itself end up
    # mapped to a different group.
    # The seed is placed first explicitly, so a group is never empty and its
    # first element is always the seed, whatever the threshold.
    similar_titles = [title] + [
        t for t in unique_titles
        if t != title
        and t not in grouped_titles
        and similarity_ratio(title, t) > threshold
    ]
    new_group_index = len(groups)
    groups.append(similar_titles)
    grouped_titles.update(similar_titles)
    group_mapping.update((t, new_group_index) for t in similar_titles)

# Assign group IDs to the DataFrame
df_no_duplicates['group_id'] = df_no_duplicates['jobtitle'].map(group_mapping)

# Look up the representative title for a row's job title
def get_updated_title(title):
    """Return the first title of the group that ``title`` is mapped to.

    Reads the module-level ``group_mapping`` (title -> group index) and
    ``groups`` (list of title lists). Raises ``KeyError`` when ``title`` has
    no entry in ``group_mapping``.
    """
    group_index = group_mapping[title]
    members = iter(groups[group_index])
    return next(members)

# Replace every job title with the representative title of its group
df_no_duplicates['updated_jobtitle'] = df_no_duplicates['jobtitle'].apply(get_updated_title)

# Select only the required columns for the output
df_output = df_no_duplicates[['sorted_skills', 'updated_jobtitle']].copy()

# drop_duplicates hashes the compared values, so the skill lists must be tuples
df_output['sorted_skills'] = df_output['sorted_skills'].apply(tuple)

# Drop rows that became identical once titles were replaced by their group title
df_output = df_output.drop_duplicates(subset=['sorted_skills', 'updated_jobtitle'], keep='first').copy()

# Every skill collection stays a tuple, single-skill rows included. Unwrapping
# a one-element tuple to its bare string would make the flattening below
# iterate over that string's characters and add single letters to the merged
# skills.

# Sort the DataFrame by job title
df_sorted = df_output.sort_values(by='updated_jobtitle')

# Group by 'updated_jobtitle' and merge the skills of all its rows into a
# single sorted tuple without repeats
df_final = (
    df_sorted.groupby('updated_jobtitle')['sorted_skills']
    .agg(lambda skill_tuples: tuple(sorted({skill for skills in skill_tuples for skill in skills})))
    .reset_index()
)

# Save the modified DataFrame to a new CSV file
df_final.to_csv("/content/modified_data.csv", index=False)

print("DataFrame saved to 'modified_data.csv'")

                                               jobtitle  \
0                 Agency Manager  Senior Agency Manager   
1     Leading role for Business Development Manager ...   
2                          Business Development Manager   
3                  Branch Manager (Location  KONASEEMA)   
4          Branch Operations Manager (Location  Mulugu)   
...                                                 ...   
1375                            Territory Sales Manager   
1376   Assistant Area Sales Manager/ Area Sales Manager   
1377        Content Marketing Manager / General Manager   
1378        Content Marketing Manager / General Manager   
1379        Content Marketing Manager / General Manager   

                                                 skills  
0     ['insurance', 'sales', 'recruitment', 'agency ...  
1     ['agent recruitment', 'agency sales', 'field s...  
2     ['direct sales', 'sales', 'insurance sales', '...  
3     ['Customer Service', 'Retail Banking', 'Sales'...  
4