In [39]:

import pandas as pd
from collections import Counter
import random

# Load the resume dataset. Two columns are used below: 'Category' (the job
# title label) and 'Clean_Resume' (the resume text that gets tokenised).
df = pd.read_csv('DatasetRelleno.csv')

# Resume texts of the two categories being compared. Rows with a missing
# resume are dropped so the tokenisation below never receives a NaN float.
python_dev_resumes = df.loc[df['Category'] == 'Python Developer', 'Clean_Resume'].dropna()
java_dev_resumes = df.loc[df['Category'] == 'Java Developer', 'Clean_Resume'].dropna()

# Whitespace-token frequencies per category. Joining the resumes with a single
# space before splitting yields exactly the tokens of the individual resumes.
python_freq_counter = Counter(" ".join(python_dev_resumes).split())
java_freq_counter = Counter(" ".join(java_dev_resumes).split())

# Vocabulary of each category: the counter keys are the distinct tokens, so
# there is no need to walk the resumes a second time.
python_words = set(python_freq_counter)
java_words = set(java_freq_counter)

# Tokens that occur in both categories (also kept as a list for display).
common_words = python_words & java_words
common_words_list = list(common_words)

# Combined frequency of every shared token across both categories.
common_words_freq = {word: python_freq_counter[word] + java_freq_counter[word] for word in common_words}

# Most frequent first. Ties are broken alphabetically so that the ranking, and
# therefore the slice shown below, is the same on every run instead of
# depending on set iteration order.
sorted_common_words = sorted(common_words_freq.items(), key=lambda kv: (-kv[1], kv[0]))

# Number of shared tokens, followed by ranks 21-40 as (word, frequency) tuples.
print(len(sorted_common_words))
sorted_common_words[20:40]

4828


[('manager', 445),
 ('requirement', 431),
 ('cs', 426),
 ('test', 410),
 ('report', 406),
 ('jquery', 383),
 ('control', 381),
 ('mysql', 359),
 ('microsoft', 350),
 ('es', 348),
 ('span', 343),
 ('implementation', 343),
 ('analysis', 339),
 ('end', 337),
 ('linux', 335),
 ('script', 332),
 ('computer', 329),
 ('based', 325),
 ('backup', 321),
 ('procedure', 313)]

In [40]:
# Replacement pools, one per keyword that gets rewritten. Keywords are matched
# case-insensitively against whole whitespace-separated tokens:
#   'system' -> replacement_words_system
#   'server' -> replacement_words_server
#   'jquery' -> replacement_words_jquery
#   'data'   -> replacement_words_python
replacement_words_system = ['pandas', 'numpy', 'matplotlib']
replacement_words_server = ['machine learning', 'data analysis']
replacement_words_jquery = ['Automation', 'Docker']
replacement_words_python = ['python developer']


def replace_keywords_with_random(text, rng=None):
    """Replace selected keywords in ``text`` with randomly chosen substitutes.

    The text is split on whitespace; every token equal (ignoring case) to
    'system', 'server', 'jquery' or 'data' is replaced by a random element of
    the matching ``replacement_words_*`` list. All other tokens are kept as
    they are, and the tokens are re-joined with single spaces, so runs of
    whitespace in the input are collapsed.

    Parameters
    ----------
    text : str
        Text to rewrite. Non-string values (e.g. NaN or None coming from a
        pandas column) are returned unchanged instead of raising.
    rng : random.Random, optional
        Source of randomness. Defaults to the module-level ``random``
        functions; pass ``random.Random(seed)`` for reproducible output.

    Returns
    -------
    str
        The rewritten text (or the original value if it was not a string).
    """
    if not isinstance(text, str):
        return text

    chooser = random if rng is None else rng

    # Built at call time so the current contents of the pools are always used.
    pools = {
        'system': replacement_words_system,
        'server': replacement_words_server,
        'jquery': replacement_words_jquery,
        'data': replacement_words_python,
    }

    def substitute(word):
        pool = pools.get(word.lower())
        return word if pool is None else chooser.choice(pool)

    return ' '.join(substitute(word) for word in text.split())

# Seed the global RNG so the random replacements -- and therefore the CSV
# written below -- are identical on every run of the notebook.
REPLACEMENT_SEED = 42
random.seed(REPLACEMENT_SEED)

# Rewrite the keywords in the 'Clean_Resume' text of Python Developer rows only;
# every other row of df is left untouched. The mask is computed once and used
# with .loc on both sides of the assignment.
is_python_dev = df['Category'] == 'Python Developer'
df.loc[is_python_dev, 'Clean_Resume'] = df.loc[is_python_dev, 'Clean_Resume'].apply(replace_keywords_with_random)

# NOTE: df is modified in place, and running this cell a second time is not a
# no-op: the replacement 'data analysis' itself contains the keyword 'data',
# which would be rewritten on the next pass.

# Save the modified DataFrame; 'path_to_your_modified.csv' is a placeholder
# file name.
df.to_csv('path_to_your_modified.csv', index=False)