Converting the CSV into the required form for adding the target course

In [4]:
import pandas as pd

# Where the summary is read from and where the filtered rows are written
input_file = '../PREP/Summary/skillset_summary.csv'  # Use the correct file path for the new file
output_file = 'skillset.csv'  # Specify your desired output path

# Columns carried over into the output file
columns_to_keep = ['id', 'employeeId', 'Skillset', 'CourseDepartment', 'adjusted_score', 'average_score']

# Load the summary
df = pd.read_csv(input_file)

# Row label of the highest average_score within each employeeId group
best_row_labels = df.groupby('employeeId')['average_score'].idxmax()

# Keep only those rows, restricted to the columns of interest
df = df.loc[best_row_labels, columns_to_keep]

# Write the result without the index column
df.to_csv(output_file, index=False)

print(f"Filtered rows with highest average_score per employeeId saved to {output_file}")


Filtered rows with highest average_score per employeeId saved to skillset.csv


In [30]:
import pandas as pd

# Skills that belong to each department
departments = {
    'Development': [
        'Reactjs', 'Node', 'Next Js', 'Laravel', 'Angular', 'Flutter', 'React Native',
    ],
    'Data Science': [
        'Python', 'Applied ML', 'Big Data',
    ],
    'Data Engineering': [
        'Python', 'MySQL', 'Web Scraping', 'DBT', 'SnowFlake', 'Data Bricks',
    ],
    'Cloud': [
        'AWS', 'Azure', 'GCP', 'Redis',
    ],
}

# Load the skillset rows into a DataFrame
input_file = 'skillset.csv'  # Use the correct file path for the new file
df = pd.read_csv(input_file)

# Zero-initialised tally: department -> skill -> number of occurrences.
# Each department gets its own inner dict, so a skill listed under two
# departments (e.g. 'Python') is counted separately for each.
skill_counts = {
    name: dict.fromkeys(members, 0)
    for name, members in departments.items()
}

# Function to count skills within a department
def count_skills_in_departments(skillset, department):
    """Tally the skills of one row against its own department.

    Increments ``skill_counts[department][skill]`` (module-level dict) for
    every skill in ``skillset`` that is listed under ``department`` in the
    module-level ``departments`` mapping. Skills that belong to a different
    department are ignored.

    Parameters
    ----------
    skillset : str
        Comma-separated skill names, e.g. ``"Python, MySQL"``. Whitespace
        around each name is ignored. Any non-string value (such as the NaN
        pandas produces for an empty CSV cell) is treated as "no skills".
    department : str
        Department name to count against. A value that is not a key of
        ``departments`` is skipped instead of raising ``KeyError``.

    Returns
    -------
    None
        The function only mutates ``skill_counts``.
    """
    known_skills = departments.get(department)
    # Nothing to count for an unknown department or a missing skillset
    if known_skills is None or not isinstance(skillset, str):
        return

    # Split on the bare comma and strip, so "A,B" and "A, B" behave the same
    for raw_skill in skillset.split(','):
        skill = raw_skill.strip()
        if skill in known_skills:  # Check only the current department
            skill_counts[department][skill] += 1

# Tally the skills of every row, scoped to that row's own department
for department, skillset in zip(df['CourseDepartment'], df['Skillset']):
    count_skills_in_departments(skillset, department)

# Flatten the nested tally into one record per (department, skill) pair
results = [
    {'Department': department, 'Skill': skill, 'Count': count}
    for department, skills in skill_counts.items()
    for skill, count in skills.items()
]
results_df = pd.DataFrame(results)

# Write the tally to disk without the index column
output_file = 'skill_counts.csv'  # Specify the output file name
results_df.to_csv(output_file, index=False)

print(f"Skill counts by department have been saved to {output_file}.")


Skill counts by department have been saved to skill_counts.csv.


In [33]:

import pandas as pd
import os

# Print current working directory
print("Current working directory:", os.getcwd())

# Read the skillset and skill counts CSV files into DataFrames
skillset_df = pd.read_csv('skillset.csv')  # Update with actual path
skill_counts_df = pd.read_csv('skill_counts.csv')  # Update with actual path

# Rank each department's skills once, highest count first, instead of
# re-filtering and re-sorting the counts table for every employee row.
# kind='stable' keeps tied counts in their file order so the suggestion is
# deterministic.
ranked_counts = skill_counts_df.sort_values(by='Count', ascending=False, kind='stable')
skills_by_department = {
    dept: group['Skill'].tolist()
    for dept, group in ranked_counts.groupby('Department', sort=False)
}


def suggest_skill(skillset, department):
    """Return the highest-count skill of ``department`` not already in ``skillset``.

    Parameters
    ----------
    skillset : str
        Comma-separated skill names; whitespace around each name is ignored.
        A non-string value (e.g. NaN from an empty CSV cell) is treated as
        an empty skillset rather than raising.
    department : str
        Department whose ranked skills are searched.

    Returns
    -------
    str
        The suggested skill, or ``''`` when the department has no ranked
        skills or the skillset already contains all of them.
    """
    if isinstance(skillset, str):
        owned = {skill.strip() for skill in skillset.split(',')}
    else:
        owned = set()
    for candidate in skills_by_department.get(department, []):
        if candidate not in owned:
            return candidate
    return ''  # If no skill found, use an empty string


# One suggestion per row, in row order
suggested_skills = [
    suggest_skill(skillset, department)
    for skillset, department in zip(skillset_df['Skillset'], skillset_df['CourseDepartment'])
]

# Add the suggested skills to the skillset DataFrame
skillset_df['Suggested Skill'] = suggested_skills

# Save the updated DataFrame back to CSV
skillset_df.to_csv('skillset_with_target_column.csv', index=False)




Current working directory: d:\Projects\Web Scraping\Final Project\REPORTING


In [None]:
import pandas as pd

# File paths for your CSV files.
# Raw strings: in a normal literal, sequences such as "\P", "\W" and "\s" are
# invalid escape sequences, which Python warns about and plans to reject.
# NOTE(review): these are absolute, machine-specific paths — consider making
# them relative to the project directory.
first_file_path = r'D:\Projects\Web Scraping\Final Project\PREP\Rejected Data\certification_rejected.csv'  # Update with your actual file path
second_file_path = r'D:\Projects\Web Scraping\Final Project\PREP\Rejected Data\skill_score_rejected.csv'  # Update with your actual file path

# Read data from CSV files into DataFrames
first_df = pd.read_csv(first_file_path)  # loaded but not used further in this cell
second_df = pd.read_csv(second_file_path)

# Departments dictionary
departments = {
    'Development': ['Reactjs', 'Node', 'Next Js', 'Laravel', 'Angular', 'Flutter', 'React Native'],
    'Data Science': ['Python', 'Applied ML', 'Big Data'],
    'Data Engineering': ['Python', 'MySQL', 'Web Scraping', 'DBT', 'SnowFlake', 'Data Bricks'],
    'Cloud': ['AWS', 'Azure', 'GCP', 'Redis']
}

# Zero-initialised tally: department -> skill -> number of rejections
rejected_skill_count = {dept: dict.fromkeys(skills, 0) for dept, skills in departments.items()}

# Count rejected skills based on assessments: filter to the rejected rows
# once, then walk only the two columns that are needed.
rejected_rows = second_df[second_df['status'] == 'Reject']
for dept, skill in zip(rejected_rows['courseDepartment'], rejected_rows['skill']):
    # .get(..., {}) skips departments that are not in the dictionary
    if skill in rejected_skill_count.get(dept, {}):
        rejected_skill_count[dept][skill] += 1

# Prepare data for the output DataFrame: one row per (department, skill)
output_data = [
    [dept, skill, count]
    for dept, skills in rejected_skill_count.items()
    for skill, count in skills.items()
]

# Create the output DataFrame
output_df = pd.DataFrame(output_data, columns=['Department', 'Skill', 'Count'])

# Display the result
print(output_df)

# Optionally, save the results to a CSV file
output_df.to_csv('rejected_skill_count.csv', index=False)
