In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import numpy as np

# Skill indicator columns used for both the job table and the user profiles.
skill_columns = [
    'Python', 'Java', 'C++', 'SQL', 'HTML', 'CSS', 'JavaScript', 'React',
    'Git', 'Agile', 'Machine Learning', 'Operating Systems',
    'Version Control', 'Cloud Platforms', 'Containerization',
    'Data Structures & Algorithms', 'API Development',
    'Microservices Architecture', 'Cybersecurity', 'Big Data',
    'CI/CD Pipelines',
]

# Load the job table from the CSV sitting next to the notebook.
file_path = 'output-removed.csv'
job_data = pd.read_csv(file_path)

# Turn 'Yes'/'No' cells into 1/0; any other cell value is left as it is.
job_data[skill_columns] = job_data[skill_columns].replace(to_replace={'Yes': 1, 'No': 0})

# Sanity check: column dtypes first, then the first few encoded rows.
print(job_data[skill_columns].dtypes, job_data[skill_columns].head(), sep='\n')

# Three hand-written example profiles, one row per user.
# Each row is the 'User ID' followed by a 1/0 flag per skill, in the order
# given by `columns` below.
user_data = pd.DataFrame(
    [
        [1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1],
        [2, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1],
        [3, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    ],
    columns=[
        'User ID',
        'Python', 'Java', 'C++', 'SQL', 'HTML', 'CSS', 'JavaScript', 'React',
        'Git', 'Agile', 'Machine Learning', 'Operating Systems',
        'Version Control', 'Cloud Platforms', 'Containerization',
        'Data Structures & Algorithms', 'API Development',
        'Microservices Architecture', 'Cybersecurity', 'Big Data',
        'CI/CD Pipelines',
    ],
)

# Standardise the skill flags using statistics learned from the job table
# only. The user profiles are the queries, so they are transformed with the
# already-fitted scaler instead of being included in the fit; otherwise the
# very profiles being scored would shift the per-skill mean and variance.
scaler = StandardScaler()
job_data_scaled = scaler.fit_transform(job_data[skill_columns])
user_data_scaled = scaler.transform(user_data[skill_columns])

# Cosine-distance neighbour index over the scaled job rows.
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(job_data_scaled)

# Number of job suggestions printed per user.
top_n = 5

# user_data_scaled is a plain NumPy array, so it must be indexed by row
# position. enumerate() supplies that position; the DataFrame index label
# returned by iterrows() only coincides with it for a default RangeIndex.
for position, (_, user) in enumerate(user_data.iterrows()):
    # kneighbors expects a 2-D array, hence the (1, n_features) reshape.
    user_skills_scaled = user_data_scaled[position].reshape(1, -1)
    distances, job_indices = knn.kneighbors(user_skills_scaled, n_neighbors=top_n)

    print(f"Top {top_n} jobs for User {user['User ID']} (Skills: {user[skill_columns].values}):")
    # job_indices are row positions in the array the index was fitted on.
    for idx, dist in zip(job_indices[0], distances[0]):
        print(f"- {job_data.iloc[idx]['Job Title']} (Distance: {dist:.4f})")

# Hand-assigned reference job title for each example user, keyed by 'User ID'.
user_true_jobs = {
    1: 'Software Developer',  # User 1's correct job
    2: 'Backend Developer',   # User 2's correct job
    3: 'Data Analyst'         # User 3's correct job
}

# Count users whose reference title appears among their top_n neighbours.
correct_predictions = 0

# Index the scaled array by row position (enumerate), not by the DataFrame
# index label from iterrows(); the two only agree for a default RangeIndex.
for position, (_, user) in enumerate(user_data.iterrows()):
    user_skills_scaled = user_data_scaled[position].reshape(1, -1)
    distances, job_indices = knn.kneighbors(user_skills_scaled, n_neighbors=top_n)

    top_jobs = job_data.iloc[job_indices[0]]['Job Title'].values
    true_job = user_true_jobs.get(user['User ID'])
    if true_job in top_jobs:
        correct_predictions += 1

# Percentage of users with a hit in their top_n list (a top-N hit rate).
accuracy = (correct_predictions / len(user_data)) * 100
#print(f"\nAccuracy of job prediction using KNN: {accuracy:.2f}%")


Python                          int64
Java                            int64
C++                             int64
SQL                             int64
HTML                            int64
CSS                             int64
JavaScript                      int64
React                           int64
Git                             int64
Agile                           int64
Machine Learning                int64
Operating Systems               int64
Version Control                 int64
Cloud Platforms                 int64
Containerization                int64
Data Structures & Algorithms    int64
API Development                 int64
Microservices Architecture      int64
Cybersecurity                   int64
Big Data                        int64
CI/CD Pipelines                 int64
dtype: object
   Python  Java  C++  SQL  HTML  CSS  JavaScript  React  Git  Agile  ...  \
0       1     1    1    1     1    1           1      0    1      1  ...   
1       1     0    0    1     1    1

In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def load_and_prepare_data(file_path, skill_columns):
    """
    Read the job CSV and convert every available skill column to integer flags.

    'Yes' is stored as 1 and 'No' as 0. Whatever is still non-numeric after
    that substitution (including missing cells) is stored as 0. Names in
    ``skill_columns`` that the CSV does not contain are skipped.

    :param file_path: Path to the CSV file containing job data
    :param skill_columns: List of skill column names
    :return: DataFrame whose present skill columns hold integers
    """
    job_data = pd.read_csv(file_path)
    yes_no_codes = {'Yes': 1, 'No': 0}

    # Only touch the skill columns that actually exist in the file.
    available = [name for name in skill_columns if name in job_data.columns]
    for name in available:
        as_codes = job_data[name].replace(yes_no_codes)
        # errors='coerce' turns leftover text into NaN, which then becomes 0.
        as_numbers = pd.to_numeric(as_codes, errors='coerce')
        job_data[name] = as_numbers.fillna(0).astype(int)

    return job_data

def recommend_jobs(user_skills, job_data, skill_columns, top_n=5):
    """
    Recommend jobs based on user-provided skills.

    The user's skills are turned into a 0/1 vector over ``skill_columns`` and
    compared with each job row by cosine similarity. ``job_data`` itself is
    not modified: the scores are attached to a separate frame.

    :param user_skills: List of skills known by the user (e.g., ['Python', 'Java']);
        names that are not in ``skill_columns`` are ignored
    :param job_data: DataFrame containing job skill requirements; must have a
        'Job Title' column and every column named in ``skill_columns``
    :param skill_columns: List of skill column names in the job data
    :param top_n: Number of top job recommendations to return
    :return: DataFrame with columns 'Job Title' and 'Similarity', highest
        similarity first, keeping the original row labels of ``job_data``
    """
    user_skill_vector = np.zeros(len(skill_columns))
    for skill in user_skills:
        if skill in skill_columns:
            user_skill_vector[skill_columns.index(skill)] = 1

    job_skill_vectors = job_data[skill_columns].values
    # cosine_similarity returns a (1, n_jobs) matrix; flatten to one score per job.
    similarity_scores = cosine_similarity([user_skill_vector], job_skill_vectors).flatten()

    # Attach the scores to a new two-column frame rather than writing a
    # 'Similarity' column into the caller's DataFrame.
    scored_jobs = job_data[['Job Title']].assign(Similarity=similarity_scores)
    recommended_jobs = scored_jobs.sort_values(by='Similarity', ascending=False).head(top_n)

    return recommended_jobs[['Job Title', 'Similarity']]

def main(file_path='output-removed.csv', user_provided_skills=None, top_n=5):
    """
    Load the job table, score it against one skill profile and print the result.

    Called with no arguments it reproduces the original hard-coded run.

    :param file_path: Path to the CSV file containing job data
    :param user_provided_skills: List of skill names the user has; defaults to
        ['Python', 'Java', 'SQL'] when omitted
    :param top_n: Number of recommendations to print
    :return: None; the recommendations are printed to stdout
    """
    skill_columns = ['Python', 'Java', 'C++', 'SQL', 'HTML', 'CSS', 'JavaScript', 'React', 
                     'Git', 'Agile', 'Machine Learning', 'Operating Systems', 'Version Control', 
                     'Cloud Platforms', 'Containerization', 'Data Structures & Algorithms', 
                     'API Development', 'Microservices Architecture', 'Cybersecurity', 'Big Data', 
                     'CI/CD Pipelines']

    # None stands in for the default profile so no mutable default is shared.
    if user_provided_skills is None:
        user_provided_skills = ['Python', 'Java' , 'SQL']

    job_data = load_and_prepare_data(file_path, skill_columns)
    recommended_jobs = recommend_jobs(user_provided_skills, job_data, skill_columns, top_n=top_n)

    print("Recommended Jobs:")
    print(recommended_jobs)

if __name__ == "__main__":
    main()


Recommended Jobs:
                      Job Title  Similarity
383  Data Governance Specialist    0.654654
377            Technical Writer    0.654654
423      IT Training Specialist    0.654654
440      IT Performance Analyst    0.654654
296          Systems Programmer    0.577350


In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def load_and_prepare_data(file_path, skill_columns):
    """
    Read the job CSV and convert every available skill column to integer flags.

    'Yes' is stored as 1 and 'No' as 0. Whatever is still non-numeric after
    that substitution (including missing cells) is stored as 0. Names in
    ``skill_columns`` that the CSV does not contain are skipped.

    :param file_path: Path to the CSV file containing job data
    :param skill_columns: List of skill column names
    :return: DataFrame whose present skill columns hold integers
    """
    # NOTE(review): a function with this name is also defined in an earlier
    # cell; once this cell runs, this definition is the one in effect.
    job_data = pd.read_csv(file_path)
    yes_no_codes = {'Yes': 1, 'No': 0}

    # Only touch the skill columns that actually exist in the file.
    available = [name for name in skill_columns if name in job_data.columns]
    for name in available:
        as_codes = job_data[name].replace(yes_no_codes)
        # errors='coerce' turns leftover text into NaN, which then becomes 0.
        as_numbers = pd.to_numeric(as_codes, errors='coerce')
        job_data[name] = as_numbers.fillna(0).astype(int)

    return job_data

def recommend_jobs(user_skills, job_data, skill_columns, top_n=5):
    """
    Recommend jobs based on user-provided skills.

    The user's skills are turned into a 0/1 vector over ``skill_columns`` and
    compared with each job row by cosine similarity. ``job_data`` itself is
    not modified: the scores are attached to a separate frame.

    :param user_skills: List of skills known by the user (e.g., ['Python', 'Java']);
        names that are not in ``skill_columns`` are ignored
    :param job_data: DataFrame containing job skill requirements; must have a
        'Job Title' column and every column named in ``skill_columns``
    :param skill_columns: List of skill column names in the job data
    :param top_n: Number of top job recommendations to return
    :return: DataFrame with columns 'Job Title' and 'Similarity', highest
        similarity first, keeping the original row labels of ``job_data``
    """
    # NOTE(review): a function with this name is also defined in an earlier
    # cell; once this cell runs, this definition is the one in effect.
    user_skill_vector = np.zeros(len(skill_columns))
    for skill in user_skills:
        if skill in skill_columns:
            user_skill_vector[skill_columns.index(skill)] = 1

    job_skill_vectors = job_data[skill_columns].values
    # cosine_similarity returns a (1, n_jobs) matrix; flatten to one score per job.
    similarity_scores = cosine_similarity([user_skill_vector], job_skill_vectors).flatten()

    # Attach the scores to a new two-column frame rather than writing a
    # 'Similarity' column into the caller's DataFrame.
    scored_jobs = job_data[['Job Title']].assign(Similarity=similarity_scores)
    recommended_jobs = scored_jobs.sort_values(by='Similarity', ascending=False).head(top_n)

    return recommended_jobs[['Job Title', 'Similarity']]

def main(file_path='output-removed.csv', user_provided_skills=None, top_n=20):
    """
    Load the job table, score it against one skill profile and print the result.

    Called with no arguments it reproduces the original hard-coded run.

    :param file_path: Path to the CSV file containing job data
    :param user_provided_skills: List of skill names the user has; defaults to
        ['Python', 'Java', 'C++', 'HTML'] when omitted
    :param top_n: Number of recommendations to print
    :return: None; the recommendations are printed to stdout
    """
    # NOTE(review): a function with this name is also defined in an earlier
    # cell; once this cell runs, this definition is the one in effect.
    skill_columns = ['Python', 'Java', 'C++', 'SQL', 'HTML', 'CSS', 'JavaScript', 'React', 
                     'Git', 'Agile', 'Machine Learning', 'Operating Systems', 'Version Control', 
                     'Cloud Platforms', 'Containerization', 'Data Structures & Algorithms', 
                     'API Development', 'Microservices Architecture', 'Cybersecurity', 'Big Data', 
                     'CI/CD Pipelines']

    # None stands in for the default profile so no mutable default is shared.
    if user_provided_skills is None:
        user_provided_skills = ['Python' , 'Java' , 'C++' , 'HTML']

    job_data = load_and_prepare_data(file_path, skill_columns)

    recommended_jobs = recommend_jobs(user_provided_skills, job_data, skill_columns, top_n=top_n)

    print("Recommended Jobs:")
    print(recommended_jobs)

if __name__ == "__main__":
    main()


Recommended Jobs:
                   Job Title  Similarity
1462      Frontend Developer    0.707107
1469    Full Stack Developer    0.707107
1300    Mobile App Developer    0.670820
689          DevOps Engineer    0.666667
828           Data Scientist    0.666667
676     System Administrator    0.666667
967   Database Administrator    0.666667
761     Mobile App Developer    0.632456
987    Cybersecurity Analyst    0.632456
1413    Mobile App Developer    0.632456
730    Cybersecurity Analyst    0.632456
636       Frontend Developer    0.612372
1480         DevOps Engineer    0.612372
905     Mobile App Developer    0.603023
874       Frontend Developer    0.603023
869    Cybersecurity Analyst    0.603023
1032          Cloud Engineer    0.603023
1047          Cloud Engineer    0.603023
1103    System Administrator    0.603023
1122    System Administrator    0.603023
