In [80]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

df = pd.read_csv('indeed_10k.csv')
df = df.drop(columns=['Company', 'City', 'Ratings', 'Date'], axis=1)

skills_df = pd.read_csv('skills.csv', header=None)
skills_list = skills_df.iloc[0].dropna().tolist()

skills_list = [skill.strip() for skill in skills_list]
skills_dict = {skill.lower(): skill for skill in skills_list if skill.lower() != 'r'}

skills_list_sorted = sorted(skills_list, key=len, reverse=True)


def extract_skills(summary, skills_dict, skills_list_sorted):
    summary = summary.lower()
    extracted_skills = set()

    for skill in skills_list_sorted:
        skill_lower = skill.lower()
        if skill_lower in summary and skill_lower != 'r':
            extracted_skills.add(skill)
            summary = summary.replace(skill_lower, '')

    tokens = word_tokenize(summary)
    for token in tokens:
        if token in skills_dict and token != 'r':
            extracted_skills.add(skills_dict[token])

    return list(extracted_skills)


df['skills'] = df['Summary'].apply(lambda x: extract_skills(x, skills_dict, skills_list_sorted))
df = df[df['skills'].apply(lambda x: len(x) > 0)]
df.reset_index(drop=True, inplace=True)
df.to_csv('jobs_with_skills.csv', index=False)
print(df[['Name', 'skills']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                              Name                   skills
0    Entry level Software Engineer       [c, c++, java, c#]
1         Junior Software Engineer  [javascript, python, c]
2                Software Engineer      [ai, scala, c, Net]
3  Software Engineer - Entry Level                      [c]
4       Software Engineer (Python)              [python, c]


In [81]:
df.head()

Unnamed: 0,Name,Summary,skills
0,Entry level Software Engineer,"Programming experience using C#, C++, or Java ...","[c, c++, java, c#]"
1,Junior Software Engineer,Building new product features across the back ...,"[javascript, python, c]"
2,Software Engineer,Yammer-THE social network for workplace-is hir...,"[ai, scala, c, Net]"
3,Software Engineer - Entry Level,You'll be expected to deliver in an agile envi...,[c]
4,Software Engineer (Python),Comfortable developing in Python (or similar)....,"[python, c]"


In [82]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# this is the new dataset
df = pd.read_csv('jobs_with_skills.csv')


def recommend_jobs(user_skills, df, top_n=3):
    user_skills_set = set(user_skills.split(', '))

    def count_matching_skills(job_skills):
        job_skills_set = set(eval(job_skills))
        return len(user_skills_set.intersection(job_skills_set))

    # Count matching skills for each job
    df['matching_skills'] = df['skills'].apply(count_matching_skills)

    # Start with all skills and gradually reduce
    for required_skills in range(len(user_skills_set), 0, -1):
        df_filtered = df[df['matching_skills'] >= required_skills]

        if not df_filtered.empty:
            vectorizer = TfidfVectorizer()

            job_skills_tfidf = vectorizer.fit_transform(df_filtered['skills'].apply(lambda x: ' '.join(eval(x))))
            user_skills_tfidf = vectorizer.transform([user_skills])
            similarity_scores = cosine_similarity(user_skills_tfidf, job_skills_tfidf)

            df_filtered['similarity'] = similarity_scores.flatten()

            # Sort jobs by similarity score (descending)
            df_sorted = df_filtered.sort_values(by=['matching_skills', 'similarity'], ascending=[False, False])

            # Get the top N recommendations
            top_recommendations = df_sorted.head(top_n)[['Name', 'matching_skills', 'similarity', 'skills']]
            return top_recommendations

    # If no jobs found even with one skill, return empty DataFrame
    return pd.DataFrame(columns=['Name', 'matching_skills', 'similarity', 'skills'])


# Example 
user_skills = "python, data analysis, machine learning, sql, javascript, aws"
recommendations = recommend_jobs(user_skills, df)
print(f"Top recommendations for skills: {user_skills}\n")
if not recommendations.empty:
    for index, row in recommendations.iterrows():
        print(f"Job Title: {row['Name']}")
        print(
            f"Matching Skills: {row['matching_skills']} out of {len(user_skills.split(', '))}")
        print(f"Similarity Score: {row['similarity']:.2f}")
        print(f"Job Skills: {', '.join(eval(row['skills']))}")
        print("-" * 50)
else:
    print("No matching jobs found.")

Top recommendations for skills: python, data analysis, machine learning, sql, javascript, aws

Job Title: Software Engineer
Matching Skills: 4 out of 6
Similarity Score: 0.61
Job Skills: shell, sql, python, aws, c, machine learning, Git, ruby, ai
--------------------------------------------------
Job Title: Software Engineer
Matching Skills: 4 out of 6
Similarity Score: 0.61
Job Skills: shell, sql, python, aws, c, machine learning, Git, ruby, ai
--------------------------------------------------
Job Title: Software Engineer - Mobile/Back-End
Matching Skills: 4 out of 6
Similarity Score: 0.40
Job Skills: algorithms, database, python, aws, c, Object-oriented programming, javascript, machine learning, Git, rest, Go, ai, ruby, Django, Node.js, data architecture
--------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['similarity'] = similarity_scores.flatten()
