# Loading Data

In [54]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from ast import literal_eval
from collections import Counter
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
df = pd.read_csv('indeed_10k.csv')
df = df.drop(columns=['Company', 'City', 'Ratings', 'Date'], axis=1)

skills_df = pd.read_csv('skills.csv', header=None)
skills_list = skills_df.iloc[0].dropna().tolist()

# Feature Extraction 

In [56]:
skills_list = [skill.strip() for skill in skills_list]
skills_dict = {skill.lower(): skill for skill in skills_list if skill.lower() not in ['r', 'c']}
skills_list_sorted = sorted(skills_list, key=len, reverse=True)


def extract_skills(summary, skills_dict, skills_list_sorted):
    summary = summary.lower()
    extracted_skills = set()

    # Look for the specific case of "c/c++" and add both "c" and "c++" to the extracted skills
    if 'c/c++' in summary:
        extracted_skills.add('c')
        extracted_skills.add('c++')
        # Remove "c/c++" from the summary
        summary = summary.replace('c/c++', '')

    # First loop to remove matching skills from the summary
    for skill in skills_list_sorted:
        skill_lower = skill.lower()
        if skill_lower in summary and skill_lower not in ['r', 'c']:
            extracted_skills.add(skill)
            summary = summary.replace(skill_lower, '')

    # Tokenize remaining summary to find more skills
    tokens = word_tokenize(summary)
    for token in tokens:
        if token in skills_dict and token not in ['r', 'c']:
            extracted_skills.add(skills_dict[token])

    return list(extracted_skills)


# Apply the extraction process to the DataFrame
df['skills'] = df['Summary'].apply(lambda x: extract_skills(x, skills_dict, skills_list_sorted))

# Filter out rows where no skills were extracted
df = df[df['skills'].apply(lambda x: len(x) > 0)]
df.reset_index(drop=True, inplace=True)

df.to_csv('jobs_with_skills.csv', index=False)

In [57]:
df = pd.read_csv('jobs_with_skills.csv')
df.head()

Unnamed: 0,Name,Summary,skills
0,Entry level Software Engineer,"Programming experience using C#, C++, or Java ...","['c#', 'c++', 'java']"
1,Junior Software Engineer,Building new product features across the back ...,"['javascript', 'python']"
2,Software Engineer,Yammer-THE social network for workplace-is hir...,"['scala', 'gin', 'Net', 'ai']"
3,Software Engineer - Entry Level,You'll be expected to deliver in an agile envi...,"['mobile', 'gin']"
4,Software Engineer (Python),Comfortable developing in Python (or similar)....,"['gin', 'python']"


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8606 entries, 0 to 8605
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     8606 non-null   object
 1   Summary  8606 non-null   object
 2   skills   8606 non-null   object
dtypes: object(3)
memory usage: 201.8+ KB


# Data Cleaning & Preprocessing

## 1- Rmove rows where skills length <= 4 

In [59]:
def convert_to_list(df, skills_column='skills'):
    """
    Convert the skills column to list format if it is in string format.
    """
    # Check if the first element is a string
    if isinstance(df[skills_column].iloc[0], str):
        # Use literal_eval to convert string representation of lists into actual lists
        df[skills_column] = df[skills_column].apply(literal_eval)


def analyze_skills_list_lengths(df, skills_column='skills'):
    """
    Analyze the lengths of skill lists in the specified column.
    """
    # Ensure the skills column contains lists
    if not isinstance(df[skills_column].iloc[0], list):
        raise ValueError(f"The '{skills_column}' column does not contain lists. Please check your data format.")

    # Count the lengths of each skill list
    skill_lengths = df[skills_column].apply(len)

    # Count the occurrences of each length
    length_distribution = Counter(skill_lengths)

    # Sort the distribution by list length
    sorted_distribution = dict(sorted(length_distribution.items()))

    return sorted_distribution


def print_skills_list_distribution(distribution):
    """
    Print the skills list length distribution in a readable format.
    """
    print("Skills List Length Distribution:")
    print("--------------------------------")
    for length, count in distribution.items():
        print(f"Lists with length {length}: {count} occurrence{'s' if count > 1 else ''}")
    print("--------------------------------")
    print(f"Total entries: {sum(distribution.values())}")


def process_skills_data(df, skills_column='skills', min_skills=5):
    """
    Process the skills data by dropping rows with skill lists <= 4.
    """
    # Ensure the skills column contains lists
    if not isinstance(df[skills_column].iloc[0], list):
        raise ValueError(f"The '{skills_column}' column does not contain lists. Please check your data format.")

    # Filter the DataFrame to keep only rows with skill lists of length > min_skills
    df_processed = df[df[skills_column].apply(len) > min_skills]

    return df_processed


# Convert the 'skills' column to lists if necessary
convert_to_list(df, skills_column='skills')

print("Original dataset:")
original_distribution = analyze_skills_list_lengths(df, skills_column='skills')
print_skills_list_distribution(original_distribution)

# Process the data to remove rows with 4 or fewer skills
df_processed = process_skills_data(df, skills_column='skills', min_skills=4)

print("\nProcessed dataset (after removing rows with 4 or fewer skills):")
processed_distribution = analyze_skills_list_lengths(df_processed, skills_column='skills')
print_skills_list_distribution(processed_distribution)

print(f"\nRows removed: {len(df) - len(df_processed)}")
print(f"Remaining rows: {len(df_processed)}")

Original dataset:
Skills List Length Distribution:
--------------------------------
Lists with length 1: 2671 occurrences
Lists with length 2: 1458 occurrences
Lists with length 3: 1453 occurrences
Lists with length 4: 1012 occurrences
Lists with length 5: 463 occurrences
Lists with length 6: 299 occurrences
Lists with length 7: 250 occurrences
Lists with length 8: 43 occurrences
Lists with length 9: 61 occurrences
Lists with length 10: 272 occurrences
Lists with length 11: 74 occurrences
Lists with length 12: 44 occurrences
Lists with length 13: 130 occurrences
Lists with length 14: 65 occurrences
Lists with length 15: 59 occurrences
Lists with length 16: 51 occurrences
Lists with length 17: 53 occurrences
Lists with length 18: 12 occurrences
Lists with length 19: 31 occurrences
Lists with length 20: 56 occurrences
Lists with length 21: 2 occurrences
Lists with length 22: 22 occurrences
Lists with length 23: 13 occurrences
Lists with length 26: 1 occurrence
Lists with length 28: 4 occ

## 2-Remove rows which target experienced people

In [60]:
# Convert 'Name' column to lowercase
df_processed['Name'] = df_processed['Name'].str.lower()

# Define the list of words to check for
keywords = ['sr', 'senior', 'principal', 'staff']

# Find rows that contain any of the keywords
mask = df_processed['Name'].str.contains('|'.join(keywords))

# Count the number of rows that contain at least one of the keywords
num_rows = mask.sum()
print(f"Number of rows containing 'sr', 'senior', or 'principal': {num_rows}")

# Drop the rows that contain these words
df_processed_filtered = df_processed[~mask]

Number of rows containing 'sr', 'senior', or 'principal': 384


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed['Name'] = df_processed['Name'].str.lower()


In [61]:
df_processed_filtered.head()

Unnamed: 0,Name,Summary,skills
24,software engineer internship,Experience developing in a high-level programm...,"[java, scala, merchant, c#, ruby, python]"
25,software engineer intern (summer 2020),Possess knowledge of software engineering proc...,"[Go, java, c++, gin, python]"
35,software engineer summer intern - java,"Fluency with Java, SQL, Perl/Python, or C++.Fa...","[scripting, java, sql, perl, c++, prototyping,..."
39,"software engineer - backend, c++",In this position you'll be mostly using Java a...,"[java, c++, c, gin, javascript, python]"
48,ml infrastructure software engineer,You have working knowledge of an OOP language ...,"[scripting, java, scala, oop, machine learning..."


In [62]:
df_processed_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1628 entries, 24 to 8601
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     1628 non-null   object
 1   Summary  1628 non-null   object
 2   skills   1628 non-null   object
dtypes: object(3)
memory usage: 50.9+ KB


## 3-Drop rows with duplicated skills

In [63]:
def are_lists_similar(list1, list2, similarity_threshold=0.8):
    """
    Compare two lists and return True if they are similar based on the threshold.
    """
    set1, set2 = set(list1), set(list2)
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    jaccard_similarity = len(intersection) / len(union)
    return jaccard_similarity >= similarity_threshold


def remove_duplicate_skills(df_processed_filtered, similarity_threshold=0.8):
    """
    Remove rows with duplicate or similar skill lists and single-skill lists.
    """
    # First, remove rows where the skills list has only one item
    df_processed_filtered = df_processed_filtered[df_processed_filtered['skills'].apply(len) > 1].copy()

    # Convert skills to frozenset for hashability
    df_processed_filtered['skills_set'] = df_processed_filtered['skills'].apply(frozenset)

    # Create a dictionary to store unique skill sets
    unique_skills = {}
    rows_to_keep = []

    for index, row in df_processed_filtered.iterrows():
        skills = row['skills_set']
        is_unique = True

        # Check against existing unique skills
        for unique_skill_set in unique_skills:
            if are_lists_similar(skills, unique_skill_set, similarity_threshold):
                is_unique = False
                break

        if is_unique:
            unique_skills[skills] = index
            rows_to_keep.append(index)

    # Create a new DataFrame with only the unique rows
    df_processed_filtered_unique = df_processed_filtered.loc[rows_to_keep].copy()

    # Drop the temporary 'skills_set' column
    df_processed_filtered_unique = df_processed_filtered_unique.drop('skills_set', axis=1)

    return df_processed_filtered_unique


print("Original DataFrame:")
print(df_processed_filtered)

df_processed_filtered_unique = remove_duplicate_skills(df_processed_filtered, similarity_threshold=0.8)

Original DataFrame:
                                              Name  \
24                    software engineer internship   
25          software engineer intern (summer 2020)   
35          software engineer summer intern - java   
39                software engineer - backend, c++   
48             ml infrastructure software engineer   
...                                            ...   
8596        software development engineer ii - ios   
8597               software engineering - frontend   
8598                  software engineer - back end   
8599              infrastructure software engineer   
8601  software engineer - tools and infrastructure   

                                                Summary  \
24    Experience developing in a high-level programm...   
25    Possess knowledge of software engineering proc...   
35    Fluency with Java, SQL, Perl/Python, or C++.Fa...   
39    In this position you'll be mostly using Java a...   
48    You have working knowledge of 

In [64]:
print("\nDataFrame after removing similar skill lists and single-skill lists:")
df_processed_filtered_unique.head()


DataFrame after removing similar skill lists and single-skill lists:


Unnamed: 0,Name,Summary,skills
24,software engineer internship,Experience developing in a high-level programm...,"[java, scala, merchant, c#, ruby, python]"
25,software engineer intern (summer 2020),Possess knowledge of software engineering proc...,"[Go, java, c++, gin, python]"
35,software engineer summer intern - java,"Fluency with Java, SQL, Perl/Python, or C++.Fa...","[scripting, java, sql, perl, c++, prototyping,..."
39,"software engineer - backend, c++",In this position you'll be mostly using Java a...,"[java, c++, c, gin, javascript, python]"
48,ml infrastructure software engineer,You have working knowledge of an OOP language ...,"[scripting, java, scala, oop, machine learning..."


In [65]:
df_processed_filtered_unique.info()

<class 'pandas.core.frame.DataFrame'>
Index: 166 entries, 24 to 8300
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     166 non-null    object
 1   Summary  166 non-null    object
 2   skills   166 non-null    object
dtypes: object(3)
memory usage: 5.2+ KB


In [66]:
df_processed_filtered_unique = df_processed_filtered_unique.drop(columns=['Summary'], axis=1)
df_processed_filtered_unique.info()

<class 'pandas.core.frame.DataFrame'>
Index: 166 entries, 24 to 8300
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    166 non-null    object
 1   skills  166 non-null    object
dtypes: object(2)
memory usage: 3.9+ KB


In [67]:
df_processed_filtered_unique.to_csv('Final.csv', index=False)

In [68]:
df = pd.read_csv('Final.csv')
df.head()

Unnamed: 0,Name,skills
0,software engineer internship,"['java', 'scala', 'merchant', 'c#', 'ruby', 'p..."
1,software engineer intern (summer 2020),"['Go', 'java', 'c++', 'gin', 'python']"
2,software engineer summer intern - java,"['scripting', 'java', 'sql', 'perl', 'c++', 'p..."
3,"software engineer - backend, c++","['java', 'c++', 'c', 'gin', 'javascript', 'pyt..."
4,ml infrastructure software engineer,"['scripting', 'java', 'scala', 'oop', 'machine..."


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166 entries, 0 to 165
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    166 non-null    object
 1   skills  166 non-null    object
dtypes: object(2)
memory usage: 2.7+ KB


# Recommendation System

In [70]:
def recommend_jobs(user_skills, df, top_n=5):
    user_skills_set = set(user_skills.split(', '))

    def count_matching_skills(job_skills):
        job_skills_set = set(eval(job_skills))
        return len(user_skills_set.intersection(job_skills_set))

    # Count matching skills for each job
    df['matching_skills'] = df['skills'].apply(count_matching_skills)

    # Start with all skills and gradually reduce
    for required_skills in range(len(user_skills_set), 0, -1):
        df_filtered = df[df['matching_skills'] >= required_skills]

        if not df_filtered.empty:
            vectorizer = TfidfVectorizer()

            job_skills_tfidf = vectorizer.fit_transform(df_filtered['skills'].apply(lambda x: ' '.join(eval(x))))
            user_skills_tfidf = vectorizer.transform([user_skills])
            similarity_scores = cosine_similarity(user_skills_tfidf, job_skills_tfidf)

            df_filtered['similarity'] = similarity_scores.flatten()

            # Sort jobs by similarity score (descending)
            df_sorted = df_filtered.sort_values(by=['matching_skills', 'similarity'], ascending=[False, False])

            # Get the top N recommendations
            top_recommendations = df_sorted.head(top_n)[['Name', 'matching_skills', 'similarity', 'skills']]
            return top_recommendations

    # If no jobs found even with one skill, return empty DataFrame
    return pd.DataFrame(columns=['Name', 'matching_skills', 'similarity', 'skills'])


# Example
# user_skills = "python, data analysis, machine learning, sql, javascript, aws"
user_skills = "python, django, aws"

recommendations = recommend_jobs(user_skills, df)
print(f"Top recommendations for skills: {user_skills}\n")
if not recommendations.empty:
    for index, row in recommendations.iterrows():
        print(f"Job Title: {row['Name']}")
        print(f"Matching Skills: {row['matching_skills']} out of {len(user_skills.split(', '))}")
        print(f"Similarity Score: {row['similarity']:.2f}")
        print(f"Job Skills: {', '.join(eval(row['skills']))}")
        print("-" * 50)
else:
    print("No matching jobs found.")

Top recommendations for skills: python, django, aws

Job Title: software engineer in test (swit)
Matching Skills: 3 out of 3
Similarity Score: 0.52
Job Skills: docker, aws, apache airflow, django, nodejs, python
--------------------------------------------------
Job Title: software engineer - mobile/back-end
Matching Skills: 3 out of 3
Similarity Score: 0.27
Job Skills: scripting, Go, gin, aws, data architecture, Object-oriented programming, Node.js, ai, django, algorithms, ruby, machine learning, Git, rest, database, javascript, python
--------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['similarity'] = similarity_scores.flatten()
