### Career Path Recommendations & People to Connect With

### Preprocess

Assumptions:
1. If a LinkedIn user hasn't provided their start date in the format month + year and has used only the year, we assume they started working on 01.01.YEAR. In all other cases, we assume they started working on 01.MONTH.YEAR.
2. The date used to order positions is the start date.


In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import sparknlp
from pyspark.sql.functions import col, expr, explode, when, split, length,  to_date, concat_ws, lower
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from collections import Counter
from pyspark.ml.feature import Word2Vec
from sklearn.cluster import DBSCAN
import numpy as np
from collections import Counter

In [0]:
import sparknlp
# Create the Spark session through Spark NLP's helper.
spark = sparknlp.start()
# Load the people-profiles and companies datasets from parquet.
profiles = spark.read.parquet('/dbfs/linkedin_people_train_data')
companies = spark.read.parquet('/dbfs/linkedin_train_data')
# Keep (id, followers) aside: 'followers' is dropped from `profiles` in a later cell,
# and the final cell joins this frame back in.
# NOTE(review): the name is misspelled ("profiels"); the final cell refers to it by this exact name.
profiels_followers = profiles.select("id", "followers")
# Seniority markers that are stripped from job titles during normalization.
levels_to_remove = ["senior", "junior", "sr", "i", "ii"]
# Alternation of the markers above; later cells embed it in a word-bounded,
# case-insensitive regular expression.
regex_pattern = "|".join(levels_to_remove)

In [0]:
# Sanity check: number of rows in each loaded dataset.
print(profiles.count())
print(companies.count())

2768313
1339790


In [0]:
# Inspect the companies schema.
# NOTE(review): `companies` is not referenced by any later cell shown here.
companies.printSchema()

In [0]:
# Profile columns that are not needed to reconstruct career paths.
cols_to_drop = [
    'about', 'avatar', 'certifications', 'city', 'country_code',
    'current_company', 'current_company:company_id', 'current_company:industry',
    'current_company:name', 'education', 'educations_details', 'followers',
    'following', 'groups', 'image', 'languages', 'people_also_viewed', 'posts',
    'recommendations', 'recommendations_count', 'timestamp', 'url',
    'volunteer_experience', 'courses',
]
profiles = profiles.drop(*cols_to_drop)

# One row per experience entry; each row carries that entry's array of positions.
positions = profiles.select("id", "name", explode("experience.positions").alias("position"))

# Discard untitled positions, then keep only (start_date, title) of the remaining ones.
positions = (
    positions
    .withColumn("position", expr("filter(position, x -> x.title is not null)"))
    .withColumn("position", expr("transform(position, x -> struct(x.start_date, x.title))"))
)

# Flatten the array so every position becomes its own row, and surface its
# two fields as top-level columns.
positions = (
    positions
    .select("id", "name", explode("position").alias("position"))
    .withColumn("start_date", col("position.start_date"))
    .withColumn("title", col("position.title"))
)

# Rows whose position struct is null carry no information.
positions = positions.filter(col("position").isNotNull())
positions.limit(100).display()

In [0]:
# Month abbreviation -> month number (as a string, ready for date assembly).
month_map = {
    "Jan": "1", "Feb": "2", "Mar": "3", "Apr": "4", "May": "5", "Jun": "6",
    "Jul": "7", "Aug": "8", "Sep": "9", "Oct": "10", "Nov": "11", "Dec": "12"
}

# A 4-character start_date holds only a year; otherwise it is split on the space.
year_only = length(col("start_date")) == 4
date_parts = split(col("start_date"), " ")

# Month: "1" for year-only dates, otherwise the first token of the date with
# any abbreviation found in month_map translated to its number.
raw_month = when(year_only, "1").otherwise(date_parts[0])
month_number = raw_month
for abbreviation, number in month_map.items():
    month_number = when(raw_month == abbreviation, number).otherwise(month_number)

# Year: the whole value for year-only dates, otherwise the second token.
year_number = when(year_only, col("start_date")).otherwise(date_parts[1])

# Assemble "<year>-<month>-1" and parse it into the "start date" column.
positions_df = (
    positions
    .withColumn("start_month", month_number)
    .withColumn("start_year", year_number)
    .withColumn(
        "start date",
        to_date(concat_ws("-", col("start_year"), col("start_month"), expr("1")), "yyyy-M-d"),
    )
)

# The raw and intermediate date columns are no longer needed.
drop_cols = ["start_date", "start_month", "start_year"]
positions_df = positions_df.drop(*drop_cols)
positions_df.limit(10).display()

In [0]:
# One (start_date, title) struct per position. start_date is the first field,
# so sorting the structs orders each person's positions by start date.
dated_title = F.struct(
    F.col("start date").alias("start_date"),
    F.col("position.title").alias("title"),
)
position_history = F.sort_array(F.collect_list(dated_title), asc=True).alias("position_history")

# Per person: the ordered titles and their matching start dates as parallel arrays.
result_df = positions_df.groupBy("name", "id").agg(position_history).select(
    "name",
    "id",
    F.col("position_history.title").alias("titles"),
    F.col("position_history.start_date").alias("start_dates"),
)

display(result_df.limit(10))

##### Select your dream job:

In [0]:
#USER SELECTIONS
# dream_job is matched as a substring of lower-cased job titles, so it must be
# written in lower case (the filter in the next cell does not lower-case it).
# Alternative values that were tried:
#dream_job = 'data scientist'
#dream_job = 'data engineer'
#dream_job = 'hrbp'
dream_job = 'data analyst'
# Upper bound applied to "steps_before_dream_job" when selecting which steps to count.
num_of_steps = 5

In [0]:
# People who have held the dream job at some point, identified by profile id.
# Names are not unique: matching on "name" would pull in the histories of
# unrelated namesakes and repeat a person once per matching position.
relevant_people = (
    positions_df
    .filter(lower(col("title")).contains(dream_job.lower()))
    .select("id", col("name").alias("name_relevant"))
    .dropDuplicates(["id"])
)

# Ordered position history of exactly those people. A left-semi join keeps
# result_df's columns (name, id, titles, start_dates) and returns each person once.
relevant_people_path = result_df.join(relevant_people, on="id", how="left_semi")
display(relevant_people_path.limit(100))

In [0]:
# SQL for the 1-based index of the first title that contains the dream job
# (0 when no title matches). Built once and reused so the expressions below
# cannot drift apart.
dream_job_index_sql = (
    "array_position("
    "transform(titles, x -> CASE WHEN lower(x) LIKE "
    f"'%{dream_job.lower()}%'"
    " THEN 1 ELSE 0 END), 1)"
)

# Titles held before the first dream-job position (empty when there are none).
relevant_people_path = relevant_people_path.withColumn(
    "path_to_dream_job",
    F.expr(
        f"IF({dream_job_index_sql} > 0, "
        f"slice(titles, 1, {dream_job_index_sql} - 1), array())"
    ),
)

# Countdown aligned with path_to_dream_job: [n, ..., 2, 1].
# The guard is "> 1" because sequence(1, 0) would yield [1, 0] rather than an
# empty array when the dream job is the very first title.
relevant_people_path = relevant_people_path.withColumn(
    "steps_to_dream_job",
    F.expr(
        f"IF({dream_job_index_sql} > 1, "
        f"reverse(sequence(1, {dream_job_index_sql} - 1)), array())"
    ),
)

# One row per step, remembering the step's 0-based index within the path.
relevant_people_path = relevant_people_path.select(
    "*",
    F.posexplode_outer("path_to_dream_job").alias("step_index", "step_title"),
)

# Number of steps before the dream job: 1 for the position held immediately
# before it, 2 for the one before that, and so on. Looking the title up with
# array_position would give the index from the START of the career instead,
# and would collapse repeated titles onto their first occurrence.
relevant_people_path = relevant_people_path.withColumn(
    "steps_before_dream_job",
    (F.size(F.col("path_to_dream_job")) - F.col("step_index")).cast("long"),
).drop("step_index")

# Drop people with no position before the dream job.
relevant_people_path = relevant_people_path.filter(F.size(F.col("path_to_dream_job")) > 0)

# Remove exact duplicate rows.
relevant_people_path = relevant_people_path.dropDuplicates()

relevant_people_path.printSchema()

# Normalize step titles: strip ',', '/', '-' and seniority markers, remove '.'
# and '@', collapse whitespace runs, trim, and lower-case.
relevant_people_path = relevant_people_path.withColumn(
    "step_title",
    F.lower(  # Convert to lowercase
        F.trim(  # Remove leading and trailing spaces
            F.regexp_replace(
                F.regexp_replace(
                    F.regexp_replace(
                        F.regexp_replace(F.col("step_title"), f"(?i)[,/-]|\\b({regex_pattern})\\b", ""),  # Clean special chars
                        r"\.",  # Replace '.' with an empty string
                        ""
                    ),
                    r"@",  # Remove '@' characters
                    ""
                ),
                r"\s+",  # Replace multiple spaces with a single space
                " "
            )
        )
    )
)

# Average distance to the dream job for each normalized step title.
avg_steps_df = relevant_people_path.select("step_title", "steps_before_dream_job")
avg_steps_df = avg_steps_df.groupBy("step_title").agg(F.avg("steps_before_dream_job").alias("avg_steps_before_dream_job"))

avg_steps_df.limit(10).display()
relevant_people_path.limit(30).display()

In [0]:
# Step 1: Keep only rows whose steps_before_dream_job is at most num_of_steps.
up2_steps = relevant_people_path.filter(F.col("steps_before_dream_job") <= num_of_steps)

# Steps 2-4: Count occurrences of each step title. The count is done in Spark:
# collecting every row to the driver does not scale, and building a DataFrame
# from an empty Python list fails because no schema can be inferred.
up2_steps_positions_df = up2_steps.groupBy("step_title").count().select(
    F.col("step_title").alias("Position"),
    F.col("count").alias("Count"),
)

# Step 5: Normalize the 'Position' column (cleaning and standardizing)
normalized_up2_steps_positions_df = up2_steps_positions_df.select(
    F.lower(  # Convert to lowercase
        F.trim(  # Remove leading and trailing spaces
            F.regexp_replace(
                F.regexp_replace(
                    F.regexp_replace(
                        F.regexp_replace(F.col("Position"), f"(?i)[,/-]|\\b({regex_pattern})\\b", ""),  # Clean up special chars
                        r"\.",  # Replace '.' with an empty string
                        ""
                    ),
                    r"@",  # Remove '@' characters
                    ""
                ),
                r"\s+",  # Replace multiple spaces with a single space
                " "
            )
        )
    ).alias("Position"),
    F.col("Count")
)

# Step 6: Group by the normalized 'Position' column and sum the 'Count'
grouped_positions_df = normalized_up2_steps_positions_df.groupBy("Position").agg(
    F.sum("Count").alias("Total Count")
)

# Step 7: Attach the average number of steps, then sort. Sorting is done last
# because a join does not preserve the order of its inputs.
sorted_up2_steps_positions_df = (
    grouped_positions_df
    .join(avg_steps_df, F.col("Position") == F.col("step_title"))
    .select("Position", "Total Count", "avg_steps_before_dream_job")
    .orderBy("Total Count", ascending=False)
)

# Step 8: Display the final sorted DataFrame
sorted_up2_steps_positions_df.display()

Up to this point, we have extracted the positions that LinkedIn users held before they started working in the given dream position.
Now we will organize the data so that the user can infer from it what might help them. As a first step, we group similar positions together: we embed each position name with Word2Vec and use DBSCAN to find clusters.

In [0]:
# Distinct normalized positions to cluster. A dedicated name is used so that
# the `positions_df` built during preprocessing is not overwritten; otherwise
# the earlier cells could not be re-run (e.g. for another dream_job) after this one.
cluster_positions_df = sorted_up2_steps_positions_df.select("Position")
# Sorted so that the input order of the model is the same on every run.
positions_list = sorted({row["Position"] for row in cluster_positions_df.collect()})
if not positions_list:
    raise ValueError("No positions to cluster: no step titles were found for the selected dream job.")

# Tokenize job titles (split into words)
tokenized_positions = [[word.lower() for word in position.split()] for position in positions_list]

# Train Word2Vec on the tokenized titles
word2vec = Word2Vec(vectorSize=70, minCount=1, inputCol="tokens", outputCol="vector")
positions_spark_df = spark.createDataFrame(pd.DataFrame({"tokens": tokenized_positions, "Position": positions_list}))
model = word2vec.fit(positions_spark_df)

# Generate an embedding for each job title
positions_with_vectors = model.transform(positions_spark_df)

# Collect every vector together with its position, so that a cluster label is
# paired with the right title regardless of the order Spark returns rows in.
vector_rows = positions_with_vectors.select("Position", "vector").collect()
clustered_positions = [row["Position"] for row in vector_rows]
embeddings = np.array([row["vector"].toArray() for row in vector_rows])

# DBSCAN on cosine distance; label -1 marks positions that belong to no cluster
dbscan = DBSCAN(eps=0.08, min_samples=2, metric="cosine")
clusters = dbscan.fit_predict(embeddings)

# Add cluster labels with a single join. Rewriting the column once per position
# nests one conditional per title and makes the query plan grow with the data.
# Labels are converted from numpy integers to plain Python ints for Spark.
cluster_labels_df = spark.createDataFrame(
    [(position, int(label)) for position, label in zip(clustered_positions, clusters)],
    schema="Position string, Cluster int",
)
positions_with_clusters = positions_with_vectors.join(cluster_labels_df, on="Position", how="inner")

# Group by Cluster
positions_clusters = positions_with_clusters.groupBy("Cluster").agg(F.collect_list("Position").alias("Positions"))

# Name each cluster after its most frequent words (see assign_cluster_title).
from collections import Counter

# Words that are ignored when naming a cluster
STOP_WORDS = {"the", "is", "in", "and", "to", "a", "of", "on", "for", "with", "at", "by", "an", "be", "this", "that"}


def assign_cluster_title(cluster_id, positions):
    """Return a short title for a cluster of position names.

    Cluster ``-1`` is always titled ``"Other"``. Otherwise the words of all
    positions are lower-cased, stop words are removed, and the title is built
    from the two most frequent words. Only the most frequent word is used when
    it is the sole distinct word, or when it occurs more than twice as often
    as the runner-up and the runner-up itself occurs more than once.
    ``"Unknown"`` is returned when no countable word remains.

    Args:
        cluster_id: Cluster label; ``-1`` marks positions outside any cluster.
        positions: Iterable of position-name strings in the cluster.

    Returns:
        The cluster title as a string.
    """
    if cluster_id == -1:
        return "Other"  # Outliers get an explicit label

    # Count words in order of first appearance so that ties are broken the
    # same way on every call.
    word_counts = Counter()
    for position in positions:
        for word in position.split():
            lowered = word.lower()
            if lowered not in STOP_WORDS:
                word_counts[lowered] += 1

    top_words = word_counts.most_common(2)
    if not top_words:
        return "Unknown"  # Nothing left to build a title from
    if len(top_words) == 1:
        return top_words[0][0]

    (first_word, first_count), (second_word, second_count) = top_words
    first_dominates = first_count > 2 * second_count and second_count > 1
    if first_dominates:
        return first_word
    return f"{first_word} {second_word}"


# Wrap the naming function as a Spark UDF (no return type given, so Spark's default applies).
assign_cluster_title_udf = F.udf(assign_cluster_title)

# Title of each cluster, derived from its label and its member positions.
cluster_title = assign_cluster_title_udf(F.col("Cluster"), F.col("Positions")).alias("Cluster_Title")
positions_clusters_with_titles = positions_clusters.select("*", cluster_title)

# Number of positions that fell into each cluster.
cluster_size = F.size(F.col("Positions")).alias("Num of positions in cluster")
positions_clusters_with_titles = positions_clusters_with_titles.select("*", cluster_size)

# Display the final clusters with their titles.
positions_clusters_with_titles.display()

In [0]:
# One row per (cluster, member position); the full Positions array is kept alongside.
exploded_df = positions_clusters_with_titles.select(
    "Cluster",
    "Cluster_Title",
    "Positions",
    "Num of positions in cluster",
    F.explode("Positions").alias("Exploded_Position"),
)

# Attach each member position's user count and average number of steps.
joined_df = exploded_df.join(
    sorted_up2_steps_positions_df,
    on=F.col("Exploded_Position") == F.col("Position"),
    how="inner",
)

# Position -> cluster title lookup, used later to tag people.
position_title = joined_df.select("Cluster_Title", "Position")

joined_df_drop_cols = ["Cluster", "Exploded_Position", "Position"]
joined_df = joined_df.drop(*joined_df_drop_cols)

# Per cluster: total number of users and the mean of its positions' average steps.
users_in_cluster = F.sum("Total Count").alias("Number of Users")
mean_steps = F.avg("avg_steps_before_dream_job").alias("Average Steps Before Dream Job")
joined_df_final = joined_df.groupBy("Cluster_Title", "Positions").agg(users_in_cluster, mean_steps)

joined_df_final.display()


In the table above, you can see the final result. I suggest creating a basic dashboard or an interactive word cloud from this information.

The table below lists people we could recommend the user connect with, based on the path to their dream job.

In [0]:
# Keep the identifying columns and step information, then add each person's follower count.
relevant_people_path = (
    relevant_people_path
    .select("name", "id", "step_title", "steps_before_dream_job")
    .join(profiels_followers, on="id")
)

# Keep only steps whose title is one of the clustered positions, and tag each
# with its cluster title.
relevant_people_path = relevant_people_path.join(
    position_title,
    relevant_people_path.step_title == position_title.Position,
    "inner",
)

# Display the resulting table.
relevant_people_path.display()