preperations

In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

spark = SparkSession.builder.getOrCreate()

In [0]:
profiles = spark.read.parquet('/linkedin/people')
profiles.display()

In [0]:
profiles1 = profiles.select('id','about','education','experience','сourses','url')
profiles1.display()


In [0]:
profiles1.display()

changing the experience field to be a list of career titles, in chronological order 

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, collect_list, upper, struct, sort_array, expr
from functools import reduce

# Assuming spark is your SparkSession
spark = SparkSession.builder.appName("sorted_data_titles").getOrCreate()

# Load your DataFrame (df is your existing DataFrame)
# df = ...

# Explode the experience column to work with individual elements
exploded_df = profiles1.withColumn("exploded_experience", explode("experience"))

# Filter out experiences with titles containing "data"
keywords = ["machine learning" , "artificial intelligence", "data"]

# Create a dynamic filter expression that includes all keywords
filter_expression = reduce(lambda a, b: a | b, 
                           [upper(col("exploded_experience.title")).like(f"%{keyword.upper()}%") for keyword in keywords])

filtered_df = exploded_df.filter(filter_expression)

# Select the ID and structure the experiences with title and start_date for sorting
# Assume start_date is in a sortable format (e.g., YYYY-MM-DD)
structured_df = filtered_df.select(
    "id",
    struct(
        col("exploded_experience.title"),
        col("exploded_experience.start_date").alias("sort_key")
    ).alias("structured_experience")
)

# Group by ID, aggregate structured experiences, sort them by start_date, and extract titles
final_df = structured_df.groupBy("id").agg(
    sort_array(collect_list("structured_experience"), asc=True).alias("sorted_experiences")
).select(
    "id",
    # Use a transform function to map over sorted_experiences and extract only the title from each struct
    # This step requires Spark 3.0+ for the transform function
    expr("transform(sorted_experiences, exp -> exp.title) as sorted_titles")
)

profiles_with_careers = profiles1.join(final_df, on='id', how='inner')

# Display the resulting DataFrame
profiles_with_careers.display()

changing the education field to be a list of education fields, in chronological order

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, collect_list, upper, struct, sort_array, expr

# Assuming spark is your SparkSessio
# Load your DataFrame (df is your existing DataFrame)
# df = ...

# Explode the experience column to work with individual elements
exploded_df1 = profiles1.withColumn("exploded_education", explode("education"))

# Select the ID and structure the experiences with title and start_date for sorting
# Assume start_date is in a sortable format (e.g., YYYY-MM-DD)
structured_df1 = exploded_df1.select(
    "id",
    struct(
        col("exploded_education.field"),
        col("exploded_education.end_year").alias("sort_key")
    ).alias("structured_education")
)

# Group by ID, aggregate structured experiences, sort them by start_date, and extract titles
final_df1 = structured_df1.groupBy("id").agg(
    sort_array(collect_list("structured_education"), asc=True).alias("sorted_education")
).select(
    "id",
    # Use a transform function to map over sorted_experiences and extract only the title from each struct
    # This step requires Spark 3.0+ for the transform function
    expr("transform(sorted_education, exp -> exp.field) as sorted_fields")
)

profiles_with_careers = profiles_with_careers.join(final_df1, on='id', how='left')

# Display the resulting DataFrame
profiles_with_careers.display()

changing the courses field to be a list of course titles, in chronological order

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, collect_list, upper, struct, sort_array, expr

# Assuming spark is your SparkSessio
# Load your DataFrame (df is your existing DataFrame)
# df = ...

# Explode the experience column to work with individual elements
exploded_df1 = profiles1.withColumn("exploded_courses", explode("сourses"))

# Select the ID and structure the experiences with title and start_date for sorting
# Assume start_date is in a sortable format (e.g., YYYY-MM-DD)
structured_df1 = exploded_df1.select(
    "id",
    struct(
        col("exploded_courses.title"),
    ).alias("structured_courses")
)

# Group by ID, aggregate structured experiences, sort them by start_date, and extract titles
final_df1 = structured_df1.groupBy("id").agg(
    sort_array(collect_list("structured_courses"), asc=True).alias("sorted_courses")
).select(
    "id",
    # Use a transform function to map over sorted_experiences and extract only the title from each struct
    # This step requires Spark 3.0+ for the transform function
    expr("transform(sorted_courses, exp -> exp.title) as sorted_courses")
)

profiles_with_careers = profiles_with_careers.join(final_df1, on='id', how='left')

# Display the resulting DataFrame
profiles_with_careers.display()

In [0]:
profiles_with_careers = profiles_with_careers.drop("education" , "experience" , "сourses")

profiles_with_careers = profiles_with_careers \
    .withColumnRenamed("sorted_fields","education") \
    .withColumnRenamed("sorted_courses","courses") \
    .withColumnRenamed("sorted_titles","experience")

profiles_with_careers.display()

id,about,url,experience,education,courses
%e2%80%9dmichael-w-svoboda%e2%80%9d-6434533,"shipping/ups/postal some printer/copier repair on the spot helper Specialties: mail room, friendship,",https://www.linkedin.com/in/%e2%80%9dmichael-w-svoboda%e2%80%9d-6434533,"List(Data Communications Manager / Operations Liaison, data com mgr.)","List(null, school)",
1jasonriley,"As an analytics management specialist, I am responsible for overseeing various aspects of data analysis and reporting. I work closely with different lines of business to identify informational needs and develop specific data requirements that can provide insight and support decision-making. This includes determining the necessary capabilities, tools, and data to effectively leverage our analytics systems and ecosystem to support the needs of our business partners and various initiatives and projects. Additionally, I prepare presentations and other materials to help communicate the results of our analysis and support overall business strategy. Expertise: Data Science, SQL, Database Development, Brokerage, Banking, Wealth Management, Fraud Analytics, Audit Analytics, Data Analytics and Reporting, Risk Management, Operational Risk, Mortgage Fraud, Fraud Prevention, Money Movement, SAS Enterprise Guide, Large Scale Project Management, Acquisitions and Mergers.",https://www.linkedin.com/in/1jasonriley,List(Marketing Database Decision Strategist),"List(null, null, null)",
777bobcarroll,"Although retired for many years, I am open to a return to the active workforce, possibly working for the federal government overseas. My background is primarily IT. However, I am likewise open to other potential employment where my skills will be useful. I was an IT professional with a solid background as systems analyst, applications & systems programmer, and systems administrator in a mixed UNIX + Windows environment, relied upon as an effective troubleshooter of hardware, software, and networking. I had extensive experience in technical support, research, web development, database (Oracle/MySQL), multi-platform integration, and technical & customer documentation. I had supervisory and administrative experience, a strong work ethic, plus good interpersonal and writing skills. I have ability to organize complex systems, master details, but remain clearly focused on the big picture.",https://www.linkedin.com/in/777bobcarroll,List(Data Processing Manager (GS-12)),"List(Computer Science, GPA 4.0, Psychology, minors: Journalism, English)",
Navyayeravelli5798,,https://www.linkedin.com/in/Navyayeravelli5798?_l=en_us,List(Data Analyst),"List(null, Electronics and communication engineering)",
a-rister,"Bioinformatics chemist with mass spectrometry and systems biology experience who enjoys finding the story in big biological data using Python, Excel, R and SQL.",https://www.linkedin.com/in/a-rister,List(Data Interpretation Chemist),"List(Chemistry, Chemistry, Sexuality and Gender Studies)",
a-vinod-ramarao-569a3b12a,,https://www.linkedin.com/in/a-vinod-ramarao-569a3b12a,List(Data Science Manager),,
aakdag,"Analyst with in-depth experience in finance and tech. Enjoy translating raw data into consumable information to various stakeholders across the organization and be the bridge between departments. Design and implement business processes, import data into modeling templates, create and maintain database tables. Continuously learn new technologies and never stop being passionate about data. Expert level knowledge in Excel, SQL, Oracle, Power BI, Great Plains, Perfect Law, NetFORUM, Deltek, Salesforce, etc.",https://www.linkedin.com/in/aakdag,List(Business Applications Data Analyst),"List(null, null, null)","List(Scraping and Datamining, Six Sigma Yellow Belt Training)"
aanchal-k-66bbb081,"I am a lead data engineer at LTI Mindtree. I collaborate cross-functionally to design, develop, and deploy data models, pipelines, and dashboards that inform data-driven decision-making. Specialties: Experienced in the design and architecture of Data Warehouses and OLAP systems, multi-dimensional and relational data modeling, Extraction-Transformation-Load (ETL) processes, and data model review and database performance tuning. In-depth knowledge of processes and systems involved in the software development life cycle (SDLC). This includes proper use of version control systems and the creation of software build/installation/deployment best practices. I possess a robust academic foundation, holding a Bachelor of Technology (B Tech) degree from the esteemed National Institute of Technology (NIT) Kurukshetra.",https://www.linkedin.com/in/aanchal-k-66bbb081,List(Lead Data Engineer),"List(Electrical and Electronics Engineering, Operations Research)","List(Bayesian Statistics- Coursera, Intro to SQL for Data Science - Data Camp, Statistical Learning - Stanford Online)"
aarash-heydari-bb7105129,"Concerning technology, Heidegger quoted Hölderlin: “But where the danger is, also grows the saving power.”",https://www.linkedin.com/in/aarash-heydari-bb7105129,List(Deep Learning / Big Data Researcher),List(Computer Science),
aaron-guiggey-cpa-403a90106,"Experienced Certified Public Accountant with a demonstrated history of working in the accounting industry. Skilled in tax preparation, tax planning and preparation, audits, reviews, compilations, bookkeeping and payroll services.",https://www.linkedin.com/in/aaron-guiggey-cpa-403a90106,List(Broadcast Database Transcriber and Electronic Reader),"List(Accounting, Botany/Plant Biology)",


In [0]:
pandas_df = profiles_with_careers.toPandas()
pandas_df

Unnamed: 0,id,about,url,experience,education,courses
0,%e2%80%9dmichael-w-svoboda%e2%80%9d-6434533,shipping/ups/postal some printer/copier repair...,https://www.linkedin.com/in/%e2%80%9dmichael-w...,[Data Communications Manager / Operations Liai...,"[None, school]",
1,1jasonriley,"As an analytics management specialist, I am re...",https://www.linkedin.com/in/1jasonriley,[Marketing Database Decision Strategist],"[None, None, None]",
2,777bobcarroll,"Although retired for many years, I am open to ...",https://www.linkedin.com/in/777bobcarroll,[Data Processing Manager (GS-12)],"[Computer Science, GPA 4.0, Psychology, minors...",
3,Navyayeravelli5798,,https://www.linkedin.com/in/Navyayeravelli5798...,[Data Analyst],"[None, Electronics and communication engineering]",
4,a-rister,Bioinformatics chemist with mass spectrometry ...,https://www.linkedin.com/in/a-rister,[Data Interpretation Chemist],"[Chemistry, Chemistry, Sexuality and Gender St...",
...,...,...,...,...,...,...
31883,zhuoli-cai-864241230,,https://www.linkedin.com/in/zhuoli-cai-864241230,"[Content Marketing Data Analyst Group Intern, ...","[Business Data Analyst, Military, business dat...",
31884,zhuyun-chen-276572130,Master of Computer Science at Boston Universit...,https://www.linkedin.com/in/zhuyun-chen-276572130,[Machine Learning Engineer],"[Computer Science, computer science & psychology]",
31885,ziqi-zhong,This is Ziqi! I am a graduate student at Carna...,https://www.linkedin.com/in/ziqi-zhong,"[Data Analyst - Capstone Project, Data Science...","[International Relations, International Relati...","[Advanced Business Analytics, Advanced Relatio..."
31886,zknights,,https://www.linkedin.com/in/zknights,[Data Architect - Team Lead],"[None, Chemical Engineering]",


data of people we know, will be used as examples for evaluation

In [0]:
import pandas as pd

file='/Workspace/Users/alonaricha@campus.technion.ac.il/data_profiles_test.csv'

# Convert the list of rows to a Pandas DataFrame
test_df = pd.read_csv(file)

# Display the new DataFrame
test_df

Unnamed: 0,id,about,education,experience,courses,q1,q2,q3,q4
0,212779797,Data science and engineering Student at Techni...,Data science,data engineer,,yes,yes,no,no
1,212708754,data science & engineering student,Data science,intern data analyst,,no,no,yes,yes
2,212777676,data science & engineering student,Data science,student data engineer,,yes,no,no,yes
3,213376551,industrial engineering student,industrial engineering,intern data analyst,,no,no,yes,yes
4,213101819,Data science and engineering master,data science,intern data scientist,,yes,yes,yes,no


generating careers using GPT's API (Gemini)

In [0]:
!pip install -q -U google-generativeai

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-0aef2f5f-bd59-4ee7-8a05-1d4115d824ab/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

api_key = "AIzaSyDbyUmQ5f4NAKqrxbUujMtc3xi56Jz0P9s"
genai.configure(api_key=api_key)

api_model = genai.GenerativeModel('gemini-pro')

In [0]:
import pandas as pd
import csv

all_results=[]

# Assuming 'final_df' is your DataFrame with a column named 'column'
for idx, row in test_df.iterrows():
    # Construct the query string using the value from the specified column
    query = f"I have someone with job history of {row['experience']} and also we asked them a few questions with those answers:\n"\
            f"When considering your career, do you like to explore new directions? {row['q1']},\n"\
            f"Are you comfortable taking risks in your career to pursue new opportunities? {row['q2']},\n"\
            f"Do you like working as part of a team in your career? {row['q3']},\n"\
            f"Do you enjoy mentoring and guiding others? {row['q4']},\n"\
            "what 3 career paths should they explore?\n"\
            "the answer format should be 'career1,career2,career3'"
    
    # Generate content based on the query
    response = api_model.generate_content(query)

    # Extract the generated text from the response object
    generated_text = to_markdown(response.text).data.lstrip(">")
    
    # Parse the response to extract career paths
    career_paths = generated_text.split(',')[:3]  # Assuming API returns career paths separated by semicolons
    
    # # Append the result to the list of all results
    all_results.append({'id': idx, 'career1': career_paths[0], 'career2':career_paths[1], 'career3':career_paths[2]})

results_df = pd.DataFrame(all_results)

# Set the index of the results_df to match the 'id' column
results_df.set_index('id', inplace=True)

# Concatenate final_df with results_df
final_df_with_results = pd.concat([test_df, results_df], axis=1)

evaluation df after concatinating it with careers from the GPT's generation

In [0]:
final_df_with_results

Unnamed: 0,id,about,education,experience,courses,q1,q2,q3,q4,career1,career2,career3
0,212779797,Data science and engineering Student at Techni...,Data science,data engineer,,yes,yes,no,no,Data Scientist,Machine Learning Engineer,Software Engineer
1,212708754,data science & engineering student,Data science,intern data analyst,,no,no,yes,yes,Data Analytics Manager,Data Scientist,Business Analyst
2,212777676,data science & engineering student,Data science,student data engineer,,yes,no,no,yes,Data Scientist,Data Analyst,Machine Learning Engineer
3,213376551,industrial engineering student,industrial engineering,intern data analyst,,no,no,yes,yes,Data Engineer,Data Scientist,Software Engineer
4,213101819,Data science and engineering master,data science,intern data scientist,,yes,yes,yes,no,Data Engineer,Machine Learning Engineer,Data Analyst


filtering data for scraping and training

In [0]:
new_pandas=pandas_df[pandas_df['experience'].apply(lambda x: len(x) > 1)]
new_pandas

Unnamed: 0,id,about,url,experience,education,courses
0,%e2%80%9dmichael-w-svoboda%e2%80%9d-6434533,shipping/ups/postal some printer/copier repair...,https://www.linkedin.com/in/%e2%80%9dmichael-w...,[Data Communications Manager / Operations Liai...,"[None, school]",
13,aaron-watkins-57057821,Forging Tangible Business Strategies by Unifyi...,https://www.linkedin.com/in/aaron-watkins-5705...,"[Deepwater Mud logging/ Data engineer, Documen...","[None, None, Business Administration & Law, Ge...",
18,aatif-momin-13b2161a5,16 plus years of experience working in the Fin...,https://www.linkedin.com/in/aatif-momin-13b2161a5,"[Lead Business / Data Analyst, Senior Business...","[Computer Science, Information Technology, Sci...",
24,abby-liu,,https://www.linkedin.com/in/abby-liu,"[Capstone Data Analyst, Data Analyst Intern, P...",[Business Administration and Management; Econo...,
25,abbyoverby,"Data, Analytics and BI Engineer with a backgro...",https://www.linkedin.com/in/abbyoverby/,"[Data & Analytics Consultant, Data Analyst, En...","[None, None, None]",
...,...,...,...,...,...,...
31878,zal-patel-771a24281,,https://www.linkedin.com/in/zal-patel-771a24281,"[Data Analyst, Data Analyst]","[None, Information Technology]",
31880,zenon-cuellar-0a819337,Professional Database Developer moving towards...,https://www.linkedin.com/in/zenon-cuellar-0a81...,"[Database Developer, Database Developer, Senio...","[Computer Science, Minor in Mathematics]",
31881,zhihui-v-chen,"Passionate on AI (NLP, Robotics). Ex-Googler, ...",https://www.linkedin.com/in/zhihui-v-chen,"[Director of Artificial Intelligence, Head of ...","[None, Computer Science]",
31883,zhuoli-cai-864241230,,https://www.linkedin.com/in/zhuoli-cai-864241230,"[Content Marketing Data Analyst Group Intern, ...","[Business Data Analyst, Military, business dat...",


In [0]:
list=new_pandas['experience']
list

Out[161]: 0        [Data Communications Manager / Operations Liai...
13       [Deepwater Mud logging/ Data engineer, Documen...
18       [Lead Business / Data Analyst, Senior Business...
24       [Capstone Data Analyst, Data Analyst Intern, P...
25       [Data & Analytics Consultant, Data Analyst, En...
                               ...                        
31878                         [Data Analyst, Data Analyst]
31880    [Database Developer, Database Developer, Senio...
31881    [Director of Artificial Intelligence, Head of ...
31883    [Content Marketing Data Analyst Group Intern, ...
31885    [Data Analyst - Capstone Project, Data Science...
Name: experience, Length: 6488, dtype: object

In [0]:
flattened_list = [item for sublist in list for item in sublist]
flattened_list

Out[162]: ['Data Communications Manager / Operations Liaison',
 'data com mgr.',
 'Deepwater Mud logging/ Data engineer',
 'Document Control & Data Management',
 'Lead Business / Data Analyst',
 'Senior Business / Data Analyst',
 'Capstone Data Analyst',
 'Data Analyst Intern',
 'Project Data Analyst',
 'Senior Data Analyst',
 'Data & Analytics Consultant',
 'Data Analyst, Engineering',
 'Data Engineer',
 'Senior Data Engineer',
 'Data Innovation Project Manager',
 'Project Manager - Data Tech',
 'Data Science Intern',
 'Senior Data Scientist',
 'Data Scientist Intern',
 'Mergers & Acquisitions Data Analyst Intern',
 'Principal Data Scientist',
 'Data Analyst',
 'Machine Learning Engineer',
 'Client Data Processor',
 'Data Conversion Operator',
 'Data Analyst',
 'Data Analyst',
 'Data Science',
 'Machine Learning Scientist',
 'Data Management Summer Intern',
 'Data Management/Analysis Summer Intern',
 'Data Operations Intern',
 'Data Science Intern',
 'Data Engineering Intern',
 'Faceb

In [0]:
from collections import Counter
value_counts = Counter(flattened_list)
filtered_items = [item for item, count in value_counts.items() if count >= 2]
print(value_counts)

Counter({'Data Analyst': 1464, 'Data Scientist': 829, 'Data Engineer': 703, 'Database Administrator': 384, 'Data Science Intern': 351, 'Data Analyst Intern': 233, 'Machine Learning Engineer': 193, 'Data Entry Clerk': 193, 'Senior Data Scientist': 182, 'Senior Data Engineer': 175, 'Senior Data Analyst': 150, 'Data Entry Specialist': 126, 'Data Architect': 118, 'Senior Database Administrator': 115, 'Database Developer': 106, 'Data Entry': 93, 'Oracle Database Administrator': 88, 'Machine Learning Intern': 84, 'Data Manager': 80, 'Data Specialist': 76, 'Data Scientist Intern': 75, 'Big Data Engineer': 74, 'Data Analytics Intern': 66, 'Database Analyst': 58, 'Data Science Fellow': 57, 'Database Consultant': 55, 'Database Engineer': 53, 'Data Entry Operator': 53, 'Business Data Analyst': 51, 'Big Data Developer': 50, 'Data Consultant': 50, 'Senior Data Architect': 49, 'Database Manager': 47, 'Clinical Data Manager': 44, 'Data Science Consultant': 43, 'Lead Data Scientist': 41, 'Data Enginee

In [0]:
new_pandas['experience'] = new_pandas['experience'].apply(lambda x: [item for item in x if item in filtered_items])
new_pandas = new_pandas[new_pandas['experience'].apply(lambda x: len(x)>1)]
new_pandas


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pandas['experience'] = new_pandas['experience'].apply(lambda x: [item for item in x if item in filtered_items])


Unnamed: 0,id,about,url,experience,education,courses
13,aaron-watkins-57057821,Forging Tangible Business Strategies by Unifyi...,https://www.linkedin.com/in/aaron-watkins-5705...,"[Deepwater Mud logging/ Data engineer, Documen...","[None, None, Business Administration & Law, Ge...",
24,abby-liu,,https://www.linkedin.com/in/abby-liu,"[Data Analyst Intern, Senior Data Analyst]",[Business Administration and Management; Econo...,
25,abbyoverby,"Data, Analytics and BI Engineer with a backgro...",https://www.linkedin.com/in/abbyoverby/,"[Data & Analytics Consultant, Data Engineer, S...","[None, None, None]",
30,abhinav-garg-7841ab33,,https://www.linkedin.com/in/abhinav-garg-7841ab33,"[Data Science Intern, Senior Data Scientist]","[None, Industrial Engineering (Advanced Analyt...","[Advanced Stochastic Processes, Algorithms for..."
31,abhinavsingh9,,https://www.linkedin.com/in/abhinavsingh9,"[Data Scientist Intern, Principal Data Scientist]","[None, Business Analytics, Computational Biology]",
...,...,...,...,...,...,...
31863,yuchuanhan,Equipped with multidisciplinary knowledge of s...,https://www.linkedin.com/in/yuchuanhan,"[Data Analyst, Data Analyst, Machine Learning ...","[None, None, None]",
31864,yue-wang-a06b27132,,https://www.linkedin.com/in/yue-wang-a06b27132,"[Data Analyst, Data Science Intern]","[Computer Science, Data Science and Applied St...",
31878,zal-patel-771a24281,,https://www.linkedin.com/in/zal-patel-771a24281,"[Data Analyst, Data Analyst]","[None, Information Technology]",
31880,zenon-cuellar-0a819337,Professional Database Developer moving towards...,https://www.linkedin.com/in/zenon-cuellar-0a81...,"[Database Developer, Database Developer]","[Computer Science, Minor in Mathematics]",


In [0]:
list=new_pandas['experience']
flattened_list = [item for sublist in list for item in sublist]
from collections import Counter
value_counts = Counter(flattened_list)
print(value_counts)


Counter({'Data Analyst': 1226, 'Data Scientist': 728, 'Data Engineer': 657, 'Database Administrator': 341, 'Data Science Intern': 298, 'Data Analyst Intern': 196, 'Machine Learning Engineer': 174, 'Senior Data Engineer': 166, 'Senior Data Scientist': 163, 'Data Entry Clerk': 139, 'Senior Data Analyst': 123, 'Data Architect': 108, 'Senior Database Administrator': 103, 'Data Entry Specialist': 100, 'Database Developer': 96, 'Oracle Database Administrator': 82, 'Data Entry': 74, 'Machine Learning Intern': 71, 'Big Data Engineer': 70, 'Data Scientist Intern': 69, 'Data Manager': 64, 'Data Analytics Intern': 55, 'Data Specialist': 54, 'Database Engineer': 49, 'Database Consultant': 49, 'Big Data Developer': 48, 'Database Analyst': 46, 'Data Science Fellow': 46, 'Senior Data Architect': 44, 'Data Consultant': 42, 'Business Data Analyst': 41, 'Clinical Data Manager': 39, 'Database Manager': 39, 'Data Science Consultant': 36, 'Data Engineer Intern': 34, 'Lead Data Scientist': 34, 'SQL Database

data of scraping

In [0]:
urls = new_pandas["url"]
urls.to_csv("/Workspace/Users/idanetgar@campus.technion.ac.il/urls2.csv")

In [0]:
urls_skills = pd.read_csv("/Workspace/Users/idanetgar@campus.technion.ac.il/urls_and_skills2.csv")

training data preperation:
saving only relevant rows, and adding a column of experience as sequence(instead of list as before)

In [0]:
import pandas as pd
import re

def contains_terms_with_order(experiences,x,y):
    experiences_str = ' '.join(experiences).lower()

    start_pattern = re.compile(rf'({x})')
    end_pattern = re.compile(rf'({y})')

    start_match = start_pattern.search(experiences_str)
    end_match = end_pattern.search(experiences_str)

    if start_match and end_match:
        return start_match.start() < end_match.start()
    return False

x='data engineer'
y='data scientist'
filtered_df1 = new_pandas[new_pandas['experience'].apply(contains_terms_with_order, args=(x,y))]

filtered_df1.describe()


Unnamed: 0,id,about,url,experience,education,courses
count,49,35,49,49,46,10
unique,49,35,49,33,44,10
top,jesus-y-aab891134,As a versatile professional with experience in...,https://www.linkedin.com/in/jesus-y-aab891134,"[Data Engineer, Data Scientist]",[Computer Science],"[Computer Organization and Design, Data Struct..."
freq,1,1,1,10,3,1


In [0]:
aaa=new_pandas[new_pandas['experience'].apply(lambda x: len(x)>2)]
aaa['experience']

Out[169]: 25       [Data & Analytics Consultant, Data Engineer, S...
71       [Big Data Architect, Data Architect Consultant...
81       [Data Analyst/Data Scientist, Data Analyst/Dat...
93       [Data Analyst, Data Scientist, Data Scientist,...
117      [Data Engineer, Data Science Intern, Data Scie...
                               ...                        
31749    [Data Scientist, Data Scientist, Junior Data S...
31808    [Data Analyst Intern, Data Analyst Intern, Dat...
31839     [Data Scientist, Data Scientist, Data Scientist]
31841    [Data Engineer, Data Engineer, Lead Data Engin...
31863    [Data Analyst, Data Analyst, Machine Learning ...
Name: experience, Length: 1252, dtype: object

In [0]:
import pandas as pd
import re

for index,row in final_df_with_results.iterrows():
    x=row['experience']
    for i in range(3):
        y=row[f'career{i+1}']
        print(x,y)
        filtered_df1 = new_pandas[new_pandas['experience'].apply(contains_terms_with_order, args=(x,y))]
        print(filtered_df1)



data engineer  Data Scientist
Empty DataFrame
Columns: [id, about, url, experience, education, courses]
Index: []
data engineer  Machine Learning Engineer
Empty DataFrame
Columns: [id, about, url, experience, education, courses]
Index: []
data engineer  Software Engineer
Empty DataFrame
Columns: [id, about, url, experience, education, courses]
Index: []
intern data analyst  Analyst
Empty DataFrame
Columns: [id, about, url, experience, education, courses]
Index: []
intern data analyst  Manager
Empty DataFrame
Columns: [id, about, url, experience, education, courses]
Index: []
intern data analyst  Educator
Empty DataFrame
Columns: [id, about, url, experience, education, courses]
Index: []
student data engineer  Data Scientist
Empty DataFrame
Columns: [id, about, url, experience, education, courses]
Index: []
student data engineer Consultant
Empty DataFrame
Columns: [id, about, url, experience, education, courses]
Index: []
student data engineer Machine Learning Engineer
Empty DataFrame
C

In [0]:
career_paths = [
    ["Data & Analytics Consultant", "Data Engineer", "Senior Data Engineer", "Data Architect"],
    ["Data Analyst", "Data Scientist", "Senior Data Scientist"]
]

# Transform each career path into a structured format
structured_paths = []
for path in career_paths:
    structured_path = " <START> " + " -> ".join(path) + " <END>"
    structured_paths.append(structured_path)

# Example output
for path in structured_paths:
    print(path)

 <START> Data & Analytics Consultant -> Data Engineer -> Senior Data Engineer -> Data Architect <END>
 <START> Data Analyst -> Data Scientist -> Senior Data Scientist <END>


In [0]:
new_pandas['sequenced']=new_pandas['experience'].apply(lambda x: " <START> " + " -> ".join(x) + " <END>")
new_pandas

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pandas['sequenced']=new_pandas['experience'].apply(lambda x: " <START> " + " -> ".join(x) + " <END>")


Unnamed: 0,id,about,url,experience,education,courses,sequenced
13,aaron-watkins-57057821,Forging Tangible Business Strategies by Unifyi...,https://www.linkedin.com/in/aaron-watkins-5705...,"[Deepwater Mud logging/ Data engineer, Documen...","[None, None, Business Administration & Law, Ge...",,<START> Deepwater Mud logging/ Data engineer ...
24,abby-liu,,https://www.linkedin.com/in/abby-liu,"[Data Analyst Intern, Senior Data Analyst]",[Business Administration and Management; Econo...,,<START> Data Analyst Intern -> Senior Data An...
25,abbyoverby,"Data, Analytics and BI Engineer with a backgro...",https://www.linkedin.com/in/abbyoverby/,"[Data & Analytics Consultant, Data Engineer, S...","[None, None, None]",,<START> Data & Analytics Consultant -> Data E...
30,abhinav-garg-7841ab33,,https://www.linkedin.com/in/abhinav-garg-7841ab33,"[Data Science Intern, Senior Data Scientist]","[None, Industrial Engineering (Advanced Analyt...","[Advanced Stochastic Processes, Algorithms for...",<START> Data Science Intern -> Senior Data Sc...
31,abhinavsingh9,,https://www.linkedin.com/in/abhinavsingh9,"[Data Scientist Intern, Principal Data Scientist]","[None, Business Analytics, Computational Biology]",,<START> Data Scientist Intern -> Principal Da...
...,...,...,...,...,...,...,...
31863,yuchuanhan,Equipped with multidisciplinary knowledge of s...,https://www.linkedin.com/in/yuchuanhan,"[Data Analyst, Data Analyst, Machine Learning ...","[None, None, None]",,<START> Data Analyst -> Data Analyst -> Machi...
31864,yue-wang-a06b27132,,https://www.linkedin.com/in/yue-wang-a06b27132,"[Data Analyst, Data Science Intern]","[Computer Science, Data Science and Applied St...",,<START> Data Analyst -> Data Science Intern <...
31878,zal-patel-771a24281,,https://www.linkedin.com/in/zal-patel-771a24281,"[Data Analyst, Data Analyst]","[None, Information Technology]",,<START> Data Analyst -> Data Analyst <END>
31880,zenon-cuellar-0a819337,Professional Database Developer moving towards...,https://www.linkedin.com/in/zenon-cuellar-0a81...,"[Database Developer, Database Developer]","[Computer Science, Minor in Mathematics]",,<START> Database Developer -> Database Develo...


In [0]:
!pip install transformers
!pip install torch

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-0aef2f5f-bd59-4ee7-8a05-1d4115d824ab/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-0aef2f5f-bd59-4ee7-8a05-1d4115d824ab/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
!pip install torch torchvision torchaudio
!pip install accelerate -U
!pip install transformers[torch]


You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-0aef2f5f-bd59-4ee7-8a05-1d4115d824ab/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-0aef2f5f-bd59-4ee7-8a05-1d4115d824ab/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-0aef2f5f-bd59-4ee7-8a05-1d4115d824ab/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
import transformers
import accelerate
print(transformers.__version__)
print(accelerate.__version__)

4.39.2
0.28.0


training the model and saving it

In [0]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set tokenizer pad token to eos_token
tokenizer.pad_token = tokenizer.eos_token
new_pandas=new_pandas[new_pandas['experience'].apply(lambda x: len(x)>2)]
# Assuming you have a DataFrame `new_pandas` and a 'sequenced' column with your formatted sequences
sequences = new_pandas['sequenced'].tolist()

# Tokenize sequences with padding and truncation
encodings = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt", max_length=512)

# Define a custom dataset
class SequencedDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # Assuming 'input_ids' are used as labels, shifted by one position to predict the next token
        item = {key: val[idx].detach() for key, val in self.encodings.items()}
        item["labels"] = item["input_ids"].clone()
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])



# Split your dataset into training and validation sets
train_size = int(0.8 * len(encodings.input_ids))
train_dataset = SequencedDataset({key: val[:train_size] for key, val in encodings.items()})
val_dataset = SequencedDataset({key: val[train_size:] for key, val in encodings.items()})

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=7,
    per_device_train_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.7983
1000,0.239
1500,0.2051


Out[176]: TrainOutput(global_step=1757, training_loss=0.38135900264038713, metrics={'train_runtime': 2824.7497, 'train_samples_per_second': 2.481, 'train_steps_per_second': 0.622, 'total_flos': 250314704640000.0, 'train_loss': 0.38135900264038713, 'epoch': 7.0})

In [0]:
trainer.save_model("/Workspace/Users/alonaricha@campus.technion.ac.il/trained_model1")
tokenizer.save_pretrained("/Workspace/Users/alonaricha@campus.technion.ac.il/trained_model1")

[0;31m---------------------------------------------------------------------------[0m
[0;31mFileNotFoundError[0m                         Traceback (most recent call last)
[0;32m<command-1463905093861761>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m [0mtrainer[0m[0;34m.[0m[0msave_model[0m[0;34m([0m[0;34m"/Workspace/Users/alonaricha@campus.technion.ac.il/trained_model1"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m [0mtokenizer[0m[0;34m.[0m[0msave_pretrained[0m[0;34m([0m[0;34m"/Workspace/Users/alonaricha@campus.technion.ac.il/trained_model1"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-0aef2f5f-bd59-4ee7-8a05-1d4115d824ab/lib/python3.9/site-packages/transformers/trainer.py[0m in [0;36msave_model[0;34m(self, output_dir, _internal_call)[0m
[1;32m   3148[0m [0;34m[0m[0m
[1;32m   3149[0m         [0;32melif[0m [0mself[0m[0;34m.[0m[0margs[0m[0;34m.[0m[0mshould_save

simple model evaluation

In [0]:
start_title = "data engineer"
end_title = "machine learning engineer"

# Prepare the input sequence
input_sequence = "<START> data analyst"

# Tokenize the input sequence
input_ids = tokenizer.encode(input_sequence, return_tensors="pt")

In [0]:
from transformers import pipeline

output_sequences = model.generate(
    input_ids=input_ids,
    max_length=150,  # Adjust based on your needs
    num_return_sequences=1,  # Generate 1 sequence for each input
    temperature=1.0,  # Adjust for creativity
    no_repeat_ngram_size=2,  # Prevent repeating n-grams
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    early_stopping=True
)

# Decode generated sequences
generated_sequences = [tokenizer.decode(output_seq, skip_special_tokens=True) for output_seq in output_sequences]

# Print generated sequences
for sequence in generated_sequences:
    print(sequence)


<START> data analyst -> data engineer -> senior data specialist <END>


In [0]:
from transformers import AutoModel, AutoTokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch

model = GPT2LMHeadModel.from_pretrained("/Workspace/Users/alonaricha@campus.technion.ac.il/trained_model1")

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("/Workspace/Users/alonaricha@campus.technion.ac.il/trained_model1")

In [0]:
profile_list=[]
for index,row in final_df_with_results.iterrows():    
    profile_careers=[]
    for col in ['career1','career2','career3']:
        start=row['experience']
        end=row[col]
        input_sequence=f"<START> {start} -> {end} <END>"
        input_ids = tokenizer.encode(input_sequence, return_tensors="pt")
        output_sequences = model.generate(
            input_ids=input_ids,
            max_length=150,  # Adjust based on your needs
            num_return_sequences=1,  # Generate 1 sequence for each input
            temperature=1.0,  # Adjust for creativity
            no_repeat_ngram_size=2,  # Prevent repeating n-grams
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            early_stopping=True
        )

        # Decode generated sequences
        generated_sequences = [tokenizer.decode(output_seq, skip_special_tokens=True) for output_seq in output_sequences]

        # Print generated sequences
        for sequence in generated_sequences:
            profile_careers.append(sequence)
    profile_list.append(profile_careers)
profile_list
    

Out[181]: [['<START> data engineer ->  Data Scientist <END>',
  '<START> data engineer ->  Machine Learning Engineer <END>',
  '<START> data engineer ->  Software Engineer <END>'],
 ['<START> intern data analyst ->  Analyst <END>',
  '<START> intern data analyst ->  Manager <END>',
  '<START> intern data analyst ->  Educator <END>'],
 ['<START> student data engineer ->  Data Scientist <END>',
  '<START> student data engineer -> Consultant <END>',
  '<START> student data engineer -> Machine Learning Engineer <END>'],
 ['<START> intern data analyst ->  Data Analyst <END>',
  '<START> intern data analyst ->  Business Analyst <END>',
  '<START> intern data analyst ->  Project Manager <END>'],
 ['<START> intern data scientist ->  Data Engineer <END>',
  '<START> intern data scientist ->  Machine Learning Engineer <END>',
  '<START> intern data scientist ->  Data Scientist <END>']]

goodddddd

In [0]:
from transformers import pipeline

def is_valid_transition(prev_title, next_title):
    """
    Checks if the transition from prev_title to next_title is valid
    according to the defined progression rules.
    """
    # Simplify titles for comparison
    prev_title_simple = prev_title.lower().replace("senior ", "")
    next_title_simple = next_title.lower().replace("senior ", "")
    
    # Check for consecutive "intern" positions
    if "intern" in next_title_simple:
        return False

    # Check if next_title is a "demotion" from prev_title
    if prev_title_simple != next_title_simple and (prev_title_simple in next_title.lower() or next_title_simple in prev_title.lower()):
        return False
    return True

# Initialize the generation pipeline with your model
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
# def is_valid_transition(prev_title, next_title):
#     """
#     Checks if the transition from prev_title to next_title is valid
#     according to the defined progression rules.
#     """
#     # Simplify titles for comparison
#     prev_title_simple = prev_title.lower().replace("senior ", "")
#     next_title_simple = next_title.lower().replace("senior ", "")

#     # Check if next_title is a "demotion" from prev_title
#     if prev_title_simple != next_title_simple and (prev_title_simple in next_title.lower() or next_title_simple in prev_title.lower()):
#         return False
#     return True

def generate_career_path(start_title, end_title, max_steps=4, max_attempts=5):
    current_prompt = f"<START> {start_title}"
    steps = 0
    career_path = [start_title.lower()]  # Use lowercase for comparison

    while steps < max_steps:
        attempt = 0
        next_career_valid = False

        while attempt < max_attempts and not next_career_valid:
            # Generate the next step
            outputs = generator(current_prompt, max_length=150, num_return_sequences=1, temperature=1.0, top_p=0.92)
            generated_text = outputs[0]['generated_text']

            # Attempt to extract the next career title from `generated_text`
            next_career = generated_text.split("->")[-1].split("<END>")[0].strip()  # Splitting at <END> to avoid including it

            # Check if the transition is valid
            if next_career.lower() not in career_path and is_valid_transition(career_path[-1], next_career):
                next_career_valid = True
            else:
                attempt += 1  # Increment attempt if the generated title is a repeat or invalid transition

        if not next_career_valid:
            print("Unable to generate a new, unique, and valid title after several attempts.")
            break  # Exit if a new, unique title cannot be generated

        # Add the new, unique, and valid title to the path
        career_path.append(next_career.lower())  # Store in lowercase for comparison
        current_prompt = f"{generated_text} ->"
        steps += 1

    # Capitalize the career path for presentation
    career_path = [title.capitalize() for title in career_path]
    return career_path

# Example usage
start_title = "student data engineer"
end_title = "data scientist"
career_path = generate_career_path(start_title, end_title)
print(" -> ".join(career_path))
print(career_path)


Unable to generate a new, unique, and valid title after several attempts.
Student data engineer -> Student data specialist
['Student data engineer', 'Student data specialist']


gooddddd

In [0]:
profile_list=[]
for index,row in final_df_with_results.iterrows():    
    profile_careers=[]
    for col in ['career1','career2','career3']:
        start=row['experience']
        end=row[col]
        career_path = generate_career_path(start, end)
        # input_sequence=f"<START> {start}"
        # input_ids = tokenizer.encode(input_sequence, return_tensors="pt")
        # output_sequences = model.generate(
        #     input_ids=input_ids,
        #     max_length=150,  # Adjust based on your needs
        #     num_return_sequences=1,  # Generate 1 sequence for each input
        #     temperature=1.0,  # Adjust for creativity
        #     no_repeat_ngram_size=2,  # Prevent repeating n-grams
        #     eos_token_id=tokenizer.eos_token_id,
        #     pad_token_id=tokenizer.eos_token_id,
        #     early_stopping=True
        # )

        # # Decode generated sequences
        # generated_sequences = [tokenizer.decode(output_seq, skip_special_tokens=True) for output_seq in output_sequences]

        # # Print generated sequences
        # for sequence in generated_sequences:
        profile_careers.append(career_path)
    profile_list.append(profile_careers)

for i,l in enumerate(profile_list):
    print()
    print("PROFILE:")
    for j,c in enumerate(l):
        print(c, final_df_with_results["career"+str(j+1)][i])

Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and valid title after several attempts.
Unable to generate a new, unique, and 

In [0]:
for l in profile_list:
    for i,c in enumerate(l):
        if len(c)>3:
            l[i] = c[:3]

In [0]:
for i,l in enumerate(profile_list):
    print()
    print("PROFILE:")
    for j,c in enumerate(l):
        print(c, final_df_with_results["career"+str(j+1)][i])


PROFILE:
['Data engineer']  Data Scientist
['Data engineer', 'Senior data engineer']  Machine Learning Engineer
['Data engineer', 'Senior data engineer']  Software Engineer

PROFILE:
['Intern data analyst', 'Data engineer']  Data Analytics Manager
['Intern data analyst', 'Data engineer', 'Senior data engineer']  Data Scientist
['Intern data analyst']  Business Analyst

PROFILE:
['Student data engineer']  Data Scientist
['Student data engineer']  Data Analyst
['Student data engineer', 'Student data analyst']  Machine Learning Engineer

PROFILE:
['Intern data analyst', 'Data specialist', 'Senior data analyst']  Data Engineer
['Intern data analyst'] Data Scientist
['Intern data analyst', 'Data manager', 'Data analyst'] Software Engineer

PROFILE:
['Intern data scientist', 'Data specialist', 'Senior data engineer']  Data Engineer
['Intern data scientist', 'Volunteer data analyst', 'Staff data scientist']  Machine Learning Engineer
['Intern data scientist', 'Data analyst']  Data Analyst


In [0]:
for l in profile_list:
    for c in l:
        if len(c) < 3:
            c.append("empty")
        if len(c) < 3:
            c.append("empty")

In [0]:
for i,l in enumerate(profile_list):
    print()
    print("PROFILE:")
    for j,c in enumerate(l):
        print(c, final_df_with_results["career"+str(j+1)][i])


PROFILE:
['Data engineer', 'empty', 'empty']  Data Scientist
['Data engineer', 'Senior data engineer', 'empty']  Machine Learning Engineer
['Data engineer', 'Senior data engineer', 'empty']  Software Engineer

PROFILE:
['Intern data analyst', 'Data engineer', 'empty']  Data Analytics Manager
['Intern data analyst', 'Data engineer', 'Senior data engineer']  Data Scientist
['Intern data analyst', 'empty', 'empty']  Business Analyst

PROFILE:
['Student data engineer', 'empty', 'empty']  Data Scientist
['Student data engineer', 'empty', 'empty']  Data Analyst
['Student data engineer', 'Student data analyst', 'empty']  Machine Learning Engineer

PROFILE:
['Intern data analyst', 'Data specialist', 'Senior data analyst']  Data Engineer
['Intern data analyst', 'empty', 'empty'] Data Scientist
['Intern data analyst', 'Data manager', 'Data analyst'] Software Engineer

PROFILE:
['Intern data scientist', 'Data specialist', 'Senior data engineer']  Data Engineer
['Intern data scientist', 'Voluntee

In [0]:
middle1 = [[],[],[]]
middle2 = [[],[],[]]
for l in profile_list:
    for i,c in enumerate(l):
        for j,p in enumerate(c):
            if j == 1:
                middle1[i].append(p)
            if j == 2:
                middle2[i].append(p)

final_df_with_results["career1_step1"] = middle1[0]
final_df_with_results["career1_step2"] = middle2[0]
final_df_with_results["career2_step1"] = middle1[1]
final_df_with_results["career2_step2"] = middle2[1]
final_df_with_results["career3_step1"] = middle1[2]
final_df_with_results["career3_step2"] = middle2[2]

final_df_with_results.display()

id,about,education,experience,courses,q1,q2,q3,q4,career1,career2,career3,career1_step1,career1_step2,career2_step1,career2_step2,career3_step1,career3_step2
212779797,Data science and engineering Student at Technion - Israel Institute of Technology,Data science,data engineer,,yes,yes,no,no,Data Scientist,Machine Learning Engineer,Software Engineer,empty,empty,Senior data engineer,empty,Senior data engineer,empty
212708754,data science & engineering student,Data science,intern data analyst,,no,no,yes,yes,Data Analytics Manager,Data Scientist,Business Analyst,Data engineer,empty,Data engineer,Senior data engineer,empty,empty
212777676,data science & engineering student,Data science,student data engineer,,yes,no,no,yes,Data Scientist,Data Analyst,Machine Learning Engineer,empty,empty,empty,empty,Student data analyst,empty
213376551,industrial engineering student,industrial engineering,intern data analyst,,no,no,yes,yes,Data Engineer,Data Scientist,Software Engineer,Data specialist,Senior data analyst,empty,empty,Data manager,Data analyst
213101819,Data science and engineering master,data science,intern data scientist,,yes,yes,yes,no,Data Engineer,Machine Learning Engineer,Data Analyst,Data specialist,Senior data engineer,Volunteer data analyst,Staff data scientist,Data analyst,empty


2nd try

In [0]:
urls_skills = pd.read_csv("/Workspace/Users/idanetgar@campus.technion.ac.il/urls_and_skills2.csv")
urls_skills.dropna(subset=['skills'], inplace=True)
urls_skills['skills'] = urls_skills['skills'].str.replace('"', '')
urls_skills['skills'] = urls_skills['skills'].str.replace('[', '')
urls_skills['skills'] = urls_skills['skills'].str.replace(']', '')
urls_skills["skills"] = urls_skills['skills'].str.split(',', expand=False)

  urls_skills['skills'] = urls_skills['skills'].str.replace('[', '')
  urls_skills['skills'] = urls_skills['skills'].str.replace(']', '')


In [0]:
people_skills = pd.merge(new_pandas, urls_skills, on='id', how='inner')
people_skills["sequenced"] = people_skills["sequenced"].str.lower()
people_skills

Unnamed: 0,id,about,url_x,experience,education,courses,sequenced,url_y,skills,input_url,warning,warning_code,error
0,abbyoverby,"Data, Analytics and BI Engineer with a backgro...",https://www.linkedin.com/in/abbyoverby/,"[Data & Analytics Consultant, Data Engineer, S...","[None, None, None]",,<start> data & analytics consultant -> data e...,https://www.linkedin.com/in/abbyoverby,"[Data Engineering Foundations, Transition from...",https://www.linkedin.com/in/abbyoverby/,,,
1,ajay-kharade-21156147,,https://www.linkedin.com/in/ajay-kharade-21156147,"[Big Data Architect, Data Architect Consultant...",[Computer Science],,<start> big data architect -> data architect ...,https://www.linkedin.com/in/ajay-kharade,[],https://www.linkedin.com/in/ajay-kharade-21156147,,,
2,akshay-bhat-31a538218,• Data Analyst with 4+ years of experience int...,https://www.linkedin.com/in/akshay-bhat-31a538218,"[Data Analyst/Data Scientist, Data Analyst/Dat...",[Business Analytics],,<start> data analyst/data scientist -> data a...,https://www.linkedin.com/in/akshay-bhat-31a538218,"[Python for Data Science Tips, Tricks, & Tec...",https://www.linkedin.com/in/akshay-bhat-31a538218,,,
3,aleksandra-deis-0912,I am a business analyst turned data scientist ...,https://www.linkedin.com/in/aleksandra-deis-0912,"[Data Analyst, Data Scientist, Data Scientist,...",[Computational Mathematics and Cybernetics],,<start> data analyst -> data scientist -> dat...,https://www.linkedin.com/in/aleksandra-deis-09...,"[Supervised Learning Essential Training, Machi...",https://www.linkedin.com/in/aleksandra-deis-0912,,,
4,alexnakagawa,,https://www.linkedin.com/in/alexnakagawa,"[Data Engineer, Data Science Intern, Data Scie...","[Data Science (Emphasis in Cognition), Industr...",,<start> data engineer -> data science intern ...,https://www.linkedin.com/in/alexnakagawa,"[JSON Processing with Java EE, Building React ...",https://www.linkedin.com/in/alexnakagawa,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1178,udaivarshney,"""Build Influence Without Authority"". ""Pursuit ...",https://www.linkedin.com/in/udaivarshney,"[Production Oracle Database Administrator, App...","[None, None, Physics]",,<start> production oracle database administra...,https://www.linkedin.com/in/udaivarshney,[Oracle DB Cloud Database Migration and Integr...,https://www.linkedin.com/in/udaivarshney,,,
1179,wei-jin-708612178,"As an enthusiastic lover of data, I'm passiona...",https://www.linkedin.com/in/wei-jin-708612178,"[Data Analyst Intern, Data Analyst Intern, Dat...",[Statistics],,<start> data analyst intern -> data analyst i...,https://www.linkedin.com/in/wei-jin-708612178,[Machine Learning with Python: k-Means Cluster...,https://www.linkedin.com/in/wei-jin-708612178,,,
1180,yaoguangzhai109244,"As a Ph.D. Candidate on Computer Science, my p...",https://www.linkedin.com/in/yaoguangzhai109244,"[Data Scientist, Data Scientist, Data Scientist]","[Computational Science, Mathematics and Engine...",,<start> data scientist -> data scientist -> d...,https://www.linkedin.com/in/yaoguangzhai109244,"[Faster pandas, Python Parallel and Concurrent...",https://www.linkedin.com/in/yaoguangzhai109244,,,
1181,yash-vajifdar,I have been leading ambiguous projects designi...,https://www.linkedin.com/in/yash-vajifdar,"[Data Engineer, Data Engineer, Lead Data Engin...","[Business Entrepreneurial & Management, Genera...","[Intro to Python, Intro to SQL]",<start> data engineer -> data engineer -> lea...,https://www.linkedin.com/in/yash-vajifdar,"[Hands-On Data Science using SQL, Tableau, P...",https://www.linkedin.com/in/yash-vajifdar,,,


In [0]:
def find_skills(people_skills, career):
    career = career.lower().strip()
    df = people_skills[people_skills['sequenced'].str.contains(career)]
    df = pd.DataFrame(df["skills"])
    df = df.explode("skills")
    count_df = df["skills"].value_counts()
    count_df = pd.DataFrame(count_df)
    count_df['course'] = count_df.index
    count_df = count_df.sort_values(by='skills', ascending=False)
    courses = count_df["course"][:3].tolist()
    return courses

init_cols = final_df_with_results.columns
for col in init_cols:
    if "career" in col and "courses" not in col:
        to_add = []
        for index,row in final_df_with_results.iterrows():
            to_add.append(find_skills(people_skills, row[col]))
        final_df_with_results[col+"_courses"] = to_add

In [0]:
for col in final_df_with_results.columns:
    if final_df_with_results[col].dtype == 'O' and 'courses' not in col:
        final_df_with_results[col] = final_df_with_results[col].str.strip()
final_df_with_results

Unnamed: 0,id,about,education,experience,courses,q1,q2,q3,q4,career1,career2,career3,career1_step1,career1_step2,career2_step1,career2_step2,career3_step1,career3_step2,career1_courses,career2_courses,career3_courses,career1_step1_courses,career1_step2_courses,career2_step1_courses,career2_step2_courses,career3_step1_courses,career3_step2_courses
0,212779797,Data science and engineering Student at Techni...,Data science,data engineer,,yes,yes,no,no,Data Scientist,Machine Learning Engineer,Software Engineer,empty,empty,Senior data engineer,empty,Senior data engineer,empty,"[, R, Tableau]","[TensorFlow 2.0: Working with Images, Building...","[, R, Applied AI: Getting Started with Huggin...",[],[],"[Data Engineering Foundations, , and Spark]",[],"[Data Engineering Foundations, , and Spark]",[]
1,212708754,data science & engineering student,Data science,intern data analyst,,no,no,yes,yes,Data Analytics Manager,Data Scientist,Business Analyst,Data engineer,empty,Data engineer,Senior data engineer,empty,empty,[Machine Learning with Logistic Regression in ...,"[, R, Tableau]",[],"[Data Engineering Foundations, Learning Hadoop...",[],"[Data Engineering Foundations, Learning Hadoop...","[Data Engineering Foundations, , and Spark]",[],[]
2,212777676,data science & engineering student,Data science,student data engineer,,yes,no,no,yes,Data Scientist,Data Analyst,Machine Learning Engineer,empty,empty,empty,empty,Student data analyst,empty,"[, R, Tableau]","[ and Spark, Python, Tableau]","[TensorFlow 2.0: Working with Images, Building...",[],[],[],[],[],[]
3,213376551,industrial engineering student,industrial engineering,intern data analyst,,no,no,yes,yes,Data Engineer,Data Scientist,Software Engineer,Data specialist,Senior data analyst,empty,empty,Data manager,Data analyst,"[Data Engineering Foundations, Learning Hadoop...","[, R, Tableau]","[, R, Applied AI: Getting Started with Huggin...","[Finding New Career Paths with SQL, and Spark...","[ Tableau, and Spark, R]",[],[],"[Learning Google Cloud Run, Serving Customers ...","[ and Spark, Python, Tableau]"
4,213101819,Data science and engineering master,data science,intern data scientist,,yes,yes,yes,no,Data Engineer,Machine Learning Engineer,Data Analyst,Data specialist,Senior data engineer,Volunteer data analyst,Staff data scientist,Data analyst,empty,"[Data Engineering Foundations, Learning Hadoop...","[TensorFlow 2.0: Working with Images, Building...","[ and Spark, Python, Tableau]","[Finding New Career Paths with SQL, and Spark...","[Data Engineering Foundations, , and Spark]",[],"[Introduction to Artificial Intelligence, Buil...","[ and Spark, Python, Tableau]",[]


In [0]:
# EXAMPLE OF USAGE
example = final_df_with_results.iloc[1]
output = f"From a posiotion of '{example['experience']}', with an education in '{example['education']}' and your personality questionnaire here are 3 possible career paths you might like to explore:\n\n\n"

for i in range(1,4):
    output += f"Pilot Path to becoming a '{example['career'+str(i)]}':\n    "
    output += f"{example['experience']}  ->  "
    if example['career'+str(i)+'_step1'] != 'empty':
        output += f"{example['career'+str(i)+'_step1']}  ->  "
    if example['career'+str(i)+'_step2'] != 'empty':
        output += f"{example['career'+str(i)+'_step2']}  ->  "
    output += f"{example['career'+str(i)]}\n\n    "
    output += f"Here are some recommended LinkedIn courses and Skills that will help you advance in the path:\n"
    if example['career'+str(i)+'_step1'] != 'empty':
        output += f"    - {example['career'+str(i)+'_step1']}: "
        courses = ''
        for c in example['career'+str(i)+'_step1_courses']:
            courses += c+", "
        courses = courses[:-2]
        if courses == '':
            output += f"Courses & Skills were not found. We recommend to look into '{example['career'+str(i)]}' LinkedIn profiles."
        output += courses+"\n"
    if example['career'+str(i)+'_step2'] != 'empty':
        output += f"    - {example['career'+str(i)+'_step2']}: "
        courses = ''
        for c in example['career'+str(i)+'_step2_courses']:
            if c == '':
                continue
            courses += c+", "
        courses = courses[:-2]
        if courses == '':
            output += f"Courses & Skills were not found. We recommend to look into '{example['career'+str(i)]}' LinkedIn profiles."
        output += courses+"\n"
    if example['career'+str(i)] != 'empty':
        output += f"    - {example['career'+str(i)]}: "
        courses = ''
        for c in example['career'+str(i)+'_courses']:
            if c == '':
                continue
            courses += c+", "
        courses = courses[:-2]
        if courses == '':
            output += f"Courses & Skills were not found. We recommend to look into '{example['career'+str(i)]}' LinkedIn profiles."
        output += courses+"\n"
    output += "\n\n"


print(output)

From a posiotion of 'intern data analyst', with an education in 'Data science' and your personality questionnaire here are 3 possible career paths you might like to explore:


Pilot Path to becoming a 'Data Analytics Manager':
    intern data analyst  ->  Data engineer  ->  Data Analytics Manager

    Here are some recommended LinkedIn courses and Skills that will help you advance in the path:
    - Data engineer: Data Engineering Foundations, Learning Hadoop,  Tableau
    - Data Analytics Manager: Machine Learning with Logistic Regression in Excel,  R,  and Power BI


Pilot Path to becoming a 'Data Scientist':
    intern data analyst  ->  Data engineer  ->  Senior data engineer  ->  Data Scientist

    Here are some recommended LinkedIn courses and Skills that will help you advance in the path:
    - Data engineer: Data Engineering Foundations, Learning Hadoop,  Tableau
    - Senior data engineer: Data Engineering Foundations,  and Spark
    - Data Scientist:  R,  Tableau


Pilot Path

In [0]:
import time

time.sleep(1200)