<a href="https://colab.research.google.com/github/PremMall/Machine-Learning/blob/main/ML_Data_Job.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Interactive Google sign-in for this Colab runtime.
# NOTE(review): presumably required so the BigQuery client created later can
# access the project — confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
from google.cloud import bigquery
# BigQuery client bound to the "jobprojectlewagon" project; a later cell uses
# it to read the source table.
client = bigquery.Client(project="jobprojectlewagon")


In [None]:
# Table ID given as "dataset.table" with no project prefix.
# NOTE(review): presumably resolved against the client's project — confirm.
table = "data_job_market_analysis.dtajob_clean_V2"
# Read the rows of the table into a pandas DataFrame. Later cells overwrite
# columns of `df` in place.
df = client.list_rows(table).to_dataframe()

In [None]:
import pandas as pd
import numpy as np
import ast
from collections import defaultdict, Counter
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
#from sklearn.linear_model import SGDClassifier
#from sklearn.multiclass import OneVsRestClassifier
#from sklearn.metrics import hamming_loss, f1_score
#from sklearn.feature_extraction.text import CountVectorizer

In [None]:
import re
from collections import Counter
from itertools import combinations

import pandas as pd

# 1. Convert comma-separated strings into lists.
#    * The separator pattern is a raw string: in a normal literal '\s' is an
#      invalid escape sequence and makes Python emit a warning.
#    * Only string cells are split, so re-running this cell leaves values that
#      are already lists untouched instead of turning them into NaN.
_SKILL_SEPARATOR = re.compile(r',\s*')
df['skills_list'] = df['skills_list'].apply(
    lambda value: _SKILL_SEPARATOR.split(value) if isinstance(value, str) else value
)

# 2. Explode lists so each skill is in a separate row (the original row index
#    is repeated, which step 8 uses to regroup skills per posting).
df_exploded = df.explode('skills_list')

# 3. Standardise skill names: lowercase and strip whitespace
df_exploded['skills_list'] = df_exploded['skills_list'].str.lower().str.strip()

# 4. Optional: Map common synonyms or variations to standard skill names
skill_mapping = {
    'ms excel': 'excel',
    'excel': 'excel',
    'py': 'python',
    'python': 'python',
    'postgresql': 'sql',
    'sql': 'sql',
    'c#': 'csharp',
    'power bi': 'powerbi',
    'aws': 'amazon web services',
    'gcp': 'google cloud',
    'azure': 'microsoft azure'
    # Add more mappings as needed
}
df_exploded['skills_list'] = df_exploded['skills_list'].replace(skill_mapping)

# 5. Remove missing, empty or unknown skills. Empty strings appear when the
#    raw text contains consecutive or trailing commas.
df_exploded = df_exploded.dropna(subset=['skills_list'])
df_exploded = df_exploded[~df_exploded['skills_list'].isin(['unknown', ''])]

# 6. Optional: Remove rare skills. Only skills that appear MORE than
#    `threshold` times are kept (a skill seen exactly `threshold` times is dropped).
threshold = 10
skill_counts = df_exploded['skills_list'].value_counts()
common_skills = skill_counts[skill_counts > threshold].index
df_exploded = df_exploded[df_exploded['skills_list'].isin(common_skills)]

# 7. Create a frequency table for analysis or visualisation
skill_freq = df_exploded['skills_list'].value_counts().reset_index()
skill_freq.columns = ['skill', 'count']

# 8. Optional: Compute co-occurrences for skills analysis.
#    Pairs are built from the cleaned skills, regrouped per posting via the
#    repeated row index. Each posting contributes a pair at most once, and the
#    pair is sorted so that (a, b) and (b, a) are counted together.
all_combinations = (
    df_exploded.groupby(level=0)['skills_list']
    .apply(lambda skills: list(combinations(sorted(set(skills)), 2)))
)
flat_combinations = [combo for sublist in all_combinations for combo in sublist]
co_occurrence = Counter(flat_combinations)
co_occurrence_df = pd.DataFrame(
    list(co_occurrence.items()), columns=['skill_pair', 'count']
).sort_values(by='count', ascending=False)

# Now df_exploded is clean, skill_freq has counts, co_occurrence_df has pairs


  df['skills_list'] = df['skills_list'].str.split(',\s*', regex=True)


In [None]:
def split_skills(x):
    """Normalise a raw skills value into a list of clean, lower-case names.

    Parameters
    ----------
    x : object
        A comma-separated string, a list/tuple/set of items, or anything else.

    Returns
    -------
    list of str
        Stripped, lower-cased items with empty entries removed. Values that
        are neither a string nor a list/tuple/set produce an empty list.
    """
    if isinstance(x, str):
        items = x.split(',')
    elif isinstance(x, (list, tuple, set)):
        # A None entry carries no name; skip it instead of emitting 'none'.
        items = [str(item) for item in x if item is not None]
    else:
        return []

    cleaned = (item.strip().lower() for item in items)
    # Blank entries are dropped for sequences as well as for strings, so both
    # input forms yield the same result.
    return [item for item in cleaned if item]
# Normalise the column in place. split_skills accepts both strings and
# lists, so the column may already hold lists at this point.
df['skills_list'] = df['skills_list'].apply(split_skills)

In [None]:
def split_type(x):
    """Normalise a raw skill-types value into a list of clean, lower-case names.

    Parameters
    ----------
    x : object
        A comma-separated string, a list/tuple/set of items, or anything else.

    Returns
    -------
    list of str
        Stripped, lower-cased items with empty entries removed. Values that
        are neither a string nor a list/tuple/set produce an empty list.
    """
    if isinstance(x, str):
        raw_items = x.split(',')
    elif isinstance(x, (list, tuple, set)):
        # A None entry carries no name; skip it instead of emitting 'none'.
        raw_items = [str(item) for item in x if item is not None]
    else:
        return []

    result = []
    for item in raw_items:
        name = item.strip().lower()
        # Blank entries are dropped for every input form, not only strings.
        if name:
            result.append(name)
    return result
# Normalise the skill-type column in place into lists of lower-case names.
df['skills_types'] = df['skills_types'].apply(split_type)

In [None]:
def _parse_if_text(value):
    """Evaluate a string as a Python literal; return any non-string value untouched."""
    if not isinstance(value, str):
        return value
    return ast.literal_eval(value)


# Cells holding the textual form of a Python literal are replaced by the
# object that literal describes; every other cell is left as it is.
df['job_scheduled'] = df['job_scheduled'].apply(_parse_if_text)

In [None]:
# Quick check: show the first few normalised skill lists.
print(df['skills_list'].head())

0    [python, power bi, tableau, sql, r]
1                 [power bi, sql, excel]
2       [python, bigquery, tableau, sql]
3              [linux, azure, snowflake]
4                        [spark, python]
Name: skills_list, dtype: object


In [None]:
# Quick check: show the first few normalised skill-type lists.
print(df['skills_types'].head())

0           [programming, analyst_tools]
1           [analyst_tools, programming]
2    [programming, analyst_tools, cloud]
3                            [cloud, os]
4               [programming, libraries]
Name: skills_types, dtype: object


In [None]:
# Re-applies split_skills to both columns. On lists of already stripped,
# lower-cased names this returns the same values, so the cell is redundant
# when the earlier normalisation cells have run.
df['skills_list'] = df['skills_list'].apply(split_skills)
df['skills_types'] = df['skills_types'].apply(split_skills)

In [None]:
def _label_values(value):
    """Return the raw label candidates held by one cell as a list."""
    if value is None:
        return []
    if isinstance(value, str):
        # A string is one label; iterating it would yield single characters.
        return [value]
    try:
        return list(value)
    except TypeError:
        # Non-iterable scalar: a missing marker (NaN/NA) contributes nothing.
        return [] if pd.isna(value) else [value]


def make_labels(row):
    """Collect the normalised labels of one job posting.

    Parameters
    ----------
    row : mapping-like
        Must support ``row[name]`` for 'job_category', 'job_country',
        'job_scheduled' and 'skills_list' (e.g. a DataFrame row).

    Returns
    -------
    list of str
        Stripped, lower-cased labels in the order category, country,
        schedule entries, skills. Missing values (None/NaN) and labels that
        are empty after stripping are skipped.
    """
    labels = []
    for field in ('job_category', 'job_country', 'job_scheduled', 'skills_list'):
        for value in _label_values(row[field]):
            if value is None:
                continue
            label = str(value).strip().lower()
            if label:
                labels.append(label)
    return labels

In [None]:
# Build the label list for every posting (axis=1 passes each row to make_labels).
df['labels'] = df.apply(make_labels, axis=1)
# Quick check: show the first few label lists.
print(df['labels'].head())

0    [data_analyst, spain, full-time, python, power...
1    [data_analyst, spain, full-time, power bi, sql...
2    [analytics_engineer, sweden, full-time, python...
3    [data_engineer, switzerland, full-time, linux,...
4    [data_engineer, switzerland, full-time, spark,...
Name: labels, dtype: object


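The following cell builds lookup tables using `defaultdict(Counter)`. Each time a skill is encountered in a job posting, the counts for that posting's category, country, schedule types, co-occurring skills and skill types are incremented automatically.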

In [None]:
# Lookup tables: skill -> Counter of the attribute values seen with that skill.
# They are re-created on every run of this cell, so re-running does not
# double the counts.
skill_to_category = defaultdict(Counter)
skill_to_country = defaultdict(Counter)
skill_to_schedule = defaultdict(Counter)
skill_to_skills = defaultdict(Counter)
skill_to_types = defaultdict(Counter)


def _clean_text(value):
    """Return a stripped, lower-cased string, or None for missing/empty values."""
    if value is None or value is pd.NA or (isinstance(value, float) and value != value):
        return None
    text = str(value).strip().lower()
    return text or None


def _clean_list(values):
    """Return the distinct cleaned entries of a cell, in first-seen order."""
    if values is None:
        return []
    if isinstance(values, str):
        # A bare string is one entry; iterating it would yield characters.
        values = [values]
    try:
        items = list(values)
    except TypeError:
        items = [values]  # non-iterable scalar such as NaN
    cleaned = (_clean_text(item) for item in items)
    return list(dict.fromkeys(item for item in cleaned if item))


for row in df.itertuples():
    skills = _clean_list(row.skills_list)
    # The column is optional: a frame without it simply yields no types.
    types = _clean_list(getattr(row, 'skills_types', None))
    schedules = _clean_list(row.job_scheduled)
    category = _clean_text(row.job_category)
    country = _clean_text(row.job_country)

    for skill in skills:
        # Missing category/country values are skipped instead of raising.
        if category:
            skill_to_category[skill][category] += 1
        if country:
            skill_to_country[skill][country] += 1

        if schedules:
            skill_to_schedule[skill].update(schedules)

        # Every other distinct skill of the same posting counts once.
        skill_to_skills[skill].update(other for other in skills if other != skill)

        # The type list of a posting is not aligned with its skill list (the
        # two lists can differ in length), so a skill cannot be paired with
        # "its" type by position. Each skill is instead credited with every
        # type listed on the posting.
        if types:
            skill_to_types[skill].update(types)


In [None]:
def recommend_job_attributes(input_skills_text, top_n_additional_skills=5, lookups=None):
    """Suggest likely job attributes and additional skills for a set of skills.

    Parameters
    ----------
    input_skills_text : str
        Comma-separated skill names. Names are stripped and lower-cased;
        blanks and repeated names are ignored.
    top_n_additional_skills : int, default 5
        Maximum number of additional skills to return.
    lookups : mapping, optional
        Tables to query, keyed 'category', 'country', 'schedule', 'skills'
        and 'types'; each maps a skill name to a Counter-like mapping. When
        omitted, the notebook-level tables skill_to_category,
        skill_to_country, skill_to_schedule, skill_to_skills and
        skill_to_types are used.

    Returns
    -------
    dict
        'job_category', 'job_country' and 'job_schedule_type' hold the value
        with the highest combined count (None when nothing is known), and
        'recommended_skills' holds up to ``top_n_additional_skills`` names.
    """
    if lookups is None:
        # Resolved at call time, so rebuilding the tables is picked up
        # without redefining this function.
        lookups = {
            'category': skill_to_category,
            'country': skill_to_country,
            'schedule': skill_to_schedule,
            'skills': skill_to_skills,
            'types': skill_to_types,
        }

    # De-duplicate while keeping order so a repeated skill is not counted twice.
    input_skills = list(dict.fromkeys(
        part.strip().lower() for part in input_skills_text.split(',') if part.strip()
    ))

    totals = {name: Counter() for name in ('category', 'country', 'schedule', 'skills', 'types')}
    for skill in input_skills:
        for name, total in totals.items():
            # .get() does not insert empty entries into defaultdict tables.
            total.update(lookups[name].get(skill, {}))

    # Never recommend a skill the user already listed.
    candidates = totals['skills']
    for skill in input_skills:
        candidates.pop(skill, None)

    # Candidates ordered by how often they co-occur with the input skills.
    ranked = [skill for skill, _ in candidates.most_common()]

    # Candidates sharing one of the three most common input skill types come
    # first; the remaining candidates fill any free slots. A single pass over
    # the finite list always terminates, even with fewer candidates than
    # requested.
    top_types = {skill_type for skill_type, _ in totals['types'].most_common(3)}
    type_table = lookups['types']
    preferred = [s for s in ranked if top_types.intersection(type_table.get(s, {}))]
    preferred_set = set(preferred)
    remaining = [s for s in ranked if s not in preferred_set]
    limit = max(top_n_additional_skills, 0)
    recommended = (preferred + remaining)[:limit]

    def _best(counter):
        """Return the most common key of a counter, or None when it is empty."""
        return counter.most_common(1)[0][0] if counter else None

    return {
        "job_category": _best(totals['category']),
        "job_country": _best(totals['country']),
        "job_schedule_type": _best(totals['schedule']),
        "recommended_skills": recommended
    }


In [None]:
!pip install ipywidgets --quiet

import ipywidgets as widgets
from IPython.display import display, clear_output

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.6/1.6 MB[0m [31m22.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.6/1.6 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Button the user clicks to start a recommendation run.
predict_button = widgets.Button(
    tooltip='Click to get job recommendations',
    button_style='success',
    description="Recommend Jobs",
)

# Output area that the click handler writes into.
output_box = widgets.Output()

# Free-text field for the comma-separated skills.
_skills_field_layout = widgets.Layout(width='50%')
_skills_field_style = {'description_width': 'initial'}
input_skills = widgets.Text(
    description='Skills:',
    placeholder='Enter skills separated by commas',
    value='',
    layout=_skills_field_layout,
    style=_skills_field_style,
)

In [None]:
def on_predict_button_clicked(b):
    """Plain-text click handler: print recommendations for the entered skills.

    Clears ``output_box``, reads the text of ``input_skills`` and prints the
    result of ``recommend_job_attributes``. Blank input prints a prompt
    instead.

    Note: a later cell defines another function with this same name (HTML
    output); whichever definition ran last is the one bound to the name.

    Parameters
    ----------
    b : object
        The clicked button, supplied by the widget callback; not used.
    """
    with output_box:
        clear_output()
        user_input = input_skills.value
        if not user_input.strip():
            print("Please enter at least one skill.")
            return

        result = recommend_job_attributes(user_input)

        # Small blue diamond bullet, written as an escape so the character
        # cannot be garbled by a wrong file encoding.
        bullet = "\U0001F539"
        print(f"{bullet} Predicted Job Category:", result['job_category'])
        print(f"{bullet} Most Likely Country:", result['job_country'])
        print(f"{bullet} Job Schedule Type:", result['job_schedule_type'])
        print(f"{bullet} Recommended Additional Skills:", ', '.join(result['recommended_skills']))

In [None]:
from IPython.display import display, HTML, clear_output

def on_predict_button_clicked(b):
    """HTML click handler: render recommendations for the entered skills.

    Clears ``output_box``, reads the text of ``input_skills`` and displays
    one HTML line per attribute returned by ``recommend_job_attributes``.
    Blank input prints a prompt instead.

    Parameters
    ----------
    b : object
        The clicked button, supplied by the widget callback; not used.
    """
    from html import escape  # stdlib; local import keeps this cell self-contained

    with output_box:
        clear_output()
        user_input = input_skills.value
        if not user_input.strip():
            print("Please enter at least one skill.")
            return
        result = recommend_job_attributes(user_input)

        rows = [
            ("Predicted Job Category", result['job_category']),
            ("Most Likely Country", result['job_country']),
            ("Job Schedule Type", result['job_schedule_type']),
            ("Recommended Additional Skills", ', '.join(result['recommended_skills'])),
        ]
        for title, value in rows:
            # Values originate from the dataset, so they are escaped before
            # being placed in markup. The bullet is a numeric character
            # reference (small blue diamond), which no file encoding can garble.
            display(HTML(f"<b>&#x1F539; {title}:</b> {escape(str(value))}"))


In [None]:
# Attach the click handler. The name resolves to the most recently executed
# definition of on_predict_button_clicked.
# NOTE(review): re-running this cell calls on_click again — presumably this
# registers the handler a second time; confirm.
predict_button.on_click(on_predict_button_clicked)
# Render the text field, the button and the output area below the cell.
display(input_skills, predict_button, output_box)

Text(value='', description='Skills:', layout=Layout(width='50%'), placeholder='Enter skills separated by comma‚Ä¶

Button(button_style='success', description='Recommend Jobs', style=ButtonStyle(), tooltip='Click to get job re‚Ä¶

Output()