##### Load CSV files from storage account

In [None]:
# Location of the raw CSV export.
csv_path = "your path here"  # Replace "your path here" with the actual path to your CSV file

# Reader configured for a header row and double-quoted fields.
# NOTE(review): quoted fields that span several lines would also need the
# "multiLine" option - confirm against the actual file.
csv_reader = spark.read.option("header", "true").option("quote", '"')
df = csv_reader.csv(csv_path)

# Collect the Spark DataFrame into pandas for the cleaning steps below.
df_pd = df.toPandas()

display(df_pd)


StatementMeta(, 09cfba99-5657-42ce-86ff-1b90a87f9824, 3, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 6754901b-865b-462d-af05-787d9e575d57)

##### Clean column names

In [2]:
# Normalise the column headers: lower-case, spaces -> underscores, "ä" -> "a".
normalized_names = []
for raw_name in df_pd.columns:
    snake_name = raw_name.lower().replace(" ", "_")
    normalized_names.append(snake_name.replace("ä", "a"))
df_pd.columns = normalized_names

display(df_pd)

StatementMeta(, 09cfba99-5657-42ce-86ff-1b90a87f9824, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, b7518312-1328-42a8-9532-2f4645bc5570)

##### Clean emojis from description, drop rows with irrelevant job titles, drop duplicates and extract only job titles from the title column

In [3]:
import re

# Character class matching emoji / pictographic code points, compiled once.
# The former catch-all range U+24C2..U+1F251 is deliberately NOT used: it spans
# most of the Basic Multilingual Plane and therefore also deleted CJK, Hangul,
# typographic ligatures and many other ordinary characters. The targeted ranges
# below keep the emoji-relevant parts of that span. Geometric-shape bullets
# (U+25A0..U+25FF) are intentionally left in the text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2"             # circled M
    "\U00002600-\U000026FF"  # Miscellaneous Symbols (sun, warning sign, ...)
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows (star, ...)
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics, flags
    "]",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic characters removed.

    Args:
        text: The string to clean. Non-string values (e.g. ``None`` or NaN
            from a missing cell) are returned unchanged instead of raising.

    Returns:
        The cleaned string, or the original value when it is not a string.
        Surrounding whitespace is left untouched.
    """
    if not isinstance(text, str):
        return text
    return _EMOJI_PATTERN.sub('', text)

# Keywords that mark a relevant job title. Every entry needs its own comma:
# Python concatenates adjacent string literals, which would silently merge two
# keywords into a single one that never matches.
_TITLE_KEYWORDS = (
    "ai", "engineer", "analyst", "architect", "programmer", "datainsinööri",
    "scientist", "trainee", "development", "developer", "support", "insinööri",
    "analyytikko", "engineers", "engineering", "analysts", "scientists",
    "consultant", "principal",
)

# Whole-word, case-insensitive matcher for any of the keywords, built once.
_TITLE_KEYWORD_PATTERN = re.compile(
    r'\b(' + '|'.join(_TITLE_KEYWORDS) + r')\b', flags=re.IGNORECASE
)


def clean_title(title):
    """Trim a job-posting title down to the part ending at its last keyword.

    Everything after the last whole-word keyword match is discarded. If the
    cut leaves unclosed "(" characters, the missing ")" are appended so the
    result stays balanced.

    Args:
        title: The raw title. Non-string values (e.g. ``None`` or NaN from a
            missing cell) are returned unchanged instead of raising.

    Returns:
        The truncated, stripped title when a keyword is found; otherwise the
        stripped original title.
    """
    if not isinstance(title, str):
        return title

    matches = list(_TITLE_KEYWORD_PATTERN.finditer(title))
    if not matches:
        # No keyword present: only normalise surrounding whitespace.
        return title.strip()

    # Keep the title up to and including the last matched keyword.
    truncated_title = title[:matches[-1].end()].strip()

    # Truncation may cut inside a parenthesised part; close whatever is open.
    missing_parens = truncated_title.count("(") - truncated_title.count(")")
    if missing_parens > 0:
        truncated_title += ")" * missing_parens

    return truncated_title

# Apply the cleaning function to the 'title' column
df_pd['title'] = df_pd['title'].apply(clean_title)

# Remove emojis from the 'description' column
df_pd['description'] = df_pd['description'].apply(remove_emojis)

# Filter to only keep rows where a keyword match was found
df_pd = df_pd[df_pd['title'].apply(lambda title: bool(re.search(r'\b(ai|engineer|analyst|architect|programmer|datainsinööri|scientist|trainee|development|developer|support|insinööri|analyytikko|engineers|analysts|scientists|consultant|principal)\b', title, flags=re.IGNORECASE)))] 

# Drop duplicates based on 'title' and 'company' columns
df_pd = df_pd.drop_duplicates(subset=['title', 'company'])

# Display the cleaned dataframe
display(df_pd)

StatementMeta(, 09cfba99-5657-42ce-86ff-1b90a87f9824, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 13f10ffb-0839-456a-8f45-d3ff3f870095)

##### Convert back to Spark DataFrame, change column data types and upload to a table in the lakehouse

In [4]:
from pyspark.sql.functions import col, when, to_date  # Make sure to import to_date

# Rebuild a Spark DataFrame from the cleaned pandas frame.
cleaned_df = spark.createDataFrame(df_pd)

# Flag columns that are converted to real booleans below.
boolean_columns = [
    'azure', 'python', 'sql', 'fabric', 'power_bi', 'koneoppiminen', 'ai', 'tekoaly',
    'machine_learning', 'analytiikka', 'analytics', 'synapse', 'databricks', 'aws',
    'amazon_web_services', 'google_cloud', 'gcp', 'snowflake', 'bigquery', 'looker',
    'spark', 'etl', 'elt', 'pipeline', 'data_mesh', 'data_vault', 'api', 'kafka',
    'tableau', 'oracle', 'tietomallinnus', 'datan_mallinnus', 'data_modeling',
    'data_warehousing', 'tietovarastointi', 'tietovarasto', 'dataintegraatio',
    'kql', 'data_lake', 'dataputki',
]

# A flag becomes True only when its value equals 1; every other value maps to False.
for flag_name in boolean_columns:
    is_set = when(col(flag_name) == 1, True).otherwise(False)
    cleaned_df = cleaned_df.withColumn(flag_name, is_set.cast('boolean'))

# Cast the descriptive columns to string; each one is replaced under its own name.
for text_column in ('y_tunnus', 'company', 'title', 'location', 'description'):
    cleaned_df = cleaned_df.withColumn(text_column, col(text_column).cast('string'))

# Parse the scrape date using the yyyy-MM-dd pattern.
cleaned_df = cleaned_df.withColumn('scrape_date', to_date('scrape_date', 'yyyy-MM-dd'))

display(cleaned_df)
cleaned_df.printSchema()

# Write silver table to lakehouse, overwriting any existing table contents.
silver_writer = cleaned_df.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("job_postings_silver")

print("Data saved to lakehouse")


StatementMeta(, 09cfba99-5657-42ce-86ff-1b90a87f9824, 6, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a30832c1-bed4-4893-9b21-80b83f789cf0)

root
 |-- y_tunnus: string (nullable = true)
 |-- company: string (nullable = true)
 |-- scrape_date: date (nullable = true)
 |-- title: string (nullable = true)
 |-- location: string (nullable = true)
 |-- description: string (nullable = true)
 |-- azure: boolean (nullable = false)
 |-- python: boolean (nullable = false)
 |-- sql: boolean (nullable = false)
 |-- fabric: boolean (nullable = false)
 |-- power_bi: boolean (nullable = false)
 |-- koneoppiminen: boolean (nullable = false)
 |-- ai: boolean (nullable = false)
 |-- tekoaly: boolean (nullable = false)
 |-- machine_learning: boolean (nullable = false)
 |-- analytiikka: boolean (nullable = false)
 |-- analytics: boolean (nullable = false)
 |-- synapse: boolean (nullable = false)
 |-- databricks: boolean (nullable = false)
 |-- aws: boolean (nullable = false)
 |-- amazon_web_services: boolean (nullable = false)
 |-- google_cloud: boolean (nullable = false)
 |-- gcp: boolean (nullable = false)
 |-- snowflake: boolean (nullable = f