In [17]:
import pandas as pd
import re
import psycopg2
from psycopg2.extras import execute_values

# --------------------------------------
# 1. READ CSV + CLEAN EXTRA SPACES
# --------------------------------------

def clean_spaces(val):
    """Remove duplicate/multiple spaces in any cell."""
    if pd.isna(val):
        return val
    return re.sub(r"\s+", " ", str(val)).strip()

csv_path = "DataEngineer_Cleaned.csv"   # <--- your CSV file path
df = pd.read_csv(csv_path, dtype=str).applymap(clean_spaces)


  df = pd.read_csv(csv_path, dtype=str).applymap(clean_spaces)


In [18]:
# --------------------------------------
# 2. DATABASE CONNECTION
# --------------------------------------

conn = psycopg2.connect(
    host="jde08-ip-p2-angbj1976-c47c.c.aivencloud.com",
    dbname="Interim_Project_DB",
    user="********",
    password="********",
    port=15241
)

cur = conn.cursor()


In [19]:
# --------------------------------------
# 3. VERIFY COUNTRIES IN dim_countries
# --------------------------------------

def get_existing_countries():
    cur.execute("SELECT country_code FROM dim_countries;")
    return {row[0]: row[0] for row in cur.fetchall()}  # country_name → code

existing_countries = get_existing_countries()

unique_countries = df["Country"].dropna().unique()

# Country code mapping (you can extend this)
country_code_mapping = {
    "US": "US",
    "SG": "SG",
    "MY": "MY",
    "UK": "UK"
}


# Insert missing countries
#countries_to_insert = []
#for country in unique_countries:
#    if country not in existing_countries:
#        if country in country_code_mapping:
#            countries_to_insert.append((country_code_mapping[country], country))

#if countries_to_insert:
#    insert_sql = """
#     INSERT INTO dim_countries (country_code, country_name)
#     VALUES %s
#     ON CONFLICT (country_code) DO NOTHING;
#    """
#    execute_values(cur, insert_sql, countries_to_insert)
#    conn.commit()

# Refresh country list
#existing_countries = get_existing_countries()

In [20]:
# --------------------------------------
# 4. INSERT COMPANIES INTO dim_companies
# --------------------------------------

def clean_company_name(name):
 #   if not isinstance(name, str):
 #       return name
    
    x = name.strip().lower()
  #  print('z',x)
    if "collinwood" in x:
        print('y',x)
    # Remove extra punctuation
    x = re.sub(r"[^a-z0-9\s]", " ", x)

    # Collapse multiple spaces
    x = " ".join(x.split())
    # Optional: remove trailing words like "pte ltd", "ltd", "solutions" if used in lookup rules
    remove_terms = [
        r"\bpte ltd\b",
        r"\bltd\b",
        r"\bprivate limited\b",
        r"\bplc\b",
        r"\bllc\b", 
        r"\binc\b", 
        r"\bltd\b"  
    ]

    for term in remove_terms:
        x = re.sub(term, "", x)
    #fix lefover amp 038
    x=x.replace("amp 038","&")
    # Trim after removal
    x = " ".join(x.split())

    # Title-case after cleaning
    x = x.title()

    return x

company_values = []
for _, row in df.iterrows(): 
    comp = row["Company Name"]
    comp=clean_company_name(comp)
    if "Collinwood" in comp:
        print('x',comp)
    country = row["Country"]
    country_code = existing_countries.get(country)  # may be None
    country_code='US'
    company_values.append((comp, country_code))

company_insert_sql = """
    INSERT INTO dim_companies (company_name, country_code)
    VALUES %s
    ON CONFLICT (company_name) DO NOTHING
    ;
"""
execute_values(cur, company_insert_sql, company_values)
conn.commit()

# Build dictionary: company_name → company_id
cur.execute("SELECT company_id, company_name FROM dim_companies;")
company_lookup = {row[1]: row[0] for row in cur.fetchall()}

y collinwood technology partners
x Collinwood Technology Partners
y collinwood technology partners, llc
x Collinwood Technology Partners


In [21]:
import csv
import re

failed_rows = []
job_rows = []
print(df.head(10))
for idx, row in df.iterrows():

    job_country = row["Country"]

    # --------------------------------------------------------
    # 1) Country Validation
    # --------------------------------------------------------
   
    if job_country not in existing_countries:
        failed_rows.append(row.to_dict())   # FIXED
        continue

    country_code = existing_countries[job_country]
    comp=row["Company Name"]
    comp=clean_company_name(comp)
    print(comp)
    comp_id = company_lookup.get(comp)
    if comp_id is None: 
        print('fail', comp)
        failed_rows.append(row.to_dict())   # FIXED
        continue

    # --------------------------------------------------------
    # 2) Build JobListingID with MIC- prefix
    # --------------------------------------------------------
    job_id = f"MIC_{idx+1:05d}"

    # --------------------------------------------------------
    # 3) Parse Salary Min and Max
    # --------------------------------------------------------
    salary_text = str(row["Salary Estimate"])  # ensure string
    numbers = re.findall(r"\d+", salary_text)

    def safe_int(x):
        try:
            return int(x) * 1000
        except:
            return None

    if len(numbers) >= 2:
        salary_min = safe_int(numbers[0])
        salary_max = safe_int(numbers[1])
    elif len(numbers) == 1:
        salary_min = safe_int(numbers[0])
        salary_max = None
    else:
        salary_min = None
        salary_max = None

    # --------------------------------------------------------
    # 4) Store Valid Job Row
    # --------------------------------------------------------
    job_rows.append((
        job_id,
        comp_id,
        row["Job Title"],
        salary_min,
        salary_max,
        row["Job Description"]
    ))

# --------------------------------------------------------
# 5) Write failed rows
# --------------------------------------------------------
if failed_rows:
    failed_file = "source_failed_country.csv"
    with open(failed_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=df.columns)
        writer.writeheader()
        writer.writerows(failed_rows)

    print(f"{len(failed_rows)} rows skipped (invalid country). Written to {failed_file}")

# --------------------------------------------------------
# 6) Insert into fact_jobs
# --------------------------------------------------------
try:
    job_insert_sql = """
        INSERT INTO fact_jobs (
            JobListingID, company_id, job_title,
            salary_min, salary_max, description
        )
        VALUES %s
        ON CONFLICT (JobListingID) DO NOTHING;
    """
    execute_values(cur, job_insert_sql, job_rows)
    conn.commit()

except Exception as e:
    print("Error inserting into fact_jobs:", e)
    conn.rollback()




                                           Job Title  \
0  Big Data Developer ( Applications Systems Engi...   
1           Project Staff, Consultant, Data Engineer   
2                               Senior Data Engineer   
3  SONUS Engineer/Senior Engineer, IT Network Inf...   
4                                 Ab Initio Engineer   
5                             ADAS Test V&V Engineer   
6                               Advanced AI Engineer   
7                      Advanced Python Data Engineer   
8      Advanced Software Engineer - Server - 20-0100   
9  Advisory Services - Consultant: Data Engineer ...   

               Salary Estimate  \
0  $66K-$122K (Glassdoor est.)   
1  $81K-$141K (Glassdoor est.)   
2  $60K-$109K (Glassdoor est.)   
3  $60K-$109K (Glassdoor est.)   
4  $73K-$126K (Glassdoor est.)   
5  $79K-$104K (Glassdoor est.)   
6   $62K-$92K (Glassdoor est.)   
7  $67K-$126K (Glassdoor est.)   
8  $53K-$100K (Glassdoor est.)   
9  $81K-$100K (Glassdoor est.)   

        