In [2]:
import pandas as pd

import numpy as np
 
# Creating a sample dataset

data = {'order_id': np.arange(1, 1001)}  # order_id from 1 to 1000

df = pd.DataFrame(data)
 
# Check the default data type

print(df['order_id'].dtype)  # This will show int64

print(f"Memory Usage: {df.memory_usage(deep=True)}")

# Convert order_id to int16 (smallest type that fits)

df['order_id'] = df['order_id'].astype(np.int16)
 
# Check the memory usage

print(f"Memory Usage: {df.memory_usage(deep=True)}")
 
#Memory Saving: The int16 takes only 2 bytes per value, whereas int64 takes 8 bytes.
 
# Creating a dataset with float values

data = {'sales_amount': np.random.random(1000000)}  # Random float values

df = pd.DataFrame(data)
 
# Check the default data type

print(df['sales_amount'].dtype)  # This will show float64
 
# Convert to float32 for memory optimization

df['sales_amount'] = df['sales_amount'].astype(np.float32)
 
# Check the memory usage

print(f"Memory Usage: {df.memory_usage(deep=True)}")
 
#float32 takes 4 bytes compared to 8 bytes for float64.


int32
Memory Usage: Index        132
order_id    4000
dtype: int64
Memory Usage: Index        132
order_id    2000
dtype: int64
float64
Memory Usage: Index               132
sales_amount    4000000
dtype: int64


In [4]:
import pandas as pd
 # Creating a dataset with repeated string values
data = {'city': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Los Angeles'] * 100000}
df = pd.DataFrame(data)
 
# Check the memory usage with object type
print(df['city'].dtype)  # This will show object (i.e., string)
 
# Convert to category type
df['city'] = df['city'].astype('category')
 
# Check the memory usage after conversion
print(f"Memory Usage: {df.memory_usage(deep=True)}")
#1. Deletion
#Listwise Deletion: Remove rows with any missing values.
data_cleaned = df.dropna()

object
Memory Usage: Index       132
city     500305
dtype: int64


In [17]:
df=pd.read_csv('data job posts.csv')
df.head()

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True


In [26]:
print(df.isnull().sum())

jobpost                 0
date                    0
Title                  28
Company                 7
AnnouncementCode    17793
Term                11325
Eligibility         14071
Audience            18361
StartDate            9326
Duration             8203
Location               32
JobDescription       3892
JobRequirment        2522
RequiredQual          484
Salary               9379
ApplicationP           60
OpeningDate           706
Deadline               65
Notes               16790
AboutC               6531
Attach              17442
Year                    0
Month                   0
IT                      0
dtype: int64


In [29]:
df_cleaned = df.dropna()
df_cleaned

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT


In [32]:
#  Handling Missing Values
print(df.isnull().sum())

# Fill missing values: mode for categorical, median for numerical
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)  # Fill categorical with mode

for col in df.select_dtypes(include=['number']).columns:
    df[col].fillna(df[col].median(), inplace=True)  # Fill numeric with median

# Drop irrelevant columns if they contain excessive noise
df.drop(columns=["Attach", "Notes"], errors='ignore', inplace=True)
df

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,JobRequirment,RequiredQual,Salary,ApplicationP,OpeningDate,Deadline,AboutC,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,IOS - 001,Full time,All qualified candidates,All interested candidates,ASAP,Long term,...,- Supervises financial management and administ...,"To perform this job successfully, an\r\nindivi...",Competitive,"To apply for this position, please submit a\r\...",17 July 2009,26 January 2004,Mentor Graphics Development Services CJSC is a...,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),IOS - 001,Full time,All qualified candidates,All interested candidates,ASAP,3 months,...,- Participate in application design;\r\n- Prov...,- Bachelor's Degree; Master's is preferred;\r\...,Competitive,Please submit a cover letter and resume to:\r\...,17 July 2009,12 January 2004,The International Research & Exchanges Board (...,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),IOS - 001,Full time,All qualified candidates,All interested candidates,ASAP,Renewable annual contract\r\nPOSITION,...,- Working with the Country Director to provide...,"- Degree in environmentally related field, or ...",Competitive,Please send resume or CV toursula.kazarian@......,17 July 2009,20 January 2004\r\nSTART DATE: February 2004,The Caucasus Environmental NGO Network is a\r\...,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,IOS - 001,Full time,All qualified candidates,All interested candidates,ASAP,Long term,...,- Identify gaps in knowledge and overseeing in...,"- Advanced degree in public health, social sci...",Competitive,Please send cover letter and resume to Amy\r\n...,17 July 2009,23 January 2004\r\nSTART DATE: Immediate,Mentor Graphics Development Services CJSC is a...,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,IOS - 001,Full time,All qualified candidates,All interested candidates,ASAP,Long term,...,- Rendering technical assistance to Database M...,- University degree; economical background is ...,Competitive,Successful candidates should submit\r\n- CV; \...,17 July 2009,"20 January 2004, 18:00",Mentor Graphics Development Services CJSC is a...,2004,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18996,Technolinguistics NGO\r\n\r\n\r\nTITLE: Senio...,"Dec 28, 2015",Senior Creative UX/ UI Designer,Technolinguistics NGO,IOS - 001,Full-time,All qualified candidates,All interested candidates,ASAP,Long-term,...,- Work closely with product and business teams...,- At least 5 years of experience in Interface/...,Competitive,"To apply for this position, please send your\r...",29 December 2015,28 January 2016,As a company Technolinguistics has a mandate t...,2015,12,False
18997,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Category Development Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",IOS - 001,Full-time,All interested professionals.,All interested candidates,ASAP,Long-term with a probation period of 3 months.,...,- Establish and manage Category Management dev...,"- University degree, ideally business related;...",Competitive,All interested candidates are kindly requested...,30 December 2015,20 January 2016,Mentor Graphics Development Services CJSC is a...,2015,12,False
18998,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Operational Marketing Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",IOS - 001,Full-time,All interested professionals.,All interested candidates,ASAP,Long-term with a probation period of 3 months.,...,"- Develop, establish and maintain marketing st...","- Degree in Business, Marketing or a related f...",Competitive,All interested candidates are kindly requested...,30 December 2015,20 January 2016,Mentor Graphics Development Services CJSC is a...,2015,12,False
18999,San Lazzaro LLC\r\n\r\n\r\nTITLE: Head of O...,"Dec 30, 2015",Head of Online Sales Department,San Lazzaro LLC,IOS - 001,Full time,All qualified candidates,All interested candidates,ASAP,Long-term,...,- Handle the project activites of the online s...,- At least 1 year of experience in online sale...,Highly competitive,Interested candidates can send their CVs to:\r...,30 December 2015,29 January 2016,San Lazzaro LLC works with several internation...,2015,12,False


In [33]:
pip install pandas numpy beautifulsoup4 nltk

Note: you may need to restart the kernel to use updated packages.


In [34]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from datetime import datetime

# Download required NLTK data
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cvr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [36]:
# String Operations & Text Processing
#(a) Clean JobDescription & JobRequirement
def clean_text(text):
    if pd.isna(text):
        return ""
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra whitespace
    text = text.lower()  # Convert to lowercase
    return text

df["JobD"] = df["JobDescription"].apply(clean_text)
df["JobRequirment"] = df["JobRequirment"].apply(clean_text)


  text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags


In [37]:
# Extract Key Skills from JobDescription
skills = ["python", "java", "sql", "machine learning", "data science", "excel", "aws", "cloud"]

def extract_skills(text):
    tokens = word_tokenize(text)
    found_skills = [skill for skill in skills if skill in tokens]
    return ", ".join(found_skills)

df["KeySkills"] = df["JobDescription"].apply(extract_skills)


In [38]:
# Normalize Title & Standardize Term
title_mapping = {
    "software eng.": "Software Engineer",
    "dev": "Developer",
    "full stack dev": "Full Stack Developer"
}

df["Title"] = df["Title"].str.lower().replace(title_mapping)

df["Term"] = df["Term"].str.replace(r"ft|full time", "Full-time", regex=True)
df["Term"] = df["Term"].str.replace(r"pt|part time", "Part-time", regex=True)


In [40]:
#Date operations
# Convert date columns to datetime
date_cols = ["date", "StartDate", "OpeningDate", "Deadline"]
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors="coerce")

# Calculate Days Left to Apply
df["DaysLeftToApply"] = (df["Deadline"] - df["OpeningDate"]).dt.days

# Calculate Job Start Delay
df["JobStartDelay"] = (df["StartDate"] - df["OpeningDate"]).dt.days

# Extract Year and Month
df["OpeningYear"] = df["OpeningDate"].dt.year
df["OpeningMonth"] = df["OpeningDate"].dt.month


In [41]:
# Splitting & Extracting Data
#(a) Split Location into City & Country
df[["City", "Country"]] = df["Location"].str.split(",", expand=True, n=1)


In [42]:
#Extract Numeric Salary Range
def extract_salary(salary_text):
    if pd.isna(salary_text):
        return pd.Series([None, None])
    
    numbers = re.findall(r"\d+", salary_text)  # Extract numbers
    if len(numbers) >= 2:
        return pd.Series([int(numbers[0]) * 1000, int(numbers[1]) * 1000])  # Convert "50K" to "50000"
    elif len(numbers) == 1:
        return pd.Series([int(numbers[0]) * 1000, None])
    return pd.Series([None, None])

df[["MinSalary", "MaxSalary"]] = df["Salary"].apply(extract_salary)


In [43]:
# Filtering & Categorization
#(a) Identify IT-Related Jobs
df["IsITJob"] = df["IT"].apply(lambda x: 1 if str(x).lower() in ["yes", "it", "tech"] else 0)


In [44]:
# Categorize Eligibility & Audience into Structured Education Level
def categorize_education(education):
    education = str(education).lower()
    if "phd" in education:
        return "PhD"
    elif "master" in education or "mba" in education:
        return "Master's"
    elif "bachelor" in education or "bsc" in education:
        return "Bachelor's"
    elif "diploma" in education:
        return "Diploma"
    else:
        return "Other"

df["EducationLevel"] = df["Eligibility"].apply(categorize_education)


In [45]:
df


Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,DaysLeftToApply,JobStartDelay,OpeningYear,OpeningMonth,City,Country,MinSalary,MaxSalary,IsITJob,EducationLevel
0,AMERIA Investment Consulting Company\r\nJOB TI...,2004-01-05,chief financial officer,AMERIA Investment Consulting Company,IOS - 001,Full time,All qualified candidates,All interested candidates,NaT,Long term,...,-1999.0,,2009.0,7.0,Yerevan,Armenia,,,0,Other
1,International Research & Exchanges Board (IREX...,2004-01-07,full-time community connections intern (paid i...,International Research & Exchanges Board (IREX),IOS - 001,Full time,All qualified candidates,All interested candidates,NaT,3 months,...,-2013.0,,2009.0,7.0,IREX Armenia Main Office; Yerevan,Armenia \r\nDESCRIPTION: IREX currently see...,,,0,Other
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,2004-01-07,country coordinator,Caucasus Environmental NGO Network (CENN),IOS - 001,Full time,All qualified candidates,All interested candidates,NaT,Renewable annual contract\r\nPOSITION,...,,,2009.0,7.0,Yerevan,Armenia,,,0,Other
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,2004-01-07,bcc specialist,Manoff Group,IOS - 001,Full time,All qualified candidates,All interested candidates,NaT,Long term,...,,,2009.0,7.0,Manila,Philippines,,,0,Other
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,2004-01-10,software developer,Yerevan Brandy Company,IOS - 001,Full time,All qualified candidates,All interested candidates,NaT,Long term,...,,,2009.0,7.0,Yerevan,Armenia,,,0,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18996,Technolinguistics NGO\r\n\r\n\r\nTITLE: Senio...,2015-12-28,senior creative ux/ ui designer,Technolinguistics NGO,IOS - 001,Full-time,All qualified candidates,All interested candidates,NaT,Long-term,...,30.0,,2015.0,12.0,Yerevan,Armenia,,,0,Other
18997,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",2015-12-30,category development manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",IOS - 001,Full-time,All interested professionals.,All interested candidates,NaT,Long-term with a probation period of 3 months.,...,21.0,,2015.0,12.0,Yerevan,Armenia,,,0,Other
18998,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",2015-12-30,operational marketing manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",IOS - 001,Full-time,All interested professionals.,All interested candidates,NaT,Long-term with a probation period of 3 months.,...,21.0,,2015.0,12.0,Yerevan,Armenia,,,0,Other
18999,San Lazzaro LLC\r\n\r\n\r\nTITLE: Head of O...,2015-12-30,head of online sales department,San Lazzaro LLC,IOS - 001,Full time,All qualified candidates,All interested candidates,NaT,Long-term,...,30.0,,2015.0,12.0,Yerevan,Armenia,,,0,Other


In [46]:
df.to_csv("cleaned_job_postings.csv", index=False)
