# LIBRARIES

In [52]:
import pandas as pd
import matplotlib.pyplot as plt
import csv
from bs4 import BeautifulSoup # For handling the HTML in the dataset
import re

# ABOUT THE DATASET

## Import the dataset

In [None]:
# Function to remove HTML tags
def clean_html(html_text):
    """Return the visible text of an HTML fragment.

    Missing values (anything ``pd.isna`` reports as missing) are handed back
    unchanged. Any other value is parsed with BeautifulSoup's ``html.parser``;
    the text pieces are stripped and joined with single spaces.
    """
    if not pd.isna(html_text):
        parsed = BeautifulSoup(html_text, 'html.parser')
        return parsed.get_text(separator=' ', strip=True)
    # Missing value: nothing to parse, return it untouched.
    return html_text

# Function to remove unwanted phrases
def clean_text(text):
    if pd.isna(text):  # Handle NaN values
        return text
    # List of phrases to remove
    phrases = [
        r'Mô tả Công việc\s*[-:]*\s*',  # Matches "Mô tả Công việc", "Mô tả Công việc -", etc.
        r'Yêu Cầu Công Việc\s*[-:–]*\s*',  # Matches "Yêu Cầu Công Việc", "Yêu Cầu Công Việc –", etc.
        r'Yêu cầu\s*[-:–]*\s*',  # Matches "Yêu cầu", "Yêu cầu:", "Yêu cầu –", etc.
        """
        r'Quyền lợi\s*[-:]*\s*',  # Matches "Quyền lợi", "Quyền lợi:", etc.
        r'Thông tin khác\s*[-:]*\s*',  # Matches "Thông tin khác", etc.
        r'Thời gian, địa điểm làm việc\s*[-:]*\s*',  # Matches "Thời gian, địa điểm làm việc", etc.
        r'Yêu cầu\s*[-:]*\s*',  # Matches "Yêu cầu", "Yêu cầu:", etc.
        r'Địa điểm, thời gian làm việc\s*[-:]*\s*',  # Matches "Địa điểm, thời gian làm việc", etc. 
        """ # ALL OF THESE ARE IN THE OTHER INFO SO WE STILL SOMEWHAT NEED THE TAGS (Sadge)
    ]
    # Combine phrases into a single regex pattern
    pattern = '|'.join(phrases)
    # Remove phrases
    cleaned_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    # Remove bullet point icon (•) and surrounding spaces
    cleaned_text = re.sub(r'\s*•\s*', ' ', cleaned_text)
    # Remove extra whitespace and normalize
    cleaned_text = ' '.join(cleaned_text.split())
    return cleaned_text

# Read the CSV file
# The path and parser options are defined once so both read attempts use
# exactly the same settings.
DATA_PATH = 'Data/vietnamese-job-posting.csv'
READ_CSV_OPTIONS = {
    'quoting': csv.QUOTE_ALL,
    'on_bad_lines': 'skip',  # malformed rows are dropped without a message
}

try:
    df = pd.read_csv(DATA_PATH, encoding='utf-8', **READ_CSV_OPTIONS)
except UnicodeDecodeError:
    # NOTE(review): 'latin1' maps every byte to a character, so this fallback
    # never raises a decode error, but Vietnamese diacritics would come out
    # garbled. Inspect the text if this branch is ever taken.
    print("Encoding error. Trying with 'latin1' encoding...")
    df = pd.read_csv(DATA_PATH, encoding='latin1', **READ_CSV_OPTIONS)
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise instead of calling exit(): exit() ends the session and hides
    # the traceback, while a raised exception stops this cell with full
    # details and keeps later cells from running on an undefined `df`.
    raise

# Free-text columns that go through both cleaning passes
columns_to_clean = ['job_description', 'job_requirements', 'other_info']

# Pass 1 strips the HTML markup from every listed column; pass 2 then removes
# the unwanted phrases, bullets and extra whitespace from the same columns.
for cleaner in (clean_html, clean_text):
    for text_column in columns_to_clean:
        df[text_column] = df[text_column].apply(cleaner)

# Drop unnecessary/complex columns
# (other_info would require NLP processing to be usable — ON HOLD)
columns_to_drop = ['company_video_url', 'company_url', 'job_url', 'other_info']
df = df.drop(columns=columns_to_drop)

# Show the first few rows to verify the cleaning
print("First 5 Rows After Cleaning:", df.head(), sep="\n")

  """


First 5 Rows After Cleaning:
                                           job_title    job_id  \
0                              Performance Marketing  35BB6CB4   
1                   Organization Development Manager  35BB6E66   
2                  Thiết kế đồ họa- Graphic Designer  35BB6CF1   
3  [HCM-Phú Nhuận] Chuyên viên kinh doanh mảng ph...  35BB6E1A   
4                     Nhân Viên Kinh Doanh Phân Phối  35BB6CBF   

                     company_title                    salary  \
0  Công ty Cổ phần Thời Trang YODY  Lương: 15 Tr - 25 Tr VND   
1  Công ty Cổ phần Thời Trang YODY  Lương: 30 Tr - 40 Tr VND   
2          Công Ty TNHH TM AN SINH  Lương: 12 Tr - 20 Tr VND   
3                          Bảo mật  Lương: 10 Tr - 15 Tr VND   
4          Công Ty TNHH TM AN SINH         Lương: Cạnh tranh   

               location                        outstanding_welfare  \
0  Hà Nội | Hồ Chí Minh         Laptop | Chế độ bảo hiểm | Du Lịch   
1    Hải Dương | Hà Nội         Laptop | Chế độ b

In [54]:
# Save the cleaned DataFrame to a new CSV file
df.to_csv('Data/cleaned_vietnamese_job_posting.csv', index=False, encoding='utf-8')

# Display basic info about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
print("\nDataFrame Info:")
df.info()


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2056 entries, 0 to 2055
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_title            2056 non-null   object
 1   job_id               2056 non-null   object
 2   company_title        2056 non-null   object
 3   salary               2056 non-null   object
 4   location             2056 non-null   object
 5   outstanding_welfare  1991 non-null   object
 6   announcement_date    2056 non-null   object
 7   category             1945 non-null   object
 8   position             1945 non-null   object
 9   exp                  1945 non-null   object
 10  order                1945 non-null   object
 11  expiration_date      1697 non-null   object
 12  detailed_welfare     1896 non-null   object
 13  job_description      2038 non-null   object
 14  job_requirements     2038 non-null   object
 15  job_tags             1554 non-null   o

In [55]:
# Missing values per column, before cleaning
print(df.isna().sum())

# Drop every row that has a missing value (the data is too complicated to
# fill in without extensive research — ON HOLD), then remove duplicate rows
# (if any at all; unlikely in a dataset as chaotic as this).
df = df.dropna().drop_duplicates()

# Missing values per column, after cleaning (all counts should now be zero)
print(df.isna().sum())

job_title                0
job_id                   0
company_title            0
salary                   0
location                 0
outstanding_welfare     65
announcement_date        0
category               111
position               111
exp                    111
order                  111
expiration_date        359
detailed_welfare       160
job_description         18
job_requirements        18
job_tags               502
dtype: int64
job_title              0
job_id                 0
company_title          0
salary                 0
location               0
outstanding_welfare    0
announcement_date      0
category               0
position               0
exp                    0
order                  0
expiration_date        0
detailed_welfare       0
job_description        0
job_requirements       0
job_tags               0
dtype: int64


In [56]:
# Summarize the frame (row count, non-null counts, dtypes) after the rows with
# missing values and the duplicate rows have been dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1289 entries, 1 to 2054
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_title            1289 non-null   object
 1   job_id               1289 non-null   object
 2   company_title        1289 non-null   object
 3   salary               1289 non-null   object
 4   location             1289 non-null   object
 5   outstanding_welfare  1289 non-null   object
 6   announcement_date    1289 non-null   object
 7   category             1289 non-null   object
 8   position             1289 non-null   object
 9   exp                  1289 non-null   object
 10  order                1289 non-null   object
 11  expiration_date      1289 non-null   object
 12  detailed_welfare     1289 non-null   object
 13  job_description      1289 non-null   object
 14  job_requirements     1289 non-null   object
 15  job_tags             1289 non-null   object
dtypes: object(1