In [85]:
import pandas as pd 
import re

In [86]:
it_viec = pd.read_csv(r'itviec_data_translated.csv')

In [87]:
# Đổi tên cột
it_viec.rename(columns={'Title': 'title',
                        'Company_name': 'company',  
                        'Address' : 'location', 
                        'Contract_type' : 'working_time', 
                        'Requirement': 'requirement', 
                        'Benefit' : 'salary', 
                        'Technical_skills': 'technical_skill', 
                        'Link_image' : 'image_url', 
                        'URL' : 'job_url'},
                        inplace=True)

In [88]:
# Đổi lại thứ tự các dòng
desired_columns_order = ['title', 'company', 'requirement', 'salary', 'technical_skill', 'location', 'working_time', 'image_url', 'job_url']
it_viec = it_viec.reindex(columns=desired_columns_order)

In [89]:
# Xử lý technical skill 
def split_sentences_in_each_req(x):
    # x = x.replace('\\n', '\n')
    sentences = x.split(',')
    return sentences

In [90]:
def sentences_preprocessing(sen):
    res = re.sub(r'\d+|[^\w\s,./!#]', '', sen)
    res = " ".join(res.split())
    return res

In [91]:
def get_technical_skill_from_sentences(sentence):
    words = sentence.split()
    uppercase_words = []

    i = 1  
    while i < len(words):
        if words[i][0].isupper():
            if i + 2 < len(words) and words[i + 1][0].isupper() and words[i + 2][0].isupper() and words[i][-1] != ',' and words[i][-1] != '.' and words[i + 1][-1] != ',' and words[i + 1][-1] != '.':
                uppercase_words.append(words[i] + ' ' + words[i + 1] + ' ' + words[i + 2])
                i += 3

            elif i + 1 < len(words) and words[i + 1][0].isupper() and words[i][-1] != ',' and words[i][-1] != '.':
                uppercase_words.append(words[i] + ' ' + words[i + 1])
                i += 2
            
            else:
                uppercase_words.append(words[i])
                i += 1
        else:
            i += 1  

    return uppercase_words

In [92]:
def get_technical_skills(X : str): 
    skill = []
    tech_sen = split_sentences_in_each_req(X)

    for sen in tech_sen:
        tech_sen_preprocessed = sentences_preprocessing(sen)
        tech_skill = get_technical_skill_from_sentences(tech_sen_preprocessed)
        skill.extend(tech_skill)

    return skill 
  

In [93]:
df_none = it_viec[it_viec['technical_skill'] == '[]'].copy()
condition = it_viec['technical_skill'].apply(lambda x: x == '[]')
it_viec_clean = it_viec[~condition].copy()

In [94]:
df_none_req = df_none['requirement']

In [95]:
df_none['technical_skill'] = df_none['requirement'].apply(get_technical_skills)

In [96]:
df_none['technical_skill']

18     [Playwright, Katalon, Java, API, TestNG/Mocha/...
38     [Game, Hyper, D, Experience, D, Artists, Devel...
39     [Wacom, Spine, Photoshop, Jira, Pipeline, Engl...
47     [Project Manager, English, Vietnamese., Agile,...
48                                   [English, Web, Web]
                             ...                        
728    [Agile, Waterfall, IT, ERP, Cases, Project Man...
730    [DevOps, System Engineer/System Administrator,...
732                                                   []
734    [Degree, Information Technology, Science, Reac...
735    [Engineering, Science, ExpertisenA Minimum, Sk...
Name: technical_skill, Length: 130, dtype: object

In [97]:
it_viec_new = pd.concat([df_none, it_viec_clean])

In [98]:
condition = it_viec_new['technical_skill'].apply(lambda x: x == [])
it_viec_clean_new = it_viec_new[~condition].copy()

In [99]:
it_viec_clean_new.shape

(707, 9)

In [100]:
it_viec_clean_new.to_csv('demo.csv', index = False)

In [101]:
nearly_clean = pd.read_csv(r'demo.csv')

In [102]:
print(type(nearly_clean['technical_skill'][0]))

<class 'str'>


In [103]:
def split_by_skill(skill_string):
    cleaned_string = skill_string.strip("[]").replace("'", "")
    skill_list = [s.strip() for s in cleaned_string.split(',')]

    return skill_list


def split_by_slash(skill_list): 
    new_skills = [] 
    for skill in skill_list:
        new_skills.extend(skill.split('/'))
    
    return new_skills

def split_by_space(skill_list):
    new_skills = [] 
    for skill in skill_list:
        new_skills.extend(skill.split())
    
    return new_skills

def remove_special_chars(skill_list):
    pattern = r'[^a-zA-Z0-9+# ]'
    processed_strings = [re.sub(pattern, '', s).strip() for s in skill_list]
    
    return processed_strings

def lowercase(skill_list):
    new_skills = [] 
    for skill in skill_list:
        new_skills.append(skill.lower())
    
    return new_skills


def to_set(skill_list):
    return list(set(skill_list))

In [104]:
nearly_clean['technical_skill'] = nearly_clean['technical_skill'].apply(split_by_skill)
nearly_clean['technical_skill'] = nearly_clean['technical_skill'].apply(split_by_slash)
nearly_clean['technical_skill'] = nearly_clean['technical_skill'].apply(split_by_space)
nearly_clean['technical_skill'] = nearly_clean['technical_skill'].apply(remove_special_chars)
nearly_clean['technical_skill'] = nearly_clean['technical_skill'].apply(lowercase)
nearly_clean['technical_skill'] = nearly_clean['technical_skill'].apply(to_set)

In [105]:
import csv

def load_words_from_csv(csv_file):
    words_set = set()
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        for row in reader:
            if row:  
                word = row[0].strip()
                words_set.add(word)
    return words_set

# Load words from CSV into a set
csv_file_path = r'..\techcical_list.csv'  
words_set = load_words_from_csv(csv_file_path)

def filter_words(word_list):
    return [word for word in word_list if word in words_set]

In [106]:
words_set = load_words_from_csv(csv_file_path)

def fill_again(skill_list):
    filled_list = [word for word in skill_list if word in words_set]
    return filled_list 

In [107]:
nearly_clean['technical_skill'] = nearly_clean['technical_skill'].apply(fill_again)

In [108]:
nearly_clean['technical_skill']

0      [api, katalon, testng, java, mocha, jasmine, n...
1                               [game, unity, balancing]
2                                      [photoshop, jira]
3                                [project, scrum, agile]
4                                                  [web]
                             ...                        
702                             [database, python, java]
703                         [reactjs, javascript, vuejs]
704                                       [linux, cloud]
705                           [agile, analyst, business]
706                          [analyst, tester, business]
Name: technical_skill, Length: 707, dtype: object

In [109]:
nearly_clean.to_csv('clean_it_viec.csv', index = False)

In [1]:
import pandas as pd 
df = pd.read_csv('clean_it_viec.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707 entries, 0 to 706
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            707 non-null    object
 1   company          707 non-null    object
 2   requirement      707 non-null    object
 3   salary           707 non-null    object
 4   technical_skill  707 non-null    object
 5   location         707 non-null    object
 6   working_time     707 non-null    object
 7   image_url        707 non-null    object
 8   job_url          707 non-null    object
dtypes: object(9)
memory usage: 49.8+ KB
