In [1]:
import pandas as pd 
import re

In [2]:
topcv = pd.read_csv(r'topcv_translated.csv')

In [3]:
topcv.columns

Index(['job_title', 'company_name', 'requirement', 'salary', 'location',
       'image_link', 'job_url'],
      dtype='object')

In [4]:
# Đổi tên cột
topcv.rename(columns={'job_title': 'title',
                        'company_name': 'company',  
                        'image_link' : 'image_url'},
                        inplace=True)

In [5]:
# Đổi lại thứ tự các dòng
desired_columns_order = ['title', 'company', 'requirement', 'salary', 'location', 'image_url', 'job_url']
topcv = topcv.reindex(columns=desired_columns_order)
topcv.insert(loc=4, column='technical_skill', value=None)
topcv.insert(loc=6, column='working_time', value='Fulltime')

In [6]:
# Xử lý technical skill cho topcv 
def split_sentences_in_each_req(x):
    x = x.replace('\\n', '\n')
    sentences = x.split('\n')
    return sentences

In [7]:
def sentences_preprocessing(sen):
    res = re.sub(r'\d+|[^\w\s,./!#]', '', sen)
    res = " ".join(res.split())
    return res

In [8]:
def get_technical_skill_from_sentences(sentence):
    words = sentence.split()
    uppercase_words = []

    i = 1  
    while i < len(words):
        if words[i][0].isupper():
            if i + 2 < len(words) and words[i + 1][0].isupper() and words[i + 2][0].isupper() and words[i][-1] != ',' and words[i][-1] != '.' and words[i + 1][-1] != ',' and words[i + 1][-1] != '.':
                uppercase_words.append(words[i] + ' ' + words[i + 1] + ' ' + words[i + 2])
                i += 3

            elif i + 1 < len(words) and words[i + 1][0].isupper() and words[i][-1] != ',' and words[i][-1] != '.':
                uppercase_words.append(words[i] + ' ' + words[i + 1])
                i += 2
            
            else:
                uppercase_words.append(words[i])
                i += 1
        else:
            i += 1  

    return uppercase_words

In [9]:
def get_technical_skills(X : str): 
    skill = []
    tech_sen = split_sentences_in_each_req(X)

    for sen in tech_sen:
        tech_sen_preprocessed = sentences_preprocessing(sen)
        tech_skill = get_technical_skill_from_sentences(tech_sen_preprocessed)
        skill.extend(tech_skill)

    return skill 

In [10]:
topcv_req = topcv['requirement']

In [11]:
topcv['technical_skill'] = topcv_req.apply(get_technical_skills)

In [12]:
topcv['technical_skill']

0       [College, Information Technology,, Computer Sc...
1       [React.js, Computer Science,, Information Tech...
2       [Degree, IT, ReactJS, VueJS, Typescript ES, CS...
3                                   [Unity Having, Unity]
4          [Javascript, Typescript, Cocos Creator/ Unity]
                              ...                        
1840    [Information Technology,, Computer Science,, I...
1841    [IT, WEB, PHP/HTML/CSS/JAVASCRIPT., MVC Framew...
1842    [Information Technology,, Computer Science,, I...
1843    [IT,, Computer Science,, Mathematics..., Java,...
1844    [Backend, Java, SQL,, PL/SQL,, Oracle/MySQL,, ...
Name: technical_skill, Length: 1845, dtype: object

In [13]:
def split_by_slash(skill_list): 
    new_skills = [] 
    for skill in skill_list:
        new_skills.extend(skill.split('/'))
    
    return new_skills

def split_by_space(skill_list):
    new_skills = [] 
    for skill in skill_list:
        new_skills.extend(skill.split())
    
    return new_skills

def lowercase(skill_list):
    new_skills = [] 
    for skill in skill_list:
        new_skills.append(skill.lower())
    
    return new_skills

def remove_special_chars(skill_list):
    pattern = r'[^a-zA-Z0-9+# ]'
    processed_strings = [re.sub(pattern, '', s).strip() for s in skill_list]
    
    return processed_strings

def to_set(skill_list):
    return list(set(skill_list))

In [14]:
topcv['technical_skill'] = topcv['technical_skill'].apply(split_by_slash)
topcv['technical_skill'] = topcv['technical_skill'].apply(split_by_space)
topcv['technical_skill'] = topcv['technical_skill'].apply(remove_special_chars)
topcv['technical_skill'] = topcv['technical_skill'].apply(lowercase)
topcv['technical_skill'] = topcv['technical_skill'].apply(to_set)

In [24]:
import csv

def load_words_from_csv(csv_file):
    words_set = set()
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        for row in reader:
            if row:  
                word = row[0].strip()
                words_set.add(word)
    return words_set

# Load words from CSV into a set
csv_file_path = r'..\techcical_list.csv'  
words_set = load_words_from_csv(csv_file_path)

def filter_words(word_list):
    return [word for word in word_list if word in words_set]

In [25]:
words_set = load_words_from_csv(csv_file_path)

def fill_again(skill_list):
    filled_list = [word for word in skill_list if word in words_set]
    return filled_list 

In [26]:
topcv['technical_skill'] = topcv['technical_skill'].apply(fill_again)

In [27]:
topcv['technical_skill']

0       [html, css, postgresql, mongodb, javascript, j...
1       [git, react, agile, nextjs, ui, reactjs, ci, v...
2       [typescript, figma, api, github, ui, reactjs, ...
3                                                 [unity]
4                         [typescript, unity, javascript]
                              ...                        
1840    [html, css, mvc, nosql, web, mongodb, javascri...
1841    [css, html, mvc, nosql, web, mongodb, javascri...
1842    [html, css, mvc, nosql, web, mongodb, javascri...
1843    [git, linux, elasticsearch, integration, postg...
1844          [sql, agile, pl, oracle, java, mysql, cmmi]
Name: technical_skill, Length: 1845, dtype: object

In [29]:
def convert_salary(x):
    if x == 'The deal': 
        return 'Negotiable'
    else:
        return x

topcv['salary'] = topcv['salary'].apply(convert_salary)

In [30]:
topcv.head()

Unnamed: 0,title,company,requirement,salary,technical_skill,location,working_time,image_url,job_url
0,Developer - Zalo,Công ty TNHH Phân Phối Trực Tuyến Nhất Nhất,['Graduate from College or higher in Informati...,Negotiable,"[html, css, postgresql, mongodb, javascript, j...","- Hanoi: 8th Floor, Dong Loi Building, No. 2 L...",Fulltime,https://cdn-new.topcv.vn/unsafe/80x/https://st...,https://www.topcv.vn/viec-lam/lap-trinh-vien-d...
1,"Front-End Developer, Salary from 25-50 Million...",TNHH APPTIFACT,"[""At least 4 years of experience in web develo...",25 - 50 million,"[git, react, agile, nextjs, ui, reactjs, ci, v...","- Ho Chi Minh: Sarina Block A Building, Street...",Fulltime,https://www.topcv.vn/v4/image/normal-company/l...,https://www.topcv.vn/viec-lam/front-end-develo...
2,Front-End Developer 3 Years of Experience (Sal...,CÔNG TY TNHH TIMIND,"[""Bachelor's Degree (or above) in IT or relate...",20 - 30 million,"[typescript, figma, api, github, ui, reactjs, ...",#NAME?,Fulltime,https://cdn-new.topcv.vn/unsafe/80x/https://st...,https://www.topcv.vn/viec-lam/front-end-develo...
3,Game Programming - Unity Developer (Intern/Fre...,Công ty Cổ phần Công nghệ và Sáng tạo Rocket,['Fresher requires a minimum of 6 months of ex...,Up to 12 million,[unity],"- Hanoi: 8th floor, Gold Tower, 275 Nguyen Tra...",Fulltime,https://cdn-new.topcv.vn/unsafe/80x/https://st...,https://www.topcv.vn/viec-lam/lap-trinh-game-u...
4,Game Developer (Cocos/ Unity) - Salary Upto 35...,XIPAT FLEXIBLE SOLUTIONS COMPANY LIMITED,['Experience in developing mobile games prefer...,Up to 35 million,"[typescript, unity, javascript]","- Hanoi: November 18, Thai Ha, Dong Da",Fulltime,https://cdn-new.topcv.vn/unsafe/80x/https://st...,https://www.topcv.vn/viec-lam/game-developer-c...


In [32]:
topcv.to_csv('clean_topcv.csv', index = False)

In [2]:
import pandas as pd 
df = pd.read_csv('clean_topcv.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1845 entries, 0 to 1844
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            1845 non-null   object
 1   company          1845 non-null   object
 2   requirement      1845 non-null   object
 3   salary           1845 non-null   object
 4   technical_skill  1845 non-null   object
 5   location         1845 non-null   object
 6   working_time     1845 non-null   object
 7   image_url        1845 non-null   object
 8   job_url          1845 non-null   object
dtypes: object(9)
memory usage: 129.9+ KB
