In [1]:
import pandas as pd 
import re

In [2]:
# Load the raw NodeFlair job listings from the CSV that sits next to this notebook.
nodeflair = pd.read_csv(filepath_or_buffer='nodeflair_data.csv')

In [3]:
# Inspect the raw technical_skill column before any cleaning is applied.
nodeflair['technical_skill']

0       ['Mid', 'Container', 'Next.js', 'CI', 'Modular...
1                        ['Senior', 'R', 'SQL', 'Python']
2       ['Junior', 'AWS', 'C++', 'Analytics', 'Storm',...
3               ['Senior', 'Strategy', 'MySQL', 'Python']
4       ['Mid', 'Docker', 'OOP', 'Git', 'Spark', 'SQL'...
                              ...                        
1037    ['Junior', 'Java', 'HTML', 'JavaScript', 'Reac...
1038                    ['Junior', 'QC', 'QA', 'Testing']
1039                                       ['Mid', 'TDD']
1040    ['Mid', 'Senior', 'Lead', 'Powershell', 'C++',...
1041      ['Mid', 'Senior', 'Lead', 'Android, 'IOT', 'C']
Name: technical_skill, Length: 1042, dtype: object

In [4]:
# Treat two postings as the same job when both the company and the raw skill
# string match; keep the first occurrence. Reassigning (instead of
# inplace=True) keeps the step explicit and chainable. The original index
# labels are preserved, so the index will have gaps afterwards.
nodeflair = nodeflair.drop_duplicates(subset=['company', 'technical_skill'], keep='first')

In [5]:
# Summarise the frame after de-duplication: row count, columns, non-null counts and dtypes.
nodeflair.info()

<class 'pandas.core.frame.DataFrame'>
Index: 938 entries, 0 to 1041
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            938 non-null    object
 1   company          938 non-null    object
 2   technical_skill  938 non-null    object
 3   image_url        938 non-null    object
 4   job_url          938 non-null    object
dtypes: object(5)
memory usage: 44.0+ KB


In [6]:
def remove_special_chars(skill_list):
    """Strip punctuation from every entry of ``skill_list``.

    Only ASCII letters, digits, ``+``, ``#`` and spaces are kept, so names
    such as ``C++`` and ``C#`` survive intact. Every other character
    (quotes, brackets, ``.``, ``/``, ``-`` ...) is deleted, e.g.
    ``"'Next.js'"`` becomes ``"Nextjs"``. Each result is then trimmed of
    leading and trailing whitespace.

    Parameters
    ----------
    skill_list : iterable of str
        Raw skill tokens.

    Returns
    -------
    list of str
        One cleaned string per input token, in the original order.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9+# ]')
    cleaned = []
    for raw_token in skill_list:
        cleaned.append(disallowed.sub('', raw_token).strip())
    return cleaned

def split_by_slash(skill_list):
    """Break every entry of ``skill_list`` apart on ``/`` and flatten the result.

    An entry without a slash is passed through unchanged; an entry such as
    ``"CI/CD"`` contributes two items (``"CI"`` and ``"CD"``). Pieces are
    not trimmed, and empty pieces (e.g. from a trailing slash) are kept.

    Parameters
    ----------
    skill_list : iterable of str

    Returns
    -------
    list of str
        The flattened pieces, in their original order.
    """
    return [piece for entry in skill_list for piece in entry.split('/')]

def lowercase(skill_list):
    """Return a new list holding the lower-cased form of every skill.

    Parameters
    ----------
    skill_list : iterable of str

    Returns
    -------
    list of str
        Same length and order as the input; the input is not modified.
    """
    return [entry.lower() for entry in skill_list]

# Seniority / level labels that appear inside the scraped skill lists but are
# not technical skills. Entries are expected to be lower-case.
sub_set = ['senior', 'junior', 'lead', 'mid', 'intern']


def remove_specific_word(skill_list):
    """Drop every entry of ``skill_list`` that is one of the labels in ``sub_set``.

    The comparison ignores case, so ``'Senior'`` and ``'LEAD'`` are removed
    just like ``'senior'`` and ``'lead'``; the function therefore no longer
    relies on ``lowercase`` having been applied first. Entries that are kept
    are returned exactly as given (their case is not changed).

    Parameters
    ----------
    skill_list : iterable of str

    Returns
    -------
    list of str
        The remaining entries, in their original order.
    """
    # Built on each call so that later edits to ``sub_set`` are honoured.
    excluded = {word.lower() for word in sub_set}
    return [skill for skill in skill_list if skill.lower() not in excluded]

    

In [7]:
def clean_technical_skill(raw_skills):
    """Turn one ``technical_skill`` cell into a clean list of skill names.

    Steps, in order: split the raw string on ``', '``, split combined
    entries on ``/``, strip punctuation, lower-case, remove seniority
    labels, and drop tokens that ended up empty.

    Parameters
    ----------
    raw_skills : str or iterable of str
        The raw cell value (a string formatted like a Python list), or a
        list that has already been cleaned.

    Returns
    -------
    list of str
    """
    # Accept an already-cleaned list as well, so re-running this cell does not
    # fail with AttributeError on ``list.split``.
    if isinstance(raw_skills, str):
        tokens = raw_skills.split(', ')
    else:
        tokens = list(raw_skills)

    # Split on '/' BEFORE stripping punctuation: remove_special_chars deletes
    # '/' itself, so running it first left split_by_slash with nothing to
    # split (e.g. 'CI/CD' collapsed to 'cicd' instead of 'ci' and 'cd').
    tokens = split_by_slash(tokens)
    tokens = remove_special_chars(tokens)
    tokens = lowercase(tokens)
    tokens = remove_specific_word(tokens)

    # Punctuation-only tokens and stray slashes leave empty strings behind.
    return [token for token in tokens if token]


nodeflair['technical_skill'] = nodeflair['technical_skill'].apply(clean_technical_skill)

In [8]:
# Inspect technical_skill again now that the cleaning steps have run.
nodeflair['technical_skill'] 

0       [container, nextjs, ci, modular, scss, webpack...
1                                        [r, sql, python]
2       [aws, c++, analytics, storm, aurora, bigquery,...
3                               [strategy, mysql, python]
4                  [docker, oop, git, spark, sql, python]
                              ...                        
1037    [java, html, javascript, react, django, python...
1038                                    [qc, qa, testing]
1039                                                [tdd]
1040    [powershell, c++, qt, matlab, git, linux, c, n...
1041                                    [android, iot, c]
Name: technical_skill, Length: 938, dtype: object

In [9]:
# Preview the first rows with the cleaned technical_skill column.
nodeflair.head()

Unnamed: 0,title,company,technical_skill,image_url,job_url
0,"Frontend Developer (ReactJS, Typescript)",Renesas Electronics,"[container, nextjs, ci, modular, scss, webpack...",https://nodeflair.com/api/v2/companies/715.png,https://nodeflair.com/jobs/renesas-electronics...
1,Senior Data Scientist – ALM,VPBank,"[r, sql, python]",https://nodeflair.com/api/v2/companies/9859.png,https://nodeflair.com/jobs/vpbank-senior-data-...
2,Data Engineer,qode.world,"[aws, c++, analytics, storm, aurora, bigquery,...",https://nodeflair.com/api/v2/companies/22132.png,https://nodeflair.com/jobs/qode-world-data-eng...
3,Senior Software Engineer - Data,qode.world,"[strategy, mysql, python]",https://nodeflair.com/api/v2/companies/22132.png,https://nodeflair.com/jobs/qode-world-senior-s...
4,Data Scientist - Lending Model,Aboitiz Data Innovation,"[docker, oop, git, spark, sql, python]",https://nodeflair.com/api/v2/companies/6176.png,https://nodeflair.com/jobs/aboitiz-data-innova...


In [10]:
# Constant-valued columns to add, as (position, column name, value). The
# positions are applied one after another, so each one refers to the column
# layout produced by the inserts before it.
placeholder_columns = [
    (2, 'requirement', None),
    (3, 'salary', 'Negotiable'),
    (5, 'location', 'Vietnam'),
    (6, 'working_time', 'Full-time'),
]

for position, column, default in placeholder_columns:
    # DataFrame.insert raises ValueError when the column already exists, so
    # skip columns that are present to keep this cell safe to re-run.
    if column not in nodeflair.columns:
        nodeflair.insert(loc=position, column=column, value=default)

In [11]:
# Preview the first rows to check the newly inserted columns and their order.
nodeflair.head()

Unnamed: 0,title,company,requirement,salary,technical_skill,location,working_time,image_url,job_url
0,"Frontend Developer (ReactJS, Typescript)",Renesas Electronics,,Negotiable,"[container, nextjs, ci, modular, scss, webpack...",Vietnam,Full-time,https://nodeflair.com/api/v2/companies/715.png,https://nodeflair.com/jobs/renesas-electronics...
1,Senior Data Scientist – ALM,VPBank,,Negotiable,"[r, sql, python]",Vietnam,Full-time,https://nodeflair.com/api/v2/companies/9859.png,https://nodeflair.com/jobs/vpbank-senior-data-...
2,Data Engineer,qode.world,,Negotiable,"[aws, c++, analytics, storm, aurora, bigquery,...",Vietnam,Full-time,https://nodeflair.com/api/v2/companies/22132.png,https://nodeflair.com/jobs/qode-world-data-eng...
3,Senior Software Engineer - Data,qode.world,,Negotiable,"[strategy, mysql, python]",Vietnam,Full-time,https://nodeflair.com/api/v2/companies/22132.png,https://nodeflair.com/jobs/qode-world-senior-s...
4,Data Scientist - Lending Model,Aboitiz Data Innovation,,Negotiable,"[docker, oop, git, spark, sql, python]",Vietnam,Full-time,https://nodeflair.com/api/v2/companies/6176.png,https://nodeflair.com/jobs/aboitiz-data-innova...


In [12]:
# Persist the cleaned listings. index=False leaves the DataFrame index (which
# has gaps after de-duplication) out of the file.
nodeflair.to_csv(path_or_buf='clean_nodeflair.csv', index=False)