In [8]:
import re, nltk, spacy, gensim, os
from bs4 import BeautifulSoup
from nltk.tokenize import ToktokTokenizer
from nltk.stem import wordnet
from nltk.corpus import stopwords
from string import punctuation
import pandas as pd
import emoji
import re
from nltk import ngrams


from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer



In [2]:
# Tokens listed here are passed through clean_punct and lemmatization unchanged.
top_tags = list()

def list_directories(path):
    """Return the names of all entries (files and directories) found in `path`."""
    return os.listdir(path)


def clean_text(text):
    ''' Lower-case text and normalise its whitespace.

    Parameters:
    text: raw text. ``None`` and float NaN (the value pandas uses for a
          missing cell) are treated as empty text; any other non-string
          value is converted with ``str`` first.

    Returns:
    str: lower-cased text in which every run of whitespace is a single
         space and leading/trailing spaces are removed.
    '''

    # Missing values would otherwise fail on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # An apostrophe directly followed by a newline / non-breaking space
    # is replaced by one space.
    text = re.sub(r"\'\n", " ", text)
    text = re.sub(r"\'\xa0", " ", text)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')
    

# Characters treated as punctuation by clean_punct (string.punctuation).
punct = punctuation
# Tokenizer instance shared by clean_punct and stopWordsRemove.
token = ToktokTokenizer()
    
def strip_list_noempty(mylist):
    """Strip surrounding whitespace from each item and drop empty strings.

    Items that have no ``strip`` method are kept as they are.
    """
    cleaned = []
    for entry in mylist:
        if hasattr(entry, 'strip'):
            entry = entry.strip()
        if entry != '':
            cleaned.append(entry)
    return cleaned
    
    
def clean_punct(text):
    ''' Remove punctuation and leading digits from every token of ``text``.

    Tokens listed in ``top_tags`` are kept exactly as written.

    Parameters:
    text (str): text to clean

    Returns:
    str: the surviving tokens joined by single spaces
    '''

    punct_pattern = re.compile('[%s]' % re.escape(punct))
    # Set membership instead of scanning the list once per token.
    protected = set(top_tags)

    cleaned = []
    for w in token.tokenize(text):
        if w in protected:
            cleaned.append(w)
        else:
            # Drop leading digits first, then every punctuation character.
            w = re.sub(r'^[0-9]+', '', w)
            cleaned.append(punct_pattern.sub('', w))

    # Tokens that were reduced to nothing are discarded before joining.
    return ' '.join(strip_list_noempty(cleaned))


# English stop words from the NLTK corpus, held in a set for membership tests.
stop_words = {*stopwords.words("english")}

def stopWordsRemove(text):
    ''' Drop every English stop word from a piece of text.

    Parameter:
    text: text whose stop words should be removed

    Returns the remaining tokens joined by single spaces.
    '''

    kept = []
    for word in token.tokenize(text):
        if word in stop_words:
            continue
        kept.append(str(word))

    return ' '.join(kept)
    
    
# spaCy pipeline used by lemmatization; the parser and NER components are disabled at load time.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

def lemmatization(texts, allowed_postags, stop_words=stop_words):
    ''' Keep the lemma (the uninflected form) of the words whose POS tag is
    allowed and drop every other word.

    Tokens listed in ``top_tags`` are always kept, as written.

    Parameters:

    texts (str): text to lemmatize; it is handed to the spaCy pipeline as a
                 single document
    allowed_postags (list): POS tags to keep, like NOUN, ADJ, VERB, ADV
    stop_words: not used by this function; kept in the signature so existing
                callers that pass it keep working

    Returns:

    str: the kept tokens joined by single spaces
    '''
    doc = nlp(texts)
    texts_out = []

    # Named `tok` so the module-level `token` tokenizer is not shadowed.
    for tok in doc:

        if str(tok) in top_tags:
            texts_out.append(str(tok))

        elif tok.pos_ in allowed_postags:
            # '-PRON-' looks like spaCy's placeholder lemma for pronouns
            # (NOTE(review): emitted by spaCy v2 only, as far as I know — confirm).
            # An empty string is appended in its place, as before.
            texts_out.append('' if tok.lemma_ == '-PRON-' else tok.lemma_)

    return ' '.join(texts_out)
    
    
def strip_emoji(text):
    """Return ``text`` with every emoji removed.

    Parameters:
    text (str): text that may contain emoji

    Returns:
    str: the text without emoji characters
    """
    # Newer `emoji` releases no longer ship get_emoji_regexp();
    # replace_emoji() is the supported way to remove emoji there.
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    # Older releases: fall back to the regular-expression helper.
    return re.sub(emoji.get_emoji_regexp(), r"", text)



def preprocess_text(df,column='description'):
    """Clean one text column of ``df`` in place and return ``df``.

    Every cell goes through, in order: clean_text, HTML tag removal,
    strip_emoji, clean_punct, stopWordsRemove and lemmatization (keeping
    tokens tagged NOUN or ADV).

    Parameters:
    df (pandas.DataFrame): frame holding the column; the column is overwritten
    column (str): name of the text column to clean

    Returns:
    pandas.DataFrame: the same ``df`` object that was passed in
    """

    def _clean_cell(raw):
        # A missing cell is cleaned as empty text instead of failing.
        text = clean_text('' if pd.isna(raw) else raw)
        # Explicit parser: without one BeautifulSoup warns on every call and
        # uses whichever parser happens to be installed.
        text = BeautifulSoup(text, 'html.parser').get_text()
        text = strip_emoji(text)
        text = clean_punct(text)
        text = stopWordsRemove(text)
        return lemmatization(text, ['NOUN', 'ADV'])

    # One pass over the column instead of six.
    df[column] = df[column].apply(_clean_cell)

    return df

In [9]:

def generate_tags(df,column='description',top_n=50):
    """Return the most frequent word n-grams (1 to 5 words) of a text column.

    Parameters:
    df (pandas.DataFrame): frame holding the text column
    column (str): name of the text column
    top_n (int): number of n-grams to return (default 50)

    Returns:
    list of (ngram, count) tuples, highest count first; n-grams with the same
    count are listed in reverse alphabetical order.
    """
    import numpy as np  # local import: the notebook's import cell does not bring in numpy

    countVec = CountVectorizer(ngram_range=(1,5))
    cv = countVec.fit_transform(df[column].str.lower())

    # get_feature_names() is absent from newer scikit-learn releases.
    if hasattr(countVec, 'get_feature_names_out'):
        cv_feature_names = countVec.get_feature_names_out()
    else:
        cv_feature_names = countVec.get_feature_names()
    cv_feature_names = [str(name) for name in cv_feature_names]

    # Column sums taken on the sparse matrix; no dense copy is built.
    feature_count = np.asarray(cv.sum(axis=0)).ravel().tolist()

    ranked = sorted(
        zip(cv_feature_names, feature_count),
        key=lambda item: (item[1], item[0]),
        reverse=True,
    )

    return ranked[:top_n]

In [10]:
# Names of the entries in the local 'data' directory; the bare name on the
# last line displays the list as the cell output.
files = list_directories('data')
files

['Communication_Disorders_Sciences_67_records.csv',
 'Neuroscience_170_records.csv',
 'Human_Resources-Personnel_Management_24_records.csv',
 'Computer_Science_71_records.csv',
 'Communication_Technologies_62_records.csv',
 'Accounting_25_records.csv',
 'Criminal_Justice___Fire_Protection_66_records.csv',
 'Mathematics_173_records.csv',
 'Computer_Programming___Data_Processing_69_records.csv',
 'Materials_Engineering_&_Materials_Science_170_records.csv',
 'Commercial_Art___Graphic_Design_69_records.csv',
 'Community___Public_Health_61_records.csv',
 'Composition___Rhetoric_65_records.csv',
 'Psychologist_75_records.csv',
 'Computer_Science_74_records.csv',
 'Mechanical_Engineering_175_records.csv',
 'Computer_Engineering_69_records.csv',
 'Marketing_&_Marketing_Research_25_records.csv',
 'Construction_Services_71_records.csv',
 'Computer_Networking___Telecommunications_67_records.csv',
 'Operations_Logistics_&_E-Commerce_172_records.csv',
 'Pre-Law_&_Legal_Studies_169_records.csv',
 'E

In [11]:
# One category label per file name: split the name on '_', drop the last two
# pieces and concatenate the rest without a separator.
category = []
for i in files:
    t = ''.join(i.split('_')[:-2])
    category.append(t)
category

['CommunicationDisordersSciences',
 'Neuroscience',
 'HumanResources-PersonnelManagement',
 'ComputerScience',
 'CommunicationTechnologies',
 'Accounting',
 'CriminalJusticeFireProtection',
 'Mathematics',
 'ComputerProgrammingDataProcessing',
 'MaterialsEngineering&MaterialsScience',
 'CommercialArtGraphicDesign',
 'CommunityPublicHealth',
 'CompositionRhetoric',
 'Psychologist',
 'ComputerScience',
 'MechanicalEngineering',
 'ComputerEngineering',
 'Marketing&MarketingResearch',
 'ConstructionServices',
 'ComputerNetworkingTelecommunications',
 'OperationsLogistics&E-Commerce',
 'Pre-Law&LegalStudies',
 'EarlyChildhoodEducation',
 'ComputerInformationSystems',
 'ClinicalPsychology',
 'Statistics&DecisionScience',
 'ManagementInformationSystems',
 'PublicAdministration',
 'ComputerManagementSecurity']

In [12]:
# For every (category, file) pair: read the CSV from 'data/', clean the
# 'description' and 'job-title' columns, then generate tags from each column.
data = []
for cat, file in zip(category,files):
    df = pd.read_csv(f"data/{file}") 

    df = preprocess_text(df,column='description')
    df = preprocess_text(df,column='job-title')
    tags1 = generate_tags(df,column='description')
    tags2 = generate_tags(df,column='job-title')
    
    # Row layout: [category, file name, description tags, job-title tags]
    data.append([ cat, file ,tags1,tags2 ])


In [13]:
# Print, for each row of `data`, the category and file name followed by the
# tags generated from the 'job-title' column (index 3 of the row).
print('###### title Based ########','\n')    
for j in data:
    print('\n',j[0],'----->',j[1],'\n')
    print(j[3],'\n')   

###### title Based ######## 


 CommunicationDisordersSciences -----> Communication_Disorders_Sciences_67_records.csv 

[('speech', 11), ('pathologist', 10), ('disorder', 10), ('communication', 9), ('language', 8), ('professor', 7), ('director', 7), ('speech language', 6), ('science', 6), ('ihp', 6), ('assistant', 6), ('speech pathologist', 5), ('program', 5), ('pathology', 5), ('faculty', 5), ('communication science', 5), ('track', 4), ('therapy', 4), ('speech language pathologist', 4), ('science disorder', 4), ('research', 4), ('language pathologist', 4), ('fulltime', 4), ('education', 4), ('director education', 4), ('talent acquisition partner', 3), ('talent acquisition', 3), ('talent', 3), ('staff', 3), ('speechlanguage', 3), ('partner', 3), ('nontenure track', 3), ('nontenure', 3), ('language pathology', 3), ('ihp staff', 3), ('health', 3), ('communication science disorder', 3), ('communication disorder', 3), ('assistant professor', 3), ('assistant ihp', 3), ('amp disorder', 3), (

In [14]:
# Print, for each row of `data`, the category and file name followed by the
# tags generated from the 'description' column (index 2 of the row).
print('###### description Based ########','\n')    
for j in data:
    print('\n',j[0],'----->',j[1],'\n')
    print(j[2],'\n')    


###### description Based ######## 


 CommunicationDisordersSciences -----> Communication_Disorders_Sciences_67_records.csv 

[('education', 219), ('student', 199), ('program', 188), ('communication', 174), ('health', 150), ('position', 146), ('experience', 132), ('job', 131), ('disorder', 130), ('science', 127), ('candidate', 125), ('research', 120), ('university', 114), ('employment', 112), ('community', 110), ('faculty', 107), ('ability', 106), ('level', 103), ('care', 102), ('work', 93), ('application', 83), ('service', 81), ('department', 78), ('patient', 72), ('information', 72), ('graduate', 72), ('communication science', 72), ('diversity', 71), ('degree', 71), ('employee', 69), ('school', 67), ('opportunity', 66), ('science disorder', 64), ('development', 64), ('communication science disorder', 64), ('skill', 62), ('status', 61), ('health care', 61), ('team', 59), ('therapy', 57), ('practice', 57), ('resource', 55), ('leadership', 55), ('function', 55), ('applicant', 55), ('yea