In [18]:
import re, nltk, spacy, gensim, os
from bs4 import BeautifulSoup
from nltk.tokenize import ToktokTokenizer
from nltk.stem import wordnet
from nltk.corpus import stopwords
from string import punctuation
import pandas as pd
import emoji
import re
from nltk import ngrams
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer


In [19]:
def list_directories(path):
    """Return the names of all entries (files and sub-directories) inside `path`."""
    return os.listdir(path)

In [111]:
# Tokens listed here are passed through clean_punct() and lemmatization()
# unchanged (no punctuation stripping, no POS filtering). Empty by default.
top_tags = []

# NOTE: same function as the one defined in the previous cell; this later
# definition is the one in effect once this cell has run.
def list_directories(path):
    """Give back the entry names (files and folders alike) found directly under `path`."""
    entries = os.listdir(path)
    return entries


def clean_text(text):
    ''' Lower-case a text and normalise its whitespace.

    Steps, in order:
      1. lower-case the text;
      2. replace an apostrophe immediately followed by a newline or by a
         non-breaking space with a single space;
      3. collapse every run of whitespace into one space;
      4. strip leading / trailing spaces.

    Parameters:
    text: value to clean. Missing values (None or float NaN, as pandas
          produces for empty cells) yield ''; any other non-string value
          is converted with str() first.

    Returns:
    str: the cleaned text
    '''
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r"\'\n", " ", text)
    text = re.sub(r"\'\xa0", " ", text)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')
    

# Shared tokenizer used by clean_punct() and stopWordsRemove() to split text into tokens.
token = ToktokTokenizer()
# Characters from string.punctuation; clean_punct() removes each of them from tokens.
punct = punctuation
    
def strip_list_noempty(mylist):
    """Strip every item that supports `.strip()` and drop the ones equal to ''.

    Items without a `strip` attribute are kept as they are.
    """
    kept = []
    for entry in mylist:
        cleaned = entry.strip() if hasattr(entry, 'strip') else entry
        if cleaned != '':
            kept.append(cleaned)
    return kept
    
    
def clean_punct(text):
    ''' Remove punctuation and leading digits from every token of a text.

    The text is split with the module-level tokenizer `token`. Tokens found
    in `top_tags` are kept untouched. For every other token, a leading run
    of digits is dropped and each character listed in `punct` is removed.
    Tokens that end up empty are discarded; the rest are joined with single
    spaces.

    Parameters:
    text (str): text to clean

    Returns:
    str: the cleaned tokens joined by single spaces
    '''
    punct_pattern = re.compile('[%s]' % re.escape(punct))
    # `+` instead of `*`: the old pattern also matched the empty string and
    # injected a space in front of every token, which was only stripped again
    # afterwards. The final output is the same.
    leading_digits = re.compile(r'^[0-9]+')

    cleaned_tokens = []
    for word in token.tokenize(text):
        if word in top_tags:
            cleaned_tokens.append(word)
        else:
            word = leading_digits.sub('', word)
            cleaned_tokens.append(punct_pattern.sub('', word))

    return ' '.join(map(str, strip_list_noempty(cleaned_tokens)))


# English stop words from NLTK, held in a set for fast membership tests in
# stopWordsRemove(). NOTE: stopwords.words() needs the NLTK 'stopwords'
# corpus to be available locally.
stop_words = set(stopwords.words("english"))

def stopWordsRemove(text):
    ''' Drop every English stop word from a text
    Parameter:
    text: text to filter; it is split with the module-level tokenizer
    Returns:
    the remaining tokens joined by single spaces
    '''
    kept = []
    for word in token.tokenize(text):
        if word in stop_words:
            continue
        kept.append(word)

    return ' '.join(str(word) for word in kept)
    
    
# spaCy English pipeline with the parser and NER components disabled;
# lemmatization() only reads each token's pos_ and lemma_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags, stop_words=stop_words):
    ''' Keep the lemma (uninflected form) of the words whose part-of-speech
    tag is allowed, and drop every other word.

    Tokens whose text appears in `top_tags` are always kept verbatim,
    whatever their POS tag.

    Parameters:

    texts (str): text to lemmatize; it is passed as-is to the spaCy pipeline
    allowed_postags (list): POS tags to keep, like NOUN, ADJ, VERB, ADV
    stop_words: accepted for backward compatibility; not used

    Returns:

    str: the kept tokens / lemmas joined by single spaces
    '''
    kept = []

    for tok in nlp(texts):
        word = str(tok)

        if word in top_tags:
            kept.append(word)

        elif tok.pos_ in allowed_postags:
            # '-PRON-' is the placeholder lemma spaCy 2.x assigns to pronouns.
            # As before, an empty string is emitted in its place, so the
            # result can contain a doubled space at that position.
            kept.append(tok.lemma_ if tok.lemma_ != '-PRON-' else '')

    return ' '.join(kept)
    
    
def strip_emoji(text):
    """Return `text` with every emoji removed.

    Uses `emoji.replace_emoji` when the installed `emoji` package provides
    it, because `emoji.get_emoji_regexp` no longer exists in emoji >= 2.0.
    Falls back to the regexp-based removal on older releases.

    Parameters:
    text (str): text that may contain emoji

    Returns:
    str: the text without emoji
    """
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    # Older emoji releases without replace_emoji().
    return re.sub(emoji.get_emoji_regexp(), r"", text)



def preprocess_text(df,column='description'):
    """Run the full text-cleaning pipeline on one column of `df`.

    Steps, in order: clean_text -> HTML tag removal (BeautifulSoup) ->
    strip_emoji -> clean_punct -> stopWordsRemove -> lemmatization keeping
    only NOUN and ADV tokens.

    Parameters:
    df (pandas.DataFrame): frame holding the text column. It is modified in
        place: `df[column]` is overwritten with the processed text.
    column (str): name of the column to process

    Returns:
    pandas.DataFrame: the same `df` object that was passed in
    """
    # Empty cells are read by pandas as NaN, which would break the string
    # operations below; treat them as empty text.
    processed = df[column].fillna('').astype(str)

    steps = (
        clean_text,
        # Explicit parser: avoids BeautifulSoup's "no parser specified"
        # warning and keeps results independent of which parsers are installed.
        lambda markup: BeautifulSoup(markup, 'html.parser').get_text(),
        strip_emoji,
        clean_punct,
        stopWordsRemove,
        lambda text: lemmatization(text, ['NOUN', 'ADV']),
    )
    for step in steps:
        processed = processed.apply(step)

    df[column] = processed
    return df

def generate_tags(df,column='description'):
    """Fit a TF-IDF model (1- to 5-grams) on one text column.

    Parameters:
    df (pandas.DataFrame): frame holding the text column
    column (str): name of the column to vectorize (lower-cased before fitting)

    Returns:
    list: `[tags_df, vectorizer]` where `tags_df` is a dense DataFrame with
        one row per n-gram (index = n-gram text) and one column per document
        (columns 0..n_docs-1), and `vectorizer` is the fitted TfidfVectorizer.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1,5))
    tfidf = vectorizer.fit_transform(df[column].str.lower())

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Transpose so that rows are n-grams and columns are documents.
    tags_df = pd.DataFrame(tfidf.T.toarray(), index=feature_names)

    return [tags_df,vectorizer]


def get_sim(df,query,vectorizer,temp,max_docs=15):
    """Rank documents by cosine similarity to a query.

    Parameters:
    df (pandas.DataFrame): term-by-document matrix, one row per feature and
        one column per document (as returned by generate_tags)
    query (str): query text
    vectorizer: fitted vectorizer; `vectorizer.transform([query]).toarray()`
        must give one value per row of `df`
    temp: unused, kept for backward compatibility
    max_docs (int or None): only the first `max_docs` columns of `df` are
        scored (default 15, the previously hard-coded limit); None scores
        every column. The limit is clamped to the number of columns, so
        frames with fewer documents no longer raise a KeyError.

    Returns:
    list: `[column_label, similarity]` pairs with similarity > 0, sorted by
        decreasing similarity. Documents or queries with a zero vector are
        skipped.
    """
    query_vector = vectorizer.transform([query]).toarray().reshape(df.shape[0],)
    query_norm = np.linalg.norm(query_vector)
    if query_norm == 0.0:
        # The query shares no feature with the vocabulary: nothing can match.
        return []

    n_docs = df.shape[1] if max_docs is None else min(max_docs, df.shape[1])

    data = []
    for pos in range(n_docs):
        doc_vector = df.iloc[:, pos].to_numpy()
        doc_norm = np.linalg.norm(doc_vector)
        if doc_norm == 0.0:
            continue
        # Parentheses matter: the earlier `dot / norm_a * norm_b` multiplied
        # by the query norm instead of dividing by it.
        similarity = np.dot(doc_vector, query_vector) / (doc_norm * query_norm)
        if similarity > 0.0:
            data.append([df.columns[pos], similarity])

    data.sort(key=lambda pair: pair[1], reverse=True)
    return data
    

In [112]:
# Every entry of the 'data' folder is treated as one dataset file.
files = list_directories('data')
# Category label per file: drop the last two "_"-separated parts of the file
# name and concatenate what is left (no separator).
category = [''.join(name.split('_')[:-2]) for name in files]


In [121]:
query = 'engineering'
print('############# query: ',query,'\n')

# For each data file: clean the 'job-title' column, fit a TF-IDF model on it
# and report the file when get_sim() returns at least one match for the query.
for cat, file in zip(category, files):
    df = pd.read_csv(f"data/{file}")
    df = preprocess_text(df, column='job-title')
    tags_df, vectorizer = generate_tags(df, column='job-title')

    # One-column frame holding the n-gram labels of tags_df.
    temp = tags_df.reset_index()[['index']]

    result = get_sim(tags_df, query, vectorizer, temp)
    if result:
        print('matched Folder: ',file )



############# query:  engineering 

matched Folder:  Neuroscience_170_records.csv
matched Folder:  Computer_Science_71_records.csv
matched Folder:  Materials_Engineering_&_Materials_Science_170_records.csv
matched Folder:  Mechanical_Engineering_175_records.csv
matched Folder:  Computer_Engineering_69_records.csv
matched Folder:  Management_Information_Systems_24_records.csv


In [120]:
query = 'science'
print('############# query: ',query,'\n')

# For each data file: clean the 'job-title' column, fit a TF-IDF model on it
# and report the file when get_sim() returns at least one match for the query.
for cat, file in zip(category, files):
    df = pd.read_csv(f"data/{file}")
    df = preprocess_text(df, column='job-title')
    tags_df, vectorizer = generate_tags(df, column='job-title')

    # One-column frame holding the n-gram labels of tags_df.
    temp = tags_df.reset_index()[['index']]

    result = get_sim(tags_df, query, vectorizer, temp)
    if result:
        print('matched Folder: ',file )



############# query:  science 

matched Folder:  Communication_Disorders_Sciences_67_records.csv
matched Folder:  Computer_Science_71_records.csv
matched Folder:  Mathematics_173_records.csv
matched Folder:  Materials_Engineering_&_Materials_Science_170_records.csv
matched Folder:  Computer___Information_Systems_69_records.csv
