In [3]:
from sklearn.externals import joblib
from nltk.tokenize import word_tokenize
from pandas import DataFrame, ExcelFile, read_csv, read_excel 

In [4]:
def get_taleo_database(excel_path, sheet):
    xls = ExcelFile(excel_path, sep = ';', encoding = 'utf-8')
    df = read_excel(xls, sheet)
    return df

In [5]:
def tokenize_text(text):
    return word_tokenize(text)

In [6]:
def transform_data_to_tfidf(texts, tfidf_model):
    strs = []
    for text in texts:
        stra = tokenize_text(text)
        stra = ' '.join(stra)
        strs.append(stra)
    response = tfidf_model.transform(strs)
    return response

In [7]:
def categorize_items(line, section, section_identifier, tfidf_model):
    items = line.split(' - ')[1:]
    if items:
        items_tfidf = transform_data_to_tfidf(items, tfidf_model)
        labels = [section_identifier.predict(items_tfidf) for item in items]
        return ('$$$%%%&&&'.join([item for item, label in zip(items, labels[0]) if label == section]))
    return None

In [8]:
tfidf_model = joblib.load('tfidf_model.pkl')
section_identifier = joblib.load('Taleo_profile_extractor.pkl')

In [9]:
taleo_db = get_taleo_database('test_Taleo.xlsx', 'All')

In [10]:
output_df = DataFrame()

In [11]:
output_df['Position Title:'] = taleo_db['Requisition Title']
output_df['Department:'] = taleo_db['Job Family']
taleo_db['External: Responsibilities'] = taleo_db['External: Responsibilities'].fillna('')
output_df['responsabilities'] = taleo_db['External: Responsibilities'].apply(lambda x: '$$$%%%&&&'.join(x.strip().split('-')[1:]) if x else None)
output_df['POSITION SUMMARY:'] = taleo_db['Original Description Section - External'] 
output_df['Date Revised:  (Select today’s date)'] = taleo_db['Req. Creation Date']
taleo_db['Qualifications - External'] = taleo_db['Qualifications - External'].fillna('')
output_df['CA'] =  taleo_db['Qualifications - External'].apply(lambda x: categorize_items(x, 'CA', section_identifier, tfidf_model))
output_df['CA_ASSET'] = ''
output_df['CA_REQD'] = ''
output_df['CP'] =  taleo_db['Qualifications - External'].apply(lambda x: categorize_items(x, 'CP', section_identifier, tfidf_model))
output_df['CP_ASSET'] = ''
output_df['CP_REQD'] = ''
output_df['EI'] =  taleo_db['Qualifications - External'].apply(lambda x: categorize_items(x, 'EI', section_identifier, tfidf_model))
output_df['EI_ASSET'] = ''
output_df['EI_REQD'] = ''
output_df['ER'] =  taleo_db['Qualifications - External'].apply(lambda x: categorize_items(x, 'ER', section_identifier, tfidf_model))
output_df['ER_ASSET'] = ''
output_df['ER_REQD'] = ''
output_df['TS'] =  taleo_db['Qualifications - External'].apply(lambda x: categorize_items(x, 'TS', section_identifier, tfidf_model))
output_df['TS_ASSET'] = ''
output_df['TS_REQD'] = ''
output_df['Location (city/site):'] = ''
output_df['Reports To (Position):'] = ''
output_df['Safety-Sensitive:'] = ''
output_df['Title (and #) of Direct Reports:'] = ''
output_df['competency_group'] = ''
output_df['Bending/Crouching:'] = ''
output_df['Climbing:'] = ''
output_df['Driving:'] = ''
output_df['Keyboarding:'] = ''
output_df['Kneeling/Crawling:'] = ''
output_df['On-call:'] = ''
output_df['Operating equipment:'] = ''
output_df['Sedentary/Sitting:'] = ''
output_df['Shift work:'] = ''
output_df['Travel:'] = ''
output_df['Walking:'] = ''
output_df['Manual tools:'] = ''
output_df['Office equipment:'] = ''
output_df['Pneumatic tools:'] = ''
output_df['Power tools:'] = ''
output_df['Shop tools:'] = ''
output_df['Tool belt worn:'] = ''
output_df['Vibration tools:'] = ''
output_df['Welding:'] = ''
output_df[' Extreme heat/cold:'] = ''
output_df['Chemicals:'] = ''
output_df['Confined spaces:'] = ''
output_df['Heights:'] = ''
output_df['Moving equipment:'] = ''
output_df['Night time:'] = ''
output_df['Noise:'] = ''
output_df['Outdoors:'] = ''
output_df['Rotating equipment:'] = ''
output_df['Toxic gases:'] = ''
output_df['Uneven surfaces:'] = ''
output_df['Wet or damp:'] = ''
output_df['Light 11-20 pounds:'] = ''
output_df['Medium 21-50 pounds:'] = ''
output_df['Sedentary 0-10 pounds:'] = ''
output_df['Carrying:'] = ''
output_df['Lifting:'] = ''
output_df['Pushing/pulling:'] = ''
output_df['Standing:'] = ''

  sorted(inconsistent))


In [10]:
output_df.to_csv('raw_database.csv')

In [11]:
output_df.columns

Index(['Position Title:', 'Department:', 'responsabilities',
       'POSITION SUMMARY:', 'Date Revised:  (Select today’s date)', 'CA',
       'CA_ASSET', 'CA_REQD', 'CP', 'CP_ASSET', 'CP_REQD', 'EI', 'EI_ASSET',
       'EI_REQD', 'ER', 'ER_ASSET', 'ER_REQD', 'TS', 'TS_ASSET', 'TS_REQD',
       'Location (city/site):', 'Reports To (Position):', 'Safety-Sensitive:',
       'Title (and #) of Direct Reports:', 'competency_group',
       'Bending/Crouching:', 'Climbing:', 'Driving:', 'Keyboarding:',
       'Kneeling/Crawling:', 'On-call:', 'Operating equipment:',
       'Sedentary/Sitting:', 'Shift work:', 'Travel:', 'Walking:',
       'Manual tools:', 'Office equipment:', 'Pneumatic tools:',
       'Power tools:', 'Shop tools:', 'Tool belt worn:', 'Vibration tools:',
       'Welding:', ' Extreme heat/cold:', 'Chemicals:', 'Confined spaces:',
       'Heights:', 'Moving equipment:', 'Night time:', 'Noise:', 'Outdoors:',
       'Rotating equipment:', 'Toxic gases:', 'Uneven surfaces:',
      

In [1]:
from csvprofiles_struct import jp_to_struct
from pandas import read_excel
taleo_db = read_excel('test_Taleo.xlsx', 'All')

In [2]:

convertor = jp_to_struct()
print(convertor.to_struct(taleo_db))


AttributeError: module '__main__' has no attribute 'tokenize_text'