# Importing Packages

In [1]:
import html
import string
from math import sqrt

import pandas as pd
from flask import Flask,  jsonify,request
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.metrics import mean_squared_error

In [2]:
# Flask application object; the recommendation endpoint further down is
# registered on it through @app.route.
app = Flask(__name__)

# Loading the jobs_data CSV file

In [3]:
# Raw postings as stored in jobs_data.csv. This frame is never cleaned, so the
# original titles / job functions stay available for display later on.
original_data_loaded = pd.read_csv('jobs_data.csv')
original_data_loaded.columns = ['id', 'title', 'jobFunction', 'industry']  # columns we have

# Working copy that the preprocessing cell cleans in place. It is copied from
# the frame already in memory instead of parsing the CSV a second time, which
# also guarantees both frames start from exactly the same rows.
enhanced_data = original_data_loaded.copy()

enhanced_data.sample(10)  # look at a random sample of the data we have

Unnamed: 0,id,title,jobFunction,industry
1976,1976,Software Business Analyst,"['Business Development', 'Analyst/Research', '...","['Information Technology Services', 'Computer ..."
9319,9319,Senior Digital Marketing Specialist,"['Marketing/PR/Advertising', 'Media/Journalism...","['Market Research', 'Real Estate/Property Mana..."
358,358,IT Administrator,"['Administration', 'IT/Software Development']","['Information Technology Services', 'Computer ..."
2494,2494,Senior iOS Developer,"['IT/Software Development', 'Engineering - Tel...","['Information Technology Services', 'Computer ..."
7973,7973,Senior Move2Root Engineer,"['IT/Software Development', 'Engineering - Tel...","['Information Technology Services', 'Telecommu..."
3737,3737,Sales Executive (Advertising),['Sales/Retail'],"['Marketing and Advertising', 'Public Relation..."
6701,6701,Marketing GE Internship - India,"['Marketing/PR/Advertising', 'Media/Journalism...","['Information Technology Services', 'Non-Profi..."
6284,6284,Receptionist - AMC,"['Administration', 'Medical/Healthcare']",['Healthcare and Medical Services']
466,466,English Co-Teacher/Heliopolis,['Education/Teaching'],"['Education', 'Youth and Volunteering']"
2171,2171,Digital Marketing Executive,"['Media/Journalism/Publishing', 'Writing/Edito...","['Market Research', 'Writing and Editing', 'Ma..."


# Pre processing

In [4]:
ps = PorterStemmer()
lemma = WordNetLemmatizer()
# 'it' is removed from the stop-word set because it carries meaning in this
# data set (e.g. 'IT/Software').
stop = set(stopwords.words('english')) - set(['it'])

# Text columns that all go through the same cleaning pipeline.
TEXT_COLUMNS = ['title', 'jobFunction', 'industry']


def _join_tokens(tokens):
    """Concatenate a token list back into one string, e.g. ['ux', 'design'] -> ' ux design'.

    Every ordinary token is appended with a single leading space, so the result
    starts with a space. A token equal to 'nan' discards whatever was
    accumulated so far and restarts the string at exactly 'nan'; a value made
    only of that token therefore becomes the bare string 'nan', which is what
    the later "drop 'nan' rows" step compares against.
    """
    joined = ""
    for token in tokens:
        if token != 'nan':
            joined = joined + " " + token
        else:
            joined = token
    return joined


for column in TEXT_COLUMNS:
    cleaned = enhanced_data[column]

    # Remove special (non-ASCII) characters.
    cleaned = cleaned.str.encode('ascii', 'ignore').str.decode('ascii')

    # Lower-case, turn punctuation into spaces and tokenize on whitespace.
    # regex=True is spelled out because newer pandas versions treat the pattern
    # as a literal string by default, which would leave all punctuation in place.
    cleaned = cleaned.str.lower().str.replace(r'[^\w\s]+', ' ', regex=True).str.split()

    # Remove stop words.
    cleaned = cleaned.apply(lambda tokens: [token for token in tokens if token not in stop])

    # Lemmatization: reduce each word to its dictionary form (lemma).
    cleaned = cleaned.apply(lambda tokens: [lemma.lemmatize(token) for token in tokens])

    # Stemming: strip suffixes so each word is reduced to its stem.
    cleaned = cleaned.apply(lambda tokens: [ps.stem(token) for token in tokens])

    # Join the tokens of every row back into one string. The whole column is
    # assigned at once; writing cell by cell through df[col][i] = value is
    # chained indexing, which pandas warns about and does not guarantee to
    # update the frame.
    enhanced_data[column] = cleaned.apply(_join_tokens)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

# Save Enhanced data in CSV file

In [5]:
# Persist the cleaned frame to enhanced_data.csv so the data does not have to
# be re-cleaned every time it is needed.
# to_csv is called with its default options, so the DataFrame index is written
# as an extra leading column of the file.
enhanced_data.to_csv('enhanced_data.csv')
# To reuse the cached file instead of re-running the preprocessing, load it back with:
#enhanced_data = pd.read_csv('enhanced_data.csv')

In [6]:
# Display the cleaned frame to check the result of the preprocessing.
enhanced_data

Unnamed: 0,id,title,jobFunction,industry
0,0,full stack php develop,engin telecom technolog it softwar develop,comput softwar market advertis
1,1,cisco collabor specialist engin,instal mainten repair it softwar develop engi...,inform technolog servic
2,2,senior back end php develop,engin telecom technolog it softwar develop,comput softwar comput network
3,3,ux design,creativ design art it softwar develop,comput softwar inform technolog servic comput...
4,4,java technic lead,engin telecom technolog it softwar develop,comput softwar inform technolog servic
...,...,...,...,...
10865,10865,odoo develop,it softwar develop engin telecom technolog,real estat properti manag manufactur
10866,10866,senior php develop alexandria,engin telecom technolog it softwar develop,inform technolog servic
10867,10867,sale account manag real estat medic assiut,busi develop sale retail custom servic support,real estat properti manag retail healthcar me...
10868,10868,technic network support engin alexandria,instal mainten repair engin telecom technolog...,comput softwar educ healthcar medic servic


# Dropping all rows that have missing values or 'nan' values

In [7]:
# ------------------- Set data loaded in DataFrames --------------------
# Wrappers around the frames as they are before any row is removed.
old_original_df = pd.DataFrame(original_data_loaded)   # values without any enhancements
old_enhanced_df = pd.DataFrame(enhanced_data)           # values with enhancements

# ------------------- Decide which rows to remove ----------------------
text_columns = ['title', 'jobFunction', 'industry']

# Rows holding a real missing value in either the raw or the cleaned data.
# (The previous dropna(..., inplace=False) calls threw their result away, so
# missing values were never actually removed.)
rows_with_missing = (
    original_data_loaded.isna().any(axis=1) | enhanced_data.isna().any(axis=1)
)

# Rows whose cleaned title / jobFunction / industry is the literal text 'nan'.
rows_with_nan_text = enhanced_data[text_columns].eq('nan').any(axis=1)

# One shared mask for every frame: original_df, enhanced_df and play_df must
# lose exactly the same rows, because later cells use a row's 'id' from one
# frame to look the posting up in another.
rows_to_drop = rows_with_missing | rows_with_nan_text

# Index labels of the removed rows (kept under their previous names).
indexNames = rows_to_drop[rows_to_drop].index
indexNames_play = indexNames


def _drop_rows_and_renumber(frame):
    """Return a copy of `frame` without the rows flagged in `rows_to_drop`.

    The index is reset to 0..n-1 and the 'id' column is overwritten with that
    new index, so 'id', index label and row position all agree afterwards.
    The input frame is left unmodified, which keeps this cell re-runnable.
    """
    kept = frame.loc[~rows_to_drop].reset_index(drop=True)
    kept['id'] = kept.index
    return kept


# --------------- Build the cleaned, re-indexed data frames ------------
original_df = _drop_rows_and_renumber(original_data_loaded)  # raw text, used for display
enhanced_df = _drop_rows_and_renumber(enhanced_data)         # cleaned text, used for TF-IDF
play_df = _drop_rows_and_renumber(enhanced_data)             # cleaned text + evaluation columns

# A quick check that the data is okay, with no missing values or 'nan' left to work with.
enhanced_df.sample(10)

Unnamed: 0,id,title,jobFunction,industry
2614,2614,back end develop laravel,creativ design art engin telecom technolog it...,market advertis internet e commerc comput sof...
9247,9247,senior creativ copywrit,write editori,health well fit inform technolog servic marke...
2114,2114,system engin,instal mainten repair engin telecom technolog...,inform technolog servic comput network secur
10113,10113,cloud autom engin,it softwar develop engin telecom technolog,inform technolog servic
2744,2744,senior web develop,engin telecom technolog it softwar develop,inform technolog servic comput softwar
8227,8227,patient account,account financ,healthcar medic servic
6656,6656,senior react nativ develop,it softwar develop engin telecom technolog,graphic design comput softwar market advertis
6749,6749,electr engin,engin mechan electr,engin servic consum electron electron semicon...
2058,2058,graphic design saudi arabia,creativ design art,photographi market advertis graphic design
272,272,back end develop internship php laravel project,it softwar develop engin telecom technolog,comput softwar


In [8]:
# Display play_df after the 'nan' rows were dropped and the index / 'id' column were reset.
play_df

Unnamed: 0,id,title,jobFunction,industry
0,0,full stack php develop,engin telecom technolog it softwar develop,comput softwar market advertis
1,1,cisco collabor specialist engin,instal mainten repair it softwar develop engi...,inform technolog servic
2,2,senior back end php develop,engin telecom technolog it softwar develop,comput softwar comput network
3,3,ux design,creativ design art it softwar develop,comput softwar inform technolog servic comput...
4,4,java technic lead,engin telecom technolog it softwar develop,comput softwar inform technolog servic
...,...,...,...,...
10748,10748,odoo develop,it softwar develop engin telecom technolog,real estat properti manag manufactur
10749,10749,senior php develop alexandria,engin telecom technolog it softwar develop,inform technolog servic
10750,10750,sale account manag real estat medic assiut,busi develop sale retail custom servic support,real estat properti manag retail healthcar me...
10751,10751,technic network support engin alexandria,instal mainten repair engin telecom technolog...,comput softwar educ healthcar medic servic


# Now our data is loaded, tokenized, stemmed, lemmatized and lowercased, with no special characters, no missing values and no 'nan' values


# Now we are ready to use our data...

# 1- Measure similarity between each JobTitle tf-idf and its JobFunction tf-idf

In [9]:
threshold = 0.05  # minimum similarity for a title to count as matching its job function


def _title_function_similarity(title, job_function):
    """Return the TF-IDF similarity between one job title and its job function.

    A fresh TfidfVectorizer is fitted on just these two texts (1st document:
    job title, 2nd document: job function), so the vocabulary and the idf
    weights come from this pair only. linear_kernel gives the dot product of
    the two TF-IDF rows; with the vectorizer's default L2 normalisation that
    dot product is the cosine similarity.
    """
    tfidf_pair = TfidfVectorizer().fit_transform([title, job_function])
    return linear_kernel(tfidf_pair[0], tfidf_pair[1])[0, 0]


# 'expected' is 1 when a job title is similar to its own job function and 0
# otherwise. The column is built in one go and assigned once: writing row by
# row through play_df['expected'][i] = ... is chained indexing, which pandas
# warns about and does not guarantee to update the frame. Columns are picked by
# name, so the result does not depend on their position in play_df.
play_df['expected'] = [
    1 if _title_function_similarity(title, job_function) > threshold else 0
    for title, job_function in zip(play_df['title'], play_df['jobFunction'])
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
# Display play_df again, now including the 'expected' column computed above.
play_df

Unnamed: 0,id,title,jobFunction,industry,expected
0,0,full stack php develop,engin telecom technolog it softwar develop,comput softwar market advertis,1
1,1,cisco collabor specialist engin,instal mainten repair it softwar develop engi...,inform technolog servic,1
2,2,senior back end php develop,engin telecom technolog it softwar develop,comput softwar comput network,1
3,3,ux design,creativ design art it softwar develop,comput softwar inform technolog servic comput...,1
4,4,java technic lead,engin telecom technolog it softwar develop,comput softwar inform technolog servic,0
...,...,...,...,...,...
10748,10748,odoo develop,it softwar develop engin telecom technolog,real estat properti manag manufactur,1
10749,10749,senior php develop alexandria,engin telecom technolog it softwar develop,inform technolog servic,1
10750,10750,sale account manag real estat medic assiut,busi develop sale retail custom servic support,real estat properti manag retail healthcar me...,1
10751,10751,technic network support engin alexandria,instal mainten repair engin telecom technolog...,comput softwar educ healthcar medic servic,1


# 2- Measure similarity between each JobTitle tf-idf and the rest of the JobTitles' tf-idf

In [11]:
# Measure the similarity between each job title and the rest of the job titles.
# The ranked neighbours stored here are what the recommendation endpoint returns
# for a user's job title.
#
# Each title is treated as one document; TF-IDF is computed over all the unique
# words of all titles.

# Number of most-similar postings kept per title (the previous slicing,
# argsort()[:-100:-1] followed by [1:], also kept 98).
MAX_SIMILAR_TITLES = 98

# Settings for the vectorizer go here.
tf = TfidfVectorizer(use_idf=True)

# Fit on all titles at once.
tfidf_matrix_for_job_title = tf.fit_transform(enhanced_df['title'])

# NOTE: this is a dense (number of titles x number of titles) matrix.
cosine_similarities_for_job_titles = linear_kernel(tfidf_matrix_for_job_title, tfidf_matrix_for_job_title)

job_ids = enhanced_df['id'].to_numpy()  # after the index reset, 'id' equals the row position

# results_job_titles[id] -> list of (similarity score, id of the similar posting),
# ordered from most to least similar.
results_job_titles = {}
for idx, job_id in enumerate(job_ids):
    scores = cosine_similarities_for_job_titles[idx]

    # One extra candidate is taken so that MAX_SIMILAR_TITLES entries remain
    # once the posting itself has been removed.
    ranked_indices = scores.argsort()[::-1][:MAX_SIMILAR_TITLES + 1]

    # Drop the posting itself by index instead of assuming it is ranked first:
    # postings with an identical title tie with it at the top score, so "skip
    # the first entry" could discard a real duplicate and leave the posting in
    # its own recommendation list.
    similar_items = [(scores[i], job_ids[i]) for i in ranked_indices if i != idx]

    results_job_titles[job_id] = similar_items[:MAX_SIMILAR_TITLES]

def get_title_from_original_original_df(id):
    """Return the 'title' stored in `original_df` for the row whose 'id' equals `id`.

    When several rows match, the first one wins; when none matches, indexing
    the empty result raises IndexError.
    """
    matching_rows = original_df.loc[original_df['id'] == id]
    titles = matching_rows['title'].tolist()
    return titles[0]

def get_jobFunction_from_original_original_df(id):
    """Return the 'jobFunction' stored in `original_df` for the row whose 'id' equals `id`.

    When several rows match, the first one wins; when none matches, indexing
    the empty result raises IndexError.
    """
    matching_rows = original_df.loc[original_df['id'] == id]
    job_functions = matching_rows['jobFunction'].tolist()
    return job_functions[0]

In [12]:
#results_job_titles[0][1][1]

# 3- Recommend the top 10 most similar Job Functions

In [None]:
# -----------------------  START RECOMMEND  ------------------------

def _accuracy_metric(actual, predicted):
    """Return the percentage (0-100) of positions where `actual` equals `predicted`.

    Returns 0.0 for empty input instead of dividing by zero.
    """
    if not actual:
        return 0.0
    correct = sum(1 for expected, guess in zip(actual, predicted) if expected == guess)
    return correct / float(len(actual)) * 100.0


@app.route('/', methods=['POST','GET'])
def recommend():
    """Serve the job-title form (GET) or the recommendations for a submitted title (POST).

    On POST the form field 'job' is matched exactly against the titles in
    `original_df`. For the first matching posting, the job functions of its
    `num` most similar postings (taken from `results_job_titles`) are returned
    as an HTML page; a short HTML notice is returned when no title matches.

    Side effects: prints the recommendations, the predicted / expected labels,
    the RMSE and the accuracy to stdout, and rewrites the 'predicted' column of
    the module-level `play_df` on every POST.
    """
    if request.method != 'POST':
        return  '''
            <form method="POST">
            Job Title <input type='text' name='job'>
            <input type="submit" value="Recommend jobs for me" >
            </form>
            '''

    num = 10  # the top 10 recommended Job Functions for the user's Job Title; change as needed
    g = request.form.get('job')

    # The lookup is done once, against the frame that holds the raw titles.
    index_of_row = original_df[original_df['title'] == g].index
    if len(index_of_row) == 0:
        return '''
            <h1> Content-Based Recommendation System </h1>
            <h3> The Syestem Does Not Have Such a job Title </h3>
            '''

    item_id = index_of_row[0]
    print("")
    print("=========================================================================================")
    print("Recommending " + str(num) + " jobs similar to " + get_title_from_original_original_df(item_id) + "...")

    list_of_all_predicted_Results = results_job_titles[item_id]  # (score, id) pairs, best first
    recs = list_of_all_predicted_Results[:num]

    recommended_jobs_list = []
    for score, job_id in recs:
        job_function = get_jobFunction_from_original_original_df(job_id)
        print("Recommended: " + job_function)
        print("(score: " + str(score) + ")")
        recommended_jobs_list.append(job_function)

    # -----------------------  Get Predicted Values  ------------------------
    # Every posting returned by the similarity search is labelled as predicted = 1.
    # .loc assigns through a single indexing operation; the chained form
    # play_df['predicted'][label] = 1 is not guaranteed to update the frame.
    similar_ids = [job_id for _, job_id in list_of_all_predicted_Results]
    play_df['predicted'] = 0
    play_df.loc[similar_ids, 'predicted'] = 1

    p_list = play_df.loc[similar_ids, 'predicted'].tolist()
    E_list = play_df.loc[similar_ids, 'expected'].tolist()

    print("")
    print(" predicted")
    print(p_list)
    print(" expected")
    print(E_list)
    print("")

    # ---------------  Evaluate Our Model ----------------------
    if E_list:  # nothing to evaluate when there are no similar postings at all
        # Error score: root mean square error of the model.
        mse = mean_squared_error(E_list, p_list)
        rmse = sqrt(mse)
        print('RMSE: %f' % rmse)
        print("")

        # Accuracy: percentage of postings whose predicted label equals the expected one.
        accuracy = _accuracy_metric(E_list, p_list)
        print('Accuracy: %f' % accuracy)

    # One <h4> line per recommendation that actually exists, so a data set with
    # fewer than `num` similar postings no longer raises IndexError. The values
    # come from the data file, so they are HTML-escaped before being embedded.
    # NOTE: the user-facing wording (including 'Tob') is kept exactly as before.
    page_lines = [
        '<h1> Content-Based Recommendation System </h1>',
        '',
        '<h3> Tob 10 recommended jobs for you... </h3>',
        '<br>',
    ]
    page_lines.extend(
        '<h4> job {} :{} </h4>'.format(rank, html.escape(str(job)))
        for rank, job in enumerate(recommended_jobs_list, start=1)
    )
    return '\n'.join(page_lines)


# Start Flask's built-in server on port 8080 when this code is executed as the
# main module (not when it is imported).
if __name__ == '__main__':
    app.run(port=8080)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8080/ (Press CTRL+C to quit)
127.0.0.1 - - [02/Jan/2020 04:32:16] "GET / HTTP/1.1" 200 -
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
127.0.0.1 - - [02/Jan/2020 04:32:19] "POST / HTTP/1.1" 200 -



Recommending 10 jobs similar to UX Designer...
Recommended: ['Creative/Design/Art', 'IT/Software Development']
(score: 0.9999999999999999)
Recommended: ['IT/Software Development', 'Creative/Design/Art']
(score: 0.9999999999999999)
Recommended: ['Creative/Design/Art', 'IT/Software Development']
(score: 0.9999999999999999)
Recommended: ['IT/Software Development', 'Creative/Design/Art']
(score: 0.9999999999999999)
Recommended: ['Creative/Design/Art', 'IT/Software Development']
(score: 0.9999999999999999)
Recommended: ['Engineering - Telecom/Technology', 'Creative/Design/Art', 'IT/Software Development']
(score: 0.9210508902483214)
Recommended: ['IT/Software Development', 'Engineering - Telecom/Technology', 'Creative/Design/Art']
(score: 0.7878785662067227)
Recommended: ['Creative/Design/Art', 'IT/Software Development']
(score: 0.7878785662067227)
Recommended: ['Creative/Design/Art']
(score: 0.7878785662067227)
Recommended: ['Creative/Design/Art', 'IT/Software Development']
(score: 0.78787

127.0.0.1 - - [02/Jan/2020 04:32:25] "GET / HTTP/1.1" 200 -
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
127.0.0.1 - - [02/Jan/2020 04:32:30] "POST / HTTP/1.1" 200 -



Recommending 10 jobs similar to PHP Full-Stack - Joomla Expert...
Recommended: ['IT/Software Development', 'Engineering - Telecom/Technology']
(score: 0.9999999999999999)
Recommended: ['IT/Software Development', 'Engineering - Telecom/Technology']
(score: 0.9999999999999999)
Recommended: ['IT/Software Development', 'Engineering - Telecom/Technology']
(score: 0.9999999999999999)
Recommended: ['IT/Software Development', 'Engineering - Telecom/Technology']
(score: 0.9999999999999999)
Recommended: ['IT/Software Development', 'Engineering - Telecom/Technology']
(score: 0.9999999999999999)
Recommended: ['Engineering - Telecom/Technology', 'IT/Software Development']
(score: 0.5698022936812447)
Recommended: ['Engineering - Telecom/Technology', 'IT/Software Development']
(score: 0.5698022936812447)
Recommended: ['IT/Software Development', 'Engineering - Telecom/Technology']
(score: 0.5698022936812447)
Recommended: ['IT/Software Development', 'Engineering - Telecom/Technology']
(score: 0.569802