# Import Needed Packages

In [17]:
from flask import Flask,  jsonify,request
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd
import string

In [18]:
original_data_loaded = pd.read_csv('jobs_skills.csv') #original data loaded
original_data_loaded.columns = ['id','title', 'jobFunction', 'industry', 'skills'] # cloumns we have

In [19]:
enhanced_data=pd.read_csv('jobs_skills.csv') #enhanced data will be saved here
enhanced_data.columns = ['id','title', 'jobFunction', 'industry', 'skills'] # cloumns we have

enhanced_data.sample(10)# discover data we have

Unnamed: 0,id,title,jobFunction,industry,skills
65204,5da16610e43fd1a7ab67b6e7,Outdoor Data Collector,['Marketing/PR/Advertising'],['Marketing and Advertising'],"['Public Relations (PR)', 'Data Collecter', 'M..."
13997,5da10693e43fd1a7ab66eee0,Technical Office Civil Engineer,['Engineering - Construction/Civil/Architecture'],['Construction - Residential & Commercial/Offi...,"['Civil Constructions', 'Civil Engineering', '..."
63596,5da162f3e43fd1a7ab67b09f,Front Desk & Admin Assistant,['Administration'],['Real Estate/Property Management'],"['Startup', 'Microsoft Word', 'Front Desk', 'R..."
43104,5da13bbbe43fd1a7ab676093,Science Teacher,"['R&D/Science', 'Education/Teaching']",['Education'],"['Education', 'American Curriculum', 'Teaching..."
95438,5da1a022e43fd1a7ab682cfd,Graphic Design/UI Developer - Internship,"['IT/Software Development', 'Engineering - Tel...",['Information Technology Services'],"['Adobe Flash', 'Adobe Indesign', 'Graphic Des..."
63711,5da1632de43fd1a7ab67b113,Warehouse Keeper,['Logistics/Supply Chain'],['Real Estate/Property Management'],"['Warehousing', 'Logistics', 'Warehouse Keeper']"
33332,5da12983e43fd1a7ab673a66,Sales & Marketing Specialist,"['Marketing/PR/Advertising', 'Sales/Retail', '...","['Internet/E-commerce', 'Marketing and Adverti...","['Online Marketing', 'Customer Care', 'Digital..."
2981,5da0f2c6e43fd1a7ab66c3d8,Nutrition Supervisor,"['Medical/Healthcare', 'Quality']",['Education'],"['Nutrition Supervising', 'Quality Assurance',..."
33424,5da129ade43fd1a7ab673ac1,Precast Manager,['Engineering - Construction/Civil/Architecture'],['Construction - Industrial Facilities and Inf...,"['Budget Planning', 'Budget planning', 'Constr..."
345,5da0ee45e43fd1a7ab66b98a,Personal Assistant,['Administration'],"['Business Supplies and Equipment', 'Real Esta...","['Startup', 'Office Management', 'Administrati..."


# Clean Data

In [20]:
ps = PorterStemmer()
lemma = WordNetLemmatizer()
# excluding 'it' from the stopwords as it has meaning in the jobs the file have like 'IT/Software'
stop= set(stopwords.words('english')) - set(['it'])

In [21]:
#removing special characters
enhanced_data['title']=enhanced_data['title'].str.encode('ascii', 'ignore').str.decode('ascii')
enhanced_data['jobFunction']=enhanced_data['jobFunction'].str.encode('ascii', 'ignore').str.decode('ascii')
enhanced_data['industry']=enhanced_data['industry'].str.encode('ascii', 'ignore').str.decode('ascii')
enhanced_data['skills']=enhanced_data['skills'].str.encode('ascii', 'ignore').str.decode('ascii')

In [22]:
#tokenization will done here too
#lower case all letters
enhanced_data['title'] = enhanced_data['title'].str.lower().str.replace(r'[^\w\s]+', ' ').str.split()
enhanced_data['jobFunction'] = enhanced_data['jobFunction'].str.lower().str.replace(r'[^\w\s]+', ' ').str.split()
enhanced_data['industry'] = enhanced_data['industry'].str.lower().str.replace(r'[^\w\s]+', ' ').str.split()
enhanced_data['skills'] = enhanced_data['skills'].str.lower().str.replace(r'[^\w\s]+', ' ').str.split()

In [23]:
#removing stop words
enhanced_data['title'] = enhanced_data['title'].apply(lambda x: [item for item in x if item not in stop])
enhanced_data['jobFunction'] = enhanced_data['jobFunction'].apply(lambda x: [item for item in x if item not in stop])
enhanced_data['industry'] = enhanced_data['industry'].apply(lambda x: [item for item in x if item not in stop])
enhanced_data['skills'] = enhanced_data['skills'].apply(lambda x: [item for item in x if item not in stop])

In [24]:
#lemmatization , return the base or dictionary from the words , which is know as lema
enhanced_data['title']= enhanced_data['title'].apply(lambda x : [lemma.lemmatize(y) for y in x])
enhanced_data['jobFunction']= enhanced_data['jobFunction'].apply(lambda x : [lemma.lemmatize(y) for y in x])
enhanced_data['industry']= enhanced_data['industry'].apply(lambda x : [lemma.lemmatize(y) for y in x])
enhanced_data['skills']= enhanced_data['skills'].apply(lambda x : [lemma.lemmatize(y) for y in x])

#Stemming , return the roots of the words and replacing the suffix, which is know as stem
#enhanced_data['title']= enhanced_data['title'].apply(lambda x : [ps.stem(y) for y in x])
#enhanced_data['jobFunction']= enhanced_data['jobFunction'].apply(lambda x : [ps.stem(y) for y in x])
#enhanced_data['industry']= enhanced_data['industry'].apply(lambda x : [ps.stem(y) for y in x])
#enhanced_data['skills']= enhanced_data['skills'].apply(lambda x : [ps.stem(y) for y in x])

In [25]:
#concatinate again all the values in each row into one string as [ux, designer] to 'ux designer'
# each row in 'title' column
counter1=0
for item in enhanced_data['title']:  
    s=""
    for i in item: 
        if i != 'nan':
            s=s+" "+i
        else:
            s=i
    enhanced_data['title'][counter1]=""
    enhanced_data['title'][counter1]=s
    counter1=counter1+1  


In [26]:

# each row in 'jobFunction' column
counter2=0
for item in enhanced_data['jobFunction']:  
    s=""
    for i in item: 
        if i != 'nan':
            s=s+" "+i
        else:
            s=i
    enhanced_data['jobFunction'][counter2]=""
    enhanced_data['jobFunction'][counter2]=s
    counter2=counter2+1

In [27]:

    
# each row in 'industry' column
counter3=0
for item in enhanced_data['industry']:  
    s=""
    for i in item: 
        if i != 'nan':
            s=s+" "+i
        else:
            s=i
    enhanced_data['industry'][counter3]=""
    enhanced_data['industry'][counter3]=s
    counter3=counter3+1

In [28]:

# each row in 'skills' column
counter4=0
for item in enhanced_data['skills']:  
    s=""
    for i in item: 
        if i != 'nan':
            s=s+" "+i
        else:
            s=i
    enhanced_data['skills'][counter4]=""
    enhanced_data['skills'][counter4]=s
    counter4=counter4+1

In [29]:
'''-------------------set Data Loaded in DataFrams--------------------'''
# old_original datafram that has the values without any enhancements
old_original_df = pd.DataFrame(original_data_loaded)

# original datafram that has the values without any enhancements
original_df = pd.DataFrame(original_data_loaded)

# old_enhanced datafram that has the values with enhancements
old_enhanced_df = pd.DataFrame(enhanced_data)

# enhanced datafram that has the values with enhancements
enhanced_df = pd.DataFrame(enhanced_data)

# enhanced datafram that has the values with enhancements
play_df = pd.DataFrame(enhanced_data)

'''-----------------Drop Missing Values in Data Fram------------------'''
# drop all rows that has missing values from the data fram original_df
original_df.dropna(axis=0, how='any',inplace=False)

# drop all rows that has missing values from the data fram enhanced_df
enhanced_df.dropna(axis=0, how='any',inplace=False)

# drop all rows that has missing values from the data fram enhanced_df
play_df.dropna(axis=0, how='any',inplace=False)

'''-----------------Get rows that have 'nan' values-------------------'''
# detect all the rows that contain 'nan' value from the enhanced enhanced_df
indexNames = enhanced_df[ (enhanced_df['title'] == 'nan') | (enhanced_df['jobFunction'] == 'nan') | (enhanced_df['industry'] == 'nan') | (enhanced_df['skills'] == 'nan') ].index

# detect all the rows that contain 'nan' value from the enhanced enhanced_df
indexNames_play = play_df[ (play_df['title'] == 'nan') | (play_df['jobFunction'] == 'nan') | (play_df['industry'] == 'nan') | (play_df['skills'] == 'nan') ].index


'''-------------Delete Rows that contains 'nan' values----------------'''
# delete all rows for which column 'title','jobFunction','industry' has value'nan' from the original_df
original_df.drop(indexNames , inplace=True)

# delete all rows for which column 'title','jobFunction','industry' has value'nan' from the enhanced_df
enhanced_df.drop(indexNames , inplace=True)

# delete all rows for which column 'title','jobFunction','industry' has value'nan' from the enhanced_df
play_df.drop(indexNames , inplace=True)

'''------------------Reset Indexies in Data Fram----------------------'''
#reset the index of the original_df
original_df=original_df.reset_index(drop=True)

#reset the index of the enhanced_df
enhanced_df=enhanced_df.reset_index(drop=True)

#reset column 'id' in original_df
original_df['id'] = original_df.index

#reset column 'id' in the enhanced_df
enhanced_df['id'] = enhanced_df.index

#reset the index of the enhanced_df
play_df=play_df.reset_index(drop=True)

#reset column 'id' in the enhanced_df
play_df['id'] = play_df.index

In [30]:
enhanced_df

Unnamed: 0,id,title,jobFunction,industry,skills
0,0,property consultant,sale retail,real estate property management,sale retail real estate sale target indoor sa...
1,1,sale representative real estate,customer service support sale retail,real estate property management,sale real estate sale target customer service...
2,2,receptionist,administration,real estate property management,admin work office management administration a...
3,3,senior property consultant,sale retail,real estate property management,sale real estate sale target computer skill c...
4,4,senior seo specialist,marketing pr advertising medium journalism pu...,real estate property management marketing adv...,marketing campaign e marketing digital market...
...,...,...,...,...,...
99510,99510,application unlimited sale representative levant,it software development sale retail,information technology service computer software,peoplesoft sale computer science outdoor sale...
99511,99511,senior netsuite solution engineer arabic,it software development engineering telecom t...,information technology service computer software,computer science saas erp telecommunication i...
99512,99512,sale representative alexandria,sale retail,education business service training coaching,customer service customer care sale skill sale
99513,99513,english instructor 6th october part time,training instructor education teaching,education,translation linguistics iqp education trainin...


# Build Training Dataset 

In [31]:
from collections import Counter 
  
def remov_duplicates(input): 
  
    # split input string separated by space 
    input = input.split(" ") 
  
    # joins two adjacent elements in iterable way 
    for i in range(0, len(input)): 
        input[i] = "".join(input[i]) 
  
    # now create dictionary using counter method 
    # which will have strings as key and their  
    # frequencies as value 
    UniqW = Counter(input) 
  
    # joins two adjacent elements in iterable way 
    s = " ".join(UniqW.keys()) 
    return s

In [32]:
all_jobtitles =""
for e in enhanced_data['title']:
    all_jobtitles=all_jobtitles+e
    all_jobtitles=all_jobtitles+" "
    
resultString_jobtitles=remov_duplicates(all_jobtitles)

all_jobfunctions =""
for e in enhanced_data['jobFunction']:
    all_jobfunctions=all_jobfunctions+e
    all_jobfunctions=all_jobfunctions+" "
    
resultString_jobfunctions=remov_duplicates(all_jobfunctions)

all_industries =""
for e in enhanced_data['industry']:
    all_industries=all_industries+e
    all_industries=all_industries+" "
    
resultString_industries=remov_duplicates(all_industries)

all_skills =""
for e in enhanced_data['skills']:
    all_skills=all_skills+e
    all_skills=all_skills+" "
    
resultString_skills=remov_duplicates(all_skills)

# Insert data in a data fram

In [33]:
# Calling DataFrame constructor 
new_df = pd.DataFrame() 
new_df["input"] = ""
new_df["skill_or_not"] = ""

data1 = resultString_jobtitles.split(" ") #split string into a list
data2 = resultString_jobfunctions.split(" ") #split string into a list
data3 = resultString_industries.split(" ") #split string into a list
data4 = resultString_skills.split(" ") #split string into a list

for temp in data1:
    new_df = new_df.append({'input': temp, 'skill_or_not': 0}, ignore_index=True)
    
for temp in data2:
    new_df = new_df.append({'input': temp, 'skill_or_not': 0}, ignore_index=True)
    
for temp in data3:
    new_df = new_df.append({'input': temp, 'skill_or_not': 0}, ignore_index=True)
    
for temp in data4:
    new_df = new_df.append({'input': temp, 'skill_or_not': 1}, ignore_index=True)

# Shuffle Training Dataset Rows 

In [34]:
#print("Dataframe Contens ", new_df, sep='\n')
from sklearn.utils import shuffle
new_df = shuffle(new_df)
new_df.reset_index(drop=True)

Unnamed: 0,input,skill_or_not
0,fundamental,1
1,qualitative,1
2,contect,1
3,ic,1
4,tooling,0
...,...,...
4952,fishing,0
4953,automated,1
4954,capa,1
4955,brc,1


# Save the Dataset into csv file for further use

In [35]:
# save the new_df in skills_not_skills1.csv file,so we don't have to reclean the data each time we want to use it again 
new_df.to_csv('skills_not_skills1.csv')

#upload enhanced_data.csv
#new_df = pd.read_csv('skills_not_skills1.csv')

# Start getting skill or not skill >>>>>>>>>>>>>>>>>

# take input user

In [None]:
user_input = input("Enter your skill : ") 
new_df = new_df.append({'input': user_input}, ignore_index=True)

In [None]:
# Import LabelEncoder
from sklearn import preprocessing
#creating labelEncoder
le = preprocessing.LabelEncoder()

# encode inputs and label

In [None]:
input_encoded=le.fit_transform(new_df["input"])
label=le.fit_transform(new_df["skill_or_not"])

#combinig weather and temp into single listof tuples
features=list(zip(input_encoded))

# train knn model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=3)

# Train the model using the training sets
model.fit(features,label)

# test knn model 

In [None]:
#Predict Output
predicted= model.predict([[input_encoded[-1]]]) # 0:Overcast, 2:Mild
print(predicted)
if (predicted==1):
    print(user_input, " is a skill")
else:
    print(user_input, " is not a skill")