In [97]:
# importing the required libraries
import pandas as pd
from nltk.corpus import words
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from gensim.models.fasttext import FastText
from sklearn.metrics import confusion_matrix, accuracy_score

In [98]:
# importing the dataset 
data = pd.read_csv('jobs_skills.csv')
# Skills Cleaning and splitting 
stop = stopwords.words('english')
skills=pd.DataFrame(data['skills'])
data=[]
skills['skills']= skills['skills'].apply(lambda text: text.split(','))
skills['skills']= skills['skills'].apply(lambda text: [re.sub(r'[^\w\s]','', word).replace('  ',' ').strip().lower() 
                                                      for word in text])
skills['skills']= skills['skills'].apply(lambda text: [" ".join([word.lower() 
                                                     for word in sen.split() if word.lower().strip() not in stop]) 
                                                    for sen in text] )


In [99]:
# Obtaining the unique list of skills
unique_skills=[]
for i in range(len(skills)):
    for skill in skills['skills'][i]:
        if skill not in unique_skills:
            #removing the characters and the 'nan' skills
            if ( (len(skill) > 1 ) and (skill != 'nan' )):
               unique_skills.append(skill)

In [100]:
# Creating the list of not skills words using nltk
nonskills_words= words.words()
unique_nonskills = []
for i in range(len(nonskills_words)):
     #removing the characters and the stop words
    if ((len(nonskills_words[i]) > 1) and (nonskills_words[i] not in stop)):
        unique_nonskills.append(nonskills_words[i].lower())
nonskills_words =[]
# removing the words which in the unique list of skills
unique_nonskills = list(set(unique_nonskills).difference(set(unique_skills)))
# taking only the size of the unique list of skills
unique_nonskills = unique_nonskills[:len(unique_skills)]

In [101]:
# Total list (skills and not skills)
total_dataset = unique_skills + unique_nonskills
# The output corresponding to the total list
Output_vector = [1] * len(unique_skills) + [0] * len(unique_nonskills)

In [37]:
unique_nonskills=[]
unique_skills=[]
stop=[]

In [102]:

## i ran this part on colab due to memory issue and saved the model
# Fitting the FastText model to our skills corpus
embedding_size = 60
window_size = 40
min_word = 1
down_sampling = 1e-2
ft_model_not = FastText(size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=1,
                      iter=100)

# Building our Vocabulary before training
ft_model_not.build_vocab(sentences=total_dataset)

# Training the FastText model
ft_model_not.train(sentences=total_dataset, total_examples=len(total_dataset), epochs=10) 
# Saving the FastText Model:
ft_model_not.save("fasttext.model")

In [154]:
# Loading the fasttext model to vectorize the dataset
ft_model = FastText.load("fasttext2.model")
# Vectorization
dataset_vectorized=[]
for skill_word in total_dataset:
    skill_vect =  list(ft_model.wv[skill_word]) 
    dataset_vectorized.append(skill_vect) 

In [175]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(dataset_vectorized, Output_vector, test_size = 0.1, random_state = 0)

In [206]:
# Fitting Naive Bayes classifier to the Training set
from sklearn.naive_bayes import GaussianNB
Classifier_object_knn = GaussianNB()
Classifier_object_knn.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [207]:
# Fitting the random forest classifier object to the training set
from sklearn.ensemble import RandomForestClassifier
Classifier_object_random_forest = RandomForestClassifier(n_estimators = 300, criterion = 'entropy', random_state = 0)
Classifier_object_random_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [208]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
Classifier_object_logistic_regression = LogisticRegression(random_state = 0)
Classifier_object_logistic_regression.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [209]:
# Fitting SVM to the Training set
from sklearn.svm import SVC
Classifier_object_svm = SVC(kernel = 'linear', random_state = 0)
Classifier_object_svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [210]:
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
Classifier_object_dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
Classifier_object_dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [246]:
total_dataset

['sales',
 'retail',
 'real estate',
 'sales target',
 'indoor sales',
 'sales skills',
 'property sales',
 'customer service',
 'customer care',
 'admin work',
 'office management',
 'administration',
 'admin',
 'secretary',
 'microsoft office',
 'computer skills',
 'communication skills',
 'microsoft excel',
 'marketing campaigns',
 'emarketing',
 'digital marketing',
 'sem',
 'seo content',
 'social media',
 'marketing',
 'media',
 'market research',
 'seo',
 'customer support',
 'property',
 'telesales',
 'real estate sales',
 'property consultant',
 'sports',
 'entertainment',
 'sports animation',
 'hotels',
 'biling',
 'organizing',
 'printing',
 'scanning',
 'copying',
 'document control',
 'analysis',
 'business administration',
 'reporting packages',
 'data collection',
 'business analysis',
 'public relations pr',
 'presentation skills',
 'pr',
 'personnel',
 'recruitment',
 'human resources hr',
 'adobe photoshop',
 'accounting',
 'financial analysis',
 'financial management

In [259]:
dataset_vectorized_test=[]
skill_vect_test =  list(ft_model.wv["recruitment"])
dataset_vectorized_test.append(skill_vect_test) 

# Predicting the Test set results
y_pred_testt1 = Classifier_object_knn.predict(dataset_vectorized_test)

# Predicting the Test set results
y_pred_testt2 = Classifier_object_random_forest.predict(dataset_vectorized_test)

# Predicting the Test set results
y_pred_testt3 = Classifier_object_logistic_regression.predict(dataset_vectorized_test)

# Predicting the Test set results
y_pred_testt4 = Classifier_object_svm.predict(dataset_vectorized_test)

# Predicting the Test set results
y_pred_testt5 = Classifier_object_dt.predict(dataset_vectorized_test)


print(y_pred_testt1)
print(y_pred_testt2)
print(y_pred_testt3)
print(y_pred_testt4)
print(y_pred_testt5)

[0]
[1]
[1]
[1]
[1]


In [123]:
# Fitting Naive Bayes classifier to the Training set
from sklearn.naive_bayes import GaussianNB
Classifier_object = GaussianNB()
Classifier_object.fit(X_train, y_train)

# Predicting the Test set results
y_pred = Classifier_object.predict(X_test)

# Making the Confusion Matrix and accuracy
cm = confusion_matrix(y_test, y_pred)
Accuracy=accuracy_score(y_test, y_pred)
print('the Confusion Matrix:')
print(cm)
print('the Accuracy:')
print(Accuracy)     


the Confusion Matrix:
[[446 143]
 [402 171]]
the Accuracy:
0.5309810671256454


In [124]:
# Fitting K-NN classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
Classifier_object = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
Classifier_object.fit(X_train, y_train)

# Predicting the Test set results
y_pred = Classifier_object.predict(X_test)

# Making the Confusion Matrix and accuracy
cm = confusion_matrix(y_test, y_pred)
Accuracy=accuracy_score(y_test, y_pred)
print('the Confusion Matrix:')
print(cm)
print('the Accuracy:')
print(Accuracy)     

the Confusion Matrix:
[[ 79 510]
 [ 71 502]]
the Accuracy:
0.5


In [161]:
# Fitting the random forest classifier object to the training set
from sklearn.ensemble import RandomForestClassifier
Classifier_object = RandomForestClassifier(n_estimators = 300, criterion = 'entropy', random_state = 0)
Classifier_object.fit(X_train, y_train)

# Predicting the Test set results
y_pred = Classifier_object.predict(X_test)

# Making the Confusion Matrix and accuracy
cm = confusion_matrix(y_test, y_pred)
Accuracy=accuracy_score(y_test, y_pred)
print('the Confusion Matrix:')
print(cm)
print('the Accuracy:')
print(Accuracy)     


the Confusion Matrix:
[[475 114]
 [220 353]]
the Accuracy:
0.7125645438898451


In [184]:
# the following models gave better accuracy after feature scaling
# Feature scaling ( standard scaling)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [165]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
Classifier_object = LogisticRegression(random_state = 0)
Classifier_object.fit(X_train, y_train)

# Predicting the Test set results
y_pred = Classifier_object.predict(X_test)

# Making the Confusion Matrix and accuracy
cm = confusion_matrix(y_test, y_pred)
Accuracy=accuracy_score(y_test, y_pred)
print('the Confusion Matrix:')
print(cm)
print('the Accuracy:')
print(Accuracy)     

the Confusion Matrix:
[[343 246]
 [224 349]]
the Accuracy:
0.5955249569707401


In [166]:
# Fitting SVM to the Training set
from sklearn.svm import SVC
Classifier_object = SVC(kernel = 'linear', random_state = 0)
Classifier_object.fit(X_train, y_train)

# Predicting the Test set results
y_pred = Classifier_object.predict(X_test)
# Making the Confusion Matrix and accuracy
cm = confusion_matrix(y_test, y_pred)
Accuracy=accuracy_score(y_test, y_pred)
print('the Confusion Matrix:')
print(cm)
print('the Accuracy:')
print(Accuracy)     

the Confusion Matrix:
[[344 245]
 [231 342]]
the Accuracy:
0.5903614457831325


In [167]:
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
Classifier_object = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
Classifier_object.fit(X_train, y_train)

# Predicting the Test set results
y_pred = Classifier_object.predict(X_test)
# Making the Confusion Matrix and accuracy
cm = confusion_matrix(y_test, y_pred)
Accuracy=accuracy_score(y_test, y_pred)
print('the Confusion Matrix:')
print(cm)
print('the Accuracy:')
print(Accuracy)     

the Confusion Matrix:
[[347 242]
 [238 335]]
the Accuracy:
0.5869191049913941


### So we can conclude that the best models were Random forest then logistic regression and the worest model was K-nearst neighbours