In [43]:
# Import the standard libraries (numpy, pandas, ...)
import pandas as pd
import numpy as np
import re
import sklearn as sk
import seaborn as sb
from matplotlib import pyplot as plt

from sklearn.impute import SimpleImputer

import collections

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import learning_curve

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.svm import SVC 
from sklearn.preprocessing import StandardScaler

from catboost import CatBoostClassifier, Pool

import streamlit as st


# Apply the 'ggplot' style sheet to the matplotlib figures drawn in this notebook.
plt.style.use('ggplot')

In [2]:
# Load the semicolon-separated file 'dataset_train_test.csv'.
# `data` keeps the raw frame untouched; every transformation below is applied to the copy `df`.
data = pd.read_csv('dataset_train_test.csv', sep=';')
df = data.copy()

In [3]:
# 'Experience' may use a decimal comma ("1,5"): turn it into a decimal point and cast to float.
# Missing values survive the round trip as NaN.
df['Experience'] = (
    df['Experience']
    .astype(str)
    .str.replace(",", ".", regex=False)
    .astype(float)
)

In [4]:
# Normalise every spelling of the master's degree to 'Master'.
# The result is assigned back to the column: calling replace(inplace=True) on
# `df['Diplome']` acts on an intermediate Series and is not guaranteed to
# modify `df` (it has no effect under pandas Copy-on-Write).
df['Diplome'] = df['Diplome'].replace(['Master','MSc','MASTER','master','msc','Mastere'], 'Master')

In [5]:
# Normalise every spelling of the bachelor's degree to 'Bachelor'.
# Assigned back rather than replaced in place on the selected column, because
# the in-place form works on an intermediate Series and may leave `df` unchanged.
df['Diplome'] = df['Diplome'].replace(['Bachelor','BSc','bachelor'], 'Bachelor')

In [6]:
# Normalise every spelling of the doctorate to 'PhD'.
# Assigned back rather than replaced in place on the selected column, because
# the in-place form works on an intermediate Series and may leave `df` unchanged.
df['Diplome'] = df['Diplome'].replace(['PhD','phd','Phd'], 'PhD')

In [7]:
# Normalise the "no diploma" spellings to 'No diploma'.
# Assigned back rather than replaced in place on the selected column, because
# the in-place form works on an intermediate Series and may leave `df` unchanged.
df['Diplome'] = df['Diplome'].replace(['No diploma','NO'], 'No diploma')

In [8]:
# Sanity check: list the distinct diploma labels left after normalisation.
pd.unique(df['Diplome'])

array(['Master', 'PhD', 'Bachelor', 'No diploma'], dtype=object)

In [9]:
# NOTE(review): despite the "Median" in both names, the imputer is configured
# with strategy='mean', so missing values are filled with the group mean.
# Names and strategy are kept unchanged here; confirm which statistic is intended.
imputerMedian = SimpleImputer(strategy='mean', missing_values=np.nan)
def imputationMedian(job):
    """Fill the missing 'Experience' values of one job title.

    Selects the rows of the notebook-level ``df`` whose 'Metier' equals
    ``job`` and replaces their missing 'Experience' values with the value
    computed by ``imputerMedian`` on that same group. ``df`` is modified
    in place.

    Parameters
    ----------
    job
        Value compared against ``df['Metier']`` to select the group.

    Returns
    -------
    The notebook-level ``df`` (same object, after modification).
    """
    # Build the row mask once and reuse it for both the read and the write.
    rows = df['Metier'] == job
    experience = df.loc[rows, ['Experience']]

    # Nothing can be learned from a group with no rows or with only missing
    # values; fitting the imputer on it would make the assignment below fail,
    # so such a group is left untouched.
    if not experience['Experience'].notna().any():
        return df

    # The imputer returns a single-column 2-D result; flatten it so it lines up
    # with the one column being assigned.
    df.loc[rows, 'Experience'] = np.ravel(imputerMedian.fit_transform(experience))
    return df

In [10]:
# Impute 'Experience' one job title at a time.
# Missing job titles are skipped: comparing 'Metier' with NaN selects no row,
# so there would be nothing for the imputer to fit on.
for job in df['Metier'].dropna().unique():
    imputationMedian(job)

In [11]:
def _experience_label(years):
    """Return the seniority label matching a number of years of experience."""
    if years < 2:
        return 'debutant'
    if years < 5:
        return 'Confirme'
    if years < 8:
        return 'Avance'
    # Reached for 8 years or more, and also for NaN (every comparison above is False).
    return 'Expert'


# Bucket the numeric experience into four seniority labels.
df['Exp_label'] = df['Experience'].map(_experience_label)

In [12]:
# Work on an independent copy so the feature engineering below leaves `df` untouched.
df_new = df.copy(deep=True)

In [13]:
# Split the '/'-separated 'Technologies' string into one column per technology;
# rows with fewer technologies are padded with missing values.
technos = df_new['Technologies'].str.split(pat='/', expand=True)
technos

Unnamed: 0,0,1,2,3,4,5,6
0,Matlab,Python,Pyspark,Scikit-learn,Tensorflow,,
1,Python,Java,Scala,MongoDB,,,
2,SPSS,SQL,Teradata,R,Python,Tensorflow,scikit-learn
3,C,C++,Java,Python,,,
4,Matlab,Python,C++,numpy,Tensorflow,scikit-learn,
...,...,...,...,...,...,...,...
9557,C++,R,Python,Matlab,Matplotlib,,
9558,Excel,Python,Matlab,R,machine learning,anglais,
9559,R,machine learning,Excel,VBA,,,
9560,Python,Pyspark,machine learning,Microsoft Azure,,,


In [14]:
# Add one 'technosN' column per technology slot.
# The column names are derived from the width of the split instead of being
# hard-coded, so the assignment no longer fails when a row holds more (or the
# data holds fewer) technologies than the seven slots originally assumed.
tech_split = df_new['Technologies'].str.split('/', expand=True)
tech_split.columns = [f'technos{slot + 1}' for slot in range(tech_split.shape[1])]
df_new[tech_split.columns.tolist()] = tech_split

In [15]:
# The raw 'Technologies' string is redundant once it has been split into columns.
df_new = df_new.drop(columns=['Technologies'])

In [16]:
# Flatten the split technologies into a single Python list (one entry per mention).
technos_array = technos.stack().to_list()

In [17]:
# Order the mentions from the most to the least frequent technology.
# Frequencies are tallied once with a Counter; using list.count as the sort key
# rescans the whole list for every element (quadratic). The sort is stable and
# the keys are the same, so the resulting order is unchanged.
techno_counts = collections.Counter(technos_array)
sorted_technos = sorted(technos_array, key=techno_counts.__getitem__, reverse=True)

In [18]:
# Keep the first occurrence of each technology: dict keys preserve insertion
# order, so the most-frequent-first ordering is retained.
# NOTE(review): labels are compared verbatim, so spellings that differ only by
# case (and the empty string) count as distinct technologies.
output = list(dict.fromkeys(sorted_technos))
print(len(output))
output

61


['Python',
 'R',
 'SQL',
 'Java',
 'Hadoop',
 'Excel',
 'C++',
 'Spark',
 'Linux',
 'MongoDB',
 'VBA',
 'machine learning',
 'SAS',
 'Docker',
 'Matlab',
 'Hive',
 'Tensorflow',
 'Elasticsearch',
 'Big data',
 'C',
 'Kibana',
 'AWS',
 'GNU',
 'Teradata',
 '',
 'scikit-learn',
 'Machine learning',
 'Anglais',
 'Pyspark',
 'HDFS',
 'SPSS',
 'Microsoft Azure',
 'MariaDB',
 'Scikit-learn',
 'Scala',
 'Cassandra',
 'Matplotlib',
 'AI',
 'Scoring',
 'Vertica',
 'Tableau',
 'Kafka',
 'Pycharm',
 'NoSQL',
 'Deep learning',
 'PostgreSQL',
 'Yarn',
 'Map-Reduce',
 'PIG',
 'HBASE',
 'PySpark',
 'anglais',
 'numpy',
 'Redshift',
 'NoSQ',
 'Hadoop(HDFS)',
 'ORACLE',
 'MySQL',
 'Windows',
 'Perl',
 'Ruby']

In [19]:
# Replace every remaining missing value, in all columns, by the placeholder "No more".
df_new = df_new.fillna("No more")

In [20]:
# Display the frame after the missing values have been filled.
df_new

Unnamed: 0,Entreprise,Metier,Diplome,Experience,Ville,Exp_label,technos1,technos2,technos3,technos4,technos5,technos6,technos7
0,Sanofi,Data scientist,Master,1.000000,Paris,debutant,Matlab,Python,Pyspark,Scikit-learn,Tensorflow,No more,No more
1,Massachusetts General Hospital(MGH),Data architecte,Master,3.000000,Marseille,Confirme,Python,Java,Scala,MongoDB,No more,No more,No more
2,No more,Lead data scientist,Master,3.000000,Nantes,Confirme,SPSS,SQL,Teradata,R,Python,Tensorflow,scikit-learn
3,Ann & Robert H. Lurie Children’s Hospital of C...,Data scientist,Master,1.500000,Marseille,debutant,C,C++,Java,Python,No more,No more,No more
4,No more,Data scientist,PhD,2.255155,Bordeaux,Confirme,Matlab,Python,C++,numpy,Tensorflow,scikit-learn,No more
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9557,No more,Data scientist,Master,1.500000,Lyon,debutant,C++,R,Python,Matlab,Matplotlib,No more,No more
9558,Charles River Analytics,Data scientist,Master,11.000000,Toulouse,Expert,Excel,Python,Matlab,R,machine learning,anglais,No more
9559,Ball Aerospace,Data scientist,Master,1.000000,Paris,debutant,R,machine learning,Excel,VBA,No more,No more,No more
9560,No more,Data scientist,Master,17.000000,Rennes,Expert,Python,Pyspark,machine learning,Microsoft Azure,No more,No more,No more


In [21]:
# Remove the columns that are not used as features: the company name and the
# numeric experience (already represented by 'Exp_label').
df_new = df_new.drop(columns=['Entreprise', 'Experience'])

In [22]:
# Display the frame that will be split into features and target.
df_new

Unnamed: 0,Metier,Diplome,Ville,Exp_label,technos1,technos2,technos3,technos4,technos5,technos6,technos7
0,Data scientist,Master,Paris,debutant,Matlab,Python,Pyspark,Scikit-learn,Tensorflow,No more,No more
1,Data architecte,Master,Marseille,Confirme,Python,Java,Scala,MongoDB,No more,No more,No more
2,Lead data scientist,Master,Nantes,Confirme,SPSS,SQL,Teradata,R,Python,Tensorflow,scikit-learn
3,Data scientist,Master,Marseille,debutant,C,C++,Java,Python,No more,No more,No more
4,Data scientist,PhD,Bordeaux,Confirme,Matlab,Python,C++,numpy,Tensorflow,scikit-learn,No more
...,...,...,...,...,...,...,...,...,...,...,...
9557,Data scientist,Master,Lyon,debutant,C++,R,Python,Matlab,Matplotlib,No more,No more
9558,Data scientist,Master,Toulouse,Expert,Excel,Python,Matlab,R,machine learning,anglais,No more
9559,Data scientist,Master,Paris,debutant,R,machine learning,Excel,VBA,No more,No more,No more
9560,Data scientist,Master,Rennes,Expert,Python,Pyspark,machine learning,Microsoft Azure,No more,No more,No more


In [36]:
# Hyper-parameters of the multi-class CatBoost model, gathered in one place so
# they are easy to read and tweak.
cb_params = {
    'n_estimators': 200,
    'loss_function': 'MultiClass',
    'learning_rate': 0.4,
    'depth': 3,
    'task_type': 'CPU',
    'random_state': 1,
    'verbose': True,
}
cb = CatBoostClassifier(**cb_params)

In [25]:
# 'Metier' (the job title) is the target; every other column is a feature.
y_df = df_new['Metier']
X_df = df_new.drop(columns=['Metier'])

In [26]:
# Hold out 30% of the rows for validation.
# The split is seeded so that a fresh run of the notebook reproduces the same
# partition, and therefore the same metrics; the model itself is already seeded.
X_train, X_val, y_train, y_val = train_test_split(X_df, y_df, test_size = 0.3, random_state = 1)

In [28]:
# Training pool: features plus labels, with every feature column declared categorical.
train_cat_features = ['Diplome', 'Ville', 'Exp_label'] + [f'technos{slot}' for slot in range(1, 8)]
pool_train = Pool(data=X_train, label=y_train, cat_features=train_cat_features)

In [29]:
# Validation pool: features only (no labels), with the same categorical columns
# as the training pool.
val_cat_features = ['Diplome', 'Ville', 'Exp_label'] + [f'technos{slot}' for slot in range(1, 8)]
pool_val = Pool(data=X_val, cat_features=val_cat_features)

In [49]:
# Train the classifier on the training pool. The per-iteration progress lines
# below are printed because the model was created with verbose=True.
cb_model = cb.fit(pool_train)

0:	learn: 0.8082134	total: 50.6ms	remaining: 10.1s
1:	learn: 0.6259857	total: 107ms	remaining: 10.6s
2:	learn: 0.5244528	total: 163ms	remaining: 10.7s
3:	learn: 0.4547276	total: 218ms	remaining: 10.7s
4:	learn: 0.4067104	total: 269ms	remaining: 10.5s
5:	learn: 0.3816401	total: 361ms	remaining: 11.7s
6:	learn: 0.3639035	total: 437ms	remaining: 12s
7:	learn: 0.3527187	total: 526ms	remaining: 12.6s
8:	learn: 0.3396611	total: 619ms	remaining: 13.1s
9:	learn: 0.3197562	total: 685ms	remaining: 13s
10:	learn: 0.3074327	total: 746ms	remaining: 12.8s
11:	learn: 0.3005069	total: 814ms	remaining: 12.8s
12:	learn: 0.2927169	total: 868ms	remaining: 12.5s
13:	learn: 0.2913480	total: 921ms	remaining: 12.2s
14:	learn: 0.2898327	total: 982ms	remaining: 12.1s
15:	learn: 0.2877627	total: 1.04s	remaining: 12s
16:	learn: 0.2857173	total: 1.1s	remaining: 11.9s
17:	learn: 0.2834935	total: 1.16s	remaining: 11.7s
18:	learn: 0.2829591	total: 1.22s	remaining: 11.6s
19:	learn: 0.2816268	total: 1.28s	remaining: 11

In [50]:
# Predict a job title for every row of the validation pool.
y_pred = cb_model.predict(pool_val)

In [51]:
# Display the predictions; each row holds a single predicted label.
y_pred

array([['Data scientist'],
       ['Data scientist'],
       ['Data architecte'],
       ...,
       ['Data scientist'],
       ['Data scientist'],
       ['Data engineer']], dtype=object)

In [52]:
# Per-class precision / recall / F1 on the validation set.
# The report is a plain string, so it is printed to keep its column layout.
report = classification_report(y_true=y_val, y_pred=y_pred)
print(report)

                     precision    recall  f1-score   support

    Data architecte       0.98      0.98      0.98       650
      Data engineer       1.00      1.00      1.00       682
     Data scientist       0.83      0.91      0.87      1171
Lead data scientist       0.61      0.42      0.49       366

           accuracy                           0.88      2869
          macro avg       0.86      0.83      0.84      2869
       weighted avg       0.88      0.88      0.88      2869



In [53]:
# Confusion matrix on the validation set. Each row is a true class, in the
# same order as the classification report (the row sums equal the "support"
# column); each column is a predicted class.
confusion_matrix(y_val, y_pred)

array([[ 638,    0,   12,    0],
       [   0,  682,    0,    0],
       [   7,    0, 1067,   97],
       [   3,    0,  211,  152]], dtype=int64)

In [54]:
# Persist the trained model to 'cb_model.json' in the working directory.
# NOTE(review): no `format` argument is passed, so CatBoost presumably writes its
# default binary format even though the file name ends in '.json' — confirm
# against the CatBoost documentation and against whatever code loads this file
# before changing either the extension or the format.
cb_model.save_model('cb_model.json')