In [1]:
import json
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Load the job postings, then replace missing values in the categorical
# company columns with an explicit "Unknown" label so these columns contain
# no NaN when they are encoded in a later cell.
df = pd.read_csv("../../../../data/ds-jobs-ner.csv")

for col in ("sector", "size", "type_of_ownership", "state"):
    df[col] = df[col].fillna("Unknown")

# Quick sanity check: dimensions, column names and the first three rows.
print(f"Shape : {df.shape}")
print(f"Colonnes : {list(df.columns)}")
df.head(3)

Shape : (672, 19)
Colonnes : ['id', 'job_title_clean', 'job_description_clean', 'salary_estimate', 'salary_avg_k', 'rating', 'company_name', 'location', 'state', 'size', 'size_encoded', 'founded', 'company_age', 'type_of_ownership', 'industry', 'sector', 'revenue', 'competitors', 'skills_extracted']


Unnamed: 0,id,job_title_clean,job_description_clean,salary_estimate,salary_avg_k,rating,company_name,location,state,size,size_encoded,founded,company_age,type_of_ownership,industry,sector,revenue,competitors,skills_extracted
0,1,Senior Data Scientist,The Senior Data Scientist is responsible for d...,$137K-$171K (Glassdoor est.),154.0,3.1,Healthfirst,"New York, NY",NY,1001 to 5000 employees,5.0,1993.0,32.0,Nonprofit Organization,Insurance Carriers,Insurance,,"EmblemHealth, UnitedHealth Group, Aetna","[""AWS"", ""Azure"", ""Azure Healthcare Industry"", ..."
1,2,Data Scientist,"Secure our Nation, Ignite your Future Join th...",$137K-$171K (Glassdoor est.),154.0,4.2,ManTech,"Chantilly, VA",VA,5001 to 10000 employees,6.0,1968.0,57.0,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),,"[""Active Duty"", ""Armed Forces Services Medal"",..."
2,3,Data Scientist,Overview Analysis Group is one of the larges...,$137K-$171K (Glassdoor est.),154.0,3.8,Analysis Group,"Boston, MA",MA,1001 to 5000 employees,5.0,1981.0,44.0,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),,"[""AWS"", ""Agir"", ""Analysis Group"", ""C++"", ""Coll..."


In [3]:
# TF-IDF features for the job descriptions: English stop words removed,
# unigrams and bigrams, vocabulary capped at 200 terms.
tfidf_desc = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    max_features=200,
)
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split performed further down, so test descriptions contribute to
# the vocabulary and IDF weights. Consider fitting on the training rows only.
X_desc = tfidf_desc.fit_transform(df["job_description_clean"])

desc_terms = tfidf_desc.get_feature_names_out()
print(f"Shape X_desc : {X_desc.shape}")
print("Exemple de mots captur√©s (20 premiers) :")
print(desc_terms[:20])

Shape X_desc : (672, 200)
Exemple de mots captur√©s (20 premiers) :
['ability' 'able' 'advanced' 'ai' 'algorithms' 'analysis' 'analytic'
 'analytical' 'analytics' 'analyze' 'applicants' 'application'
 'applications' 'applied' 'apply' 'bachelor' 'based' 'benefits' 'best'
 'big']


In [4]:
# TF-IDF features for the job titles: same settings as the description
# vectorizer but with a smaller vocabulary (50 terms).
tfidf_title = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    max_features=50,
)
# NOTE(review): fitted on all rows of df, before the train/test split done
# further down. Consider fitting on the training rows only.
X_title = tfidf_title.fit_transform(df["job_title_clean"])

title_terms = tfidf_title.get_feature_names_out()
print(f"Shape X_title : {X_title.shape}")
print("Mots du titre captur√©s :")
print(title_terms)

Shape X_title : (672, 50)
Mots du titre captur√©s :
['ai' 'analyst' 'analytics' 'applied' 'area' 'associate' 'bay' 'bay area'
 'business' 'business intelligence' 'computer' 'computer scientist' 'data'
 'data analyst' 'data engineer' 'data science' 'data scientist' 'engineer'
 'image' 'intelligence' 'intelligence analyst' 'lead' 'learning'
 'learning engineer' 'learning scientist' 'machine' 'machine learning'
 'manager' 'principal' 'principal data' 'required' 'research'
 'research computer' 'sci' 'science' 'scientist' 'scientist image'
 'scientist machine' 'scientist research' 'scientist signal'
 'scientist ts' 'senior' 'senior data' 'senior machine' 'signal'
 'software' 'software engineer' 'staff' 'ts' 'ts sci']


In [5]:
# Categorical company attributes that are turned into integer codes.
CAT_COLS = ["sector", "size", "type_of_ownership", "state"]

# Each column gets its own integer coding; categories not seen during fit
# are mapped to -1 at transform time instead of raising.
encoder = OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")
cat_codes = encoder.fit_transform(df[CAT_COLS])
# Stored as sparse so it can be stacked with the TF-IDF matrices.
X_cat = csr_matrix(cat_codes)

print(f"Shape X_cat : {X_cat.shape}")
print("\nExemple encodage 'sector' :")
# categories_[0] belongs to the first entry of CAT_COLS ("sector"); the
# position of a category in that array is its encoded value.
sector_categories = encoder.categories_[0]
mapping = {label: code for code, label in enumerate(sector_categories)}
for label, code in list(mapping.items())[:8]:
    print(f"  {label:<30} ‚Üí {code}")

Shape X_cat : (672, 4)

Exemple encodage 'sector' :
  Accounting & Legal             ‚Üí 0
  Aerospace & Defense            ‚Üí 1
  Agriculture & Forestry         ‚Üí 2
  Biotech & Pharmaceuticals      ‚Üí 3
  Business Services              ‚Üí 4
  Construction, Repair & Maintenance ‚Üí 5
  Consumer Services              ‚Üí 6
  Education                      ‚Üí 7


In [6]:
# Assemble the final design matrix by placing the three feature groups side
# by side: description TF-IDF, title TF-IDF, then the encoded categoricals.
# Any list of feature names built later must follow this same column order.
X = hstack([X_desc, X_title, X_cat])
y = df["salary_avg_k"]

# Column counts are read from the matrices themselves so the summary line
# stays correct if a vectorizer's max_features or CAT_COLS is changed.
n_desc, n_title, n_cat = X_desc.shape[1], X_title.shape[1], X_cat.shape[1]

print(f"X_desc  : {X_desc.shape}")
print(f"X_title : {X_title.shape}")
print(f"X_cat   : {X_cat.shape}")
print(f"‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
print(f"X final : {X.shape}  ({n_desc} + {n_title} + {n_cat} = {X.shape[1]} features)")
print(f"y       : {y.shape}  ‚Üí salary_avg_k")
print(f"\nDistribution target :")
print(y.describe().round(1))

X_desc  : (672, 200)
X_title : (672, 50)
X_cat   : (672, 4)
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
X final : (672, 254)  (200 + 50 + 4 = 254 features)
y       : (672,)  ‚Üí salary_avg_k

Distribution target :
count    672.0
mean     123.7
std       39.6
min       43.5
25%      103.0
50%      114.0
75%      136.5
max      271.5
Name: salary_avg_k, dtype: float64


### Split train/test

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the size of each partition and its share of the full dataset.
n_rows = len(df)
n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Train : {n_train} lignes ({n_train / n_rows * 100:.0f}%)")
print(f"Test  : {n_test} lignes  ({n_test / n_rows * 100:.0f}%)")

Train : 537 lignes (80%)
Test  : 135 lignes  (20%)


### Entraînement

In [8]:
# Random-forest hyper-parameters gathered in one place: 200 trees, depth
# limited to 10, at least 5 samples per leaf, fixed seed, all CPU cores.
rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "min_samples_leaf": 5,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

print(" Mod√®le entra√Æn√© !")
print(f"   Nombre d'arbres : {model.n_estimators}")
print(f"   Profondeur max  : {model.max_depth}")

 Mod√®le entra√Æn√© !
   Nombre d'arbres : 200
   Profondeur max  : 10


### Évaluation

In [9]:
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"MAE : {mae:.1f} K$  (erreur moyenne de pr√©diction)")
print(f"R¬≤  : {r2:.3f}")

# Side-by-side table of actual vs. predicted values. The absolute error is
# computed from the prediction already rounded to one decimal.
comparison = pd.DataFrame(
    {"R√©el": y_test.to_numpy(), "Pr√©dit": y_pred.round(1)}
).assign(Erreur=lambda t: (t["Pr√©dit"] - t["R√©el"]).abs().round(1))
print("\nAper√ßu pr√©dit vs r√©el :")
comparison.head(10)

MAE : 27.2 K$  (erreur moyenne de pr√©diction)
R¬≤  : -0.099

Aper√ßu pr√©dit vs r√©el :


Unnamed: 0,R√©el,Pr√©dit,Erreur
0,134.0,117.8,16.2
1,133.0,128.1,4.9
2,43.5,118.6,75.1
3,123.5,130.5,7.0
4,120.5,126.8,6.3
5,134.0,117.8,16.2
6,185.0,133.6,51.4
7,92.5,131.5,39.0
8,89.0,109.3,20.3
9,105.0,126.4,21.4


In [10]:
# Feature names in the same column order as the stacked matrix X:
# description TF-IDF terms, then title TF-IDF terms (prefixed with "title_"),
# then the categorical columns.
desc_feats = list(tfidf_desc.get_feature_names_out())
title_feats = ["title_" + f for f in tfidf_title.get_feature_names_out()]
feat_names = desc_feats + title_feats + CAT_COLS

# One source icon per column, assigned by POSITION rather than by inspecting
# the name. A description token that happens to equal a categorical column
# name (e.g. "state" or "size") or to start with "title_" would otherwise be
# shown with the wrong icon.
feat_sources = (
    ["üìÑ"] * len(desc_feats)
    + ["üìù"] * len(title_feats)
    + ["üè∑Ô∏è"] * len(CAT_COLS)
)

importances = pd.Series(model.feature_importances_, index=feat_names)

# Rank on a positional index so every selected feature can be traced back to
# its column (and icon) even when two features share the same name.
top_positions = (
    importances.reset_index(drop=True)
    .sort_values(ascending=False)
    .head(20)
    .index
    .tolist()
)
top20 = importances.iloc[top_positions]

print("=== TOP 20 FEATURES ===")
for pos, (feat, score) in zip(top_positions, top20.items()):
    src = feat_sources[pos]
    # Text bar whose length is proportional to the importance score.
    bar = "‚ñà" * int(score * 300)
    print(f"  {src} {feat:<28} {bar} {score:.4f}")

=== TOP 20 FEATURES ===
  üìÑ years                        ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 0.0375
  üìÑ development                  ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 0.0365
  üìÑ skills                       ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 0.0235
  üìÑ include                      ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 0.0230
  üìÑ analytic                     ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 0.0224
  üìÑ applied                      ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 0.0220
  üìÑ support                      ‚ñà‚ñà‚ñà‚ñà‚ñà 0.0184
  üìÑ medical                      ‚ñà‚ñà‚ñà‚ñà‚ñà 0.0182
  üìÑ high                         ‚ñà‚ñà‚ñà‚ñà 0.0166
  üìÑ develop                      ‚ñà‚ñà‚ñà‚ñà 0.0156
  üìÑ science                      ‚ñà‚ñà‚ñà‚ñà 0.0141
  üìÑ software                     ‚ñà‚ñà‚ñà 0.0128
  üìÑ using                        ‚ñà‚ñà‚ñà 0.0125
  üìÑ advanced                     ‚ñà‚ñà‚ñà 0.0116
  üìÑ statistics                   ‚ñà‚ñà‚ñà 0.0114
  üìÑ identify                     ‚ñà‚ñà‚ñà 0.0107
  üìÑ years experience