In [11]:
import json
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [12]:
df = pd.read_csv("../../../../data/ds-jobs-clean.csv")

# Remplir nulls
df["sector"]            = df["sector"].fillna("Unknown")
df["size"]              = df["size"].fillna("Unknown")
df["type_of_ownership"] = df["type_of_ownership"].fillna("Unknown")
df["state"]             = df["state"].fillna("Unknown")

print(f"Shape : {df.shape}")
print(f"Colonnes : {df.columns.tolist()}")
df.head(3)

Shape : (631, 18)
Colonnes : ['job_title_clean', 'job_description_clean', 'salary_estimate', 'salary_avg_k', 'rating', 'company_name', 'location', 'state', 'size', 'size_encoded', 'founded', 'company_age', 'type_of_ownership', 'industry', 'sector', 'revenue', 'competitors', 'extracted_skills']


Unnamed: 0,job_title_clean,job_description_clean,salary_estimate,salary_avg_k,rating,company_name,location,state,size,size_encoded,founded,company_age,type_of_ownership,industry,sector,revenue,competitors,extracted_skills
0,Senior Data Scientist,The Senior Data Scientist is responsible for d...,$137K-$171K (Glassdoor est.),154.0,3.1,Healthfirst,"New York, NY",NY,1001 to 5000 employees,5.0,1993.0,32.0,Nonprofit Organization,Insurance Carriers,Insurance,,"EmblemHealth, UnitedHealth Group, Aetna","['forecast', 'data', 'technologies', 'multivar..."
1,Data Scientist,"Secure our Nation, Ignite your Future Join th...",$137K-$171K (Glassdoor est.),154.0,4.2,ManTech,"Chantilly, VA",VA,5001 to 10000 employees,6.0,1968.0,57.0,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),,"['math', 'technologies', 'code', 'distributed'..."
2,Data Scientist,Overview Analysis Group is one of the larges...,$137K-$171K (Glassdoor est.),154.0,3.8,Analysis Group,"Boston, MA",MA,1001 to 5000 employees,5.0,1981.0,44.0,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),,"['written', 'collaborating', 'recherche', 'd3)..."


In [13]:
tfidf_desc = TfidfVectorizer(
    max_features=200,
    stop_words="english",
    ngram_range=(1, 2),
)
X_desc = tfidf_desc.fit_transform(df["job_description_clean"])

print(f"Shape X_desc : {X_desc.shape}")
print(f"Exemple de mots captur√©s (20 premiers) :")
print(tfidf_desc.get_feature_names_out()[:20])

Shape X_desc : (631, 200)
Exemple de mots captur√©s (20 premiers) :
['ability' 'able' 'advanced' 'ai' 'algorithms' 'analysis' 'analytic'
 'analytical' 'analytics' 'analyze' 'applicants' 'application'
 'applications' 'applied' 'apply' 'bachelor' 'based' 'benefits' 'best'
 'big']


In [14]:
tfidf_title = TfidfVectorizer(
    max_features=50,
    stop_words="english",
    ngram_range=(1, 2),
)
X_title = tfidf_title.fit_transform(df["job_title_clean"])

print(f"Shape X_title : {X_title.shape}")
print(f"Mots du titre captur√©s :")
print(tfidf_title.get_feature_names_out())

Shape X_title : (631, 50)
Mots du titre captur√©s :
['ai' 'analyst' 'analytical' 'analytics' 'applied' 'area' 'associate'
 'bay' 'bay area' 'business' 'computer' 'computer scientist' 'data'
 'data analyst' 'data engineer' 'data modeler' 'data science'
 'data scientist' 'engineer' 'health' 'image' 'intelligence'
 'intelligence analyst' 'lead' 'learning' 'learning engineer'
 'learning scientist' 'machine' 'machine learning' 'manager' 'modeler'
 'patient' 'principal' 'principal data' 'required' 'research' 'sci'
 'science' 'scientist' 'scientist image' 'scientist machine'
 'scientist ts' 'senior' 'senior data' 'senior machine' 'software'
 'software engineer' 'staff' 'ts' 'ts sci']


In [15]:
CAT_COLS = ["sector", "size", "type_of_ownership", "state"]

encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
X_cat = csr_matrix(encoder.fit_transform(df[CAT_COLS]))

print(f"Shape X_cat : {X_cat.shape}")
print(f"\nExemple encodage 'sector' :")
mapping = dict(zip(encoder.categories_[0], range(len(encoder.categories_[0]))))
for k, v in list(mapping.items())[:8]:
    print(f"  {k:<30} ‚Üí {v}")

Shape X_cat : (631, 4)

Exemple encodage 'sector' :
  Accounting & Legal             ‚Üí 0
  Aerospace & Defense            ‚Üí 1
  Agriculture & Forestry         ‚Üí 2
  Biotech & Pharmaceuticals      ‚Üí 3
  Business Services              ‚Üí 4
  Construction, Repair & Maintenance ‚Üí 5
  Consumer Services              ‚Üí 6
  Education                      ‚Üí 7


In [16]:
X = hstack([X_desc, X_title, X_cat])
y = df["salary_avg_k"]

print(f"X_desc  : {X_desc.shape}")
print(f"X_title : {X_title.shape}")
print(f"X_cat   : {X_cat.shape}")
print(f"‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
print(f"X final : {X.shape}  (200 + 50 + 4 = 254 features)")
print(f"y       : {y.shape}  ‚Üí salary_avg_k")
print(f"\nDistribution target :")
print(y.describe().round(1))

X_desc  : (631, 200)
X_title : (631, 50)
X_cat   : (631, 4)
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
X final : (631, 254)  (200 + 50 + 4 = 254 features)
y       : (631,)  ‚Üí salary_avg_k

Distribution target :
count    631.0
mean     121.3
std       27.0
min       76.5
25%      103.0
50%      114.0
75%      136.0
max      185.0
Name: salary_avg_k, dtype: float64


### Split train/test

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Train : {X_train.shape[0]} lignes ({X_train.shape[0]/len(df)*100:.0f}%)")
print(f"Test  : {X_test.shape[0]} lignes  ({X_test.shape[0]/len(df)*100:.0f}%)")

Train : 504 lignes (80%)
Test  : 127 lignes  (20%)


### Entra√Ænement

In [18]:
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train)

print(" Mod√®le entra√Æn√© !")
print(f"   Nombre d'arbres : {model.n_estimators}")
print(f"   Profondeur max  : {model.max_depth}")

 Mod√®le entra√Æn√© !
   Nombre d'arbres : 200
   Profondeur max  : 10


### √âvaluation

In [19]:
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2  = r2_score(y_test, y_pred)

print(f"MAE : {mae:.1f} K$  (erreur moyenne de pr√©diction)")
print(f"R¬≤  : {r2:.3f}")

# Visualiser pr√©dit vs r√©el
comparison = pd.DataFrame({"R√©el": y_test.values, "Pr√©dit": y_pred.round(1)})
comparison["Erreur"] = (comparison["Pr√©dit"] - comparison["R√©el"]).abs().round(1)
print(f"\nAper√ßu pr√©dit vs r√©el :")
comparison.head(10)

MAE : 21.8 K$  (erreur moyenne de pr√©diction)
R¬≤  : -0.015

Aper√ßu pr√©dit vs r√©el :


Unnamed: 0,R√©el,Pr√©dit,Erreur
0,164.5,112.2,52.3
1,120.5,120.9,0.4
2,92.5,108.5,16.0
3,99.5,109.8,10.3
4,89.0,121.7,32.7
5,97.0,125.0,28.0
6,133.0,119.4,13.6
7,105.0,115.0,10.0
8,148.0,131.5,16.5
9,133.0,127.9,5.1


In [20]:
feat_names = (
    list(tfidf_desc.get_feature_names_out()) +
    ["title_" + f for f in tfidf_title.get_feature_names_out()] +
    CAT_COLS
)

importances = pd.Series(model.feature_importances_, index=feat_names)
top20 = importances.sort_values(ascending=False).head(20)

print("=== TOP 20 FEATURES ===")
for feat, score in top20.items():
    src = "üìù" if feat.startswith("title_") else ("üè∑Ô∏è" if feat in CAT_COLS else "üìÑ")
    bar = "‚ñà" * int(score * 300)
    print(f"  {src} {feat:<28} {bar} {score:.4f}")

=== TOP 20 FEATURES ===
  üìÑ using                        ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 0.0406
  üìÑ algorithms                   ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 0.0304
  üìÑ business                     ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà 0.0231
  üìÑ skills                       ‚ñà‚ñà‚ñà‚ñà‚ñà 0.0193
  üìÑ data                         ‚ñà‚ñà‚ñà‚ñà 0.0163
  üìÑ technology                   ‚ñà‚ñà‚ñà‚ñà 0.0162
  üìÑ years                        ‚ñà‚ñà‚ñà‚ñà 0.0152
  üìÑ experience                   ‚ñà‚ñà‚ñà‚ñà 0.0151
  üìÑ building                     ‚ñà‚ñà‚ñà‚ñà 0.0136
  üìÑ applied                      ‚ñà‚ñà‚ñà 0.0124
  üìÑ data analysis                ‚ñà‚ñà‚ñà 0.0117
  üìÑ identify                     ‚ñà‚ñà‚ñà 0.0109
  üìÑ python                       ‚ñà‚ñà‚ñà 0.0108
  üìÑ advanced                     ‚ñà‚ñà‚ñà 0.0106
  üìÑ work                         ‚ñà‚ñà‚ñà 0.0105
  üìÑ engineers                    ‚ñà‚ñà‚ñà 0.0104
  üìÑ solutions                    ‚ñà‚ñà‚ñà 0.0103
