In [1]:
import pandas as pd
import json
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from nltk.stem import WordNetLemmatizer
import joblib
from fuzzywuzzy import fuzz
from collections import defaultdict
import mlflow
import mlflow.sklearn

# Initialize MLflow
mlflow.set_experiment("Job_Domain_Classification")

# Load dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

text_field = "role_description"

# Drop rows with missing values in the 'role_description' column
data.dropna(subset=[text_field], inplace=True)

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# lemmatizer
lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

# assign job domains based on skills
def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

# domains based on job descriptions
data["domain"] = data[text_field].apply(assign_domain)

# text cleaning
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(str(x)))

# Drop rows with empty cleaned_text
data = data[data["cleaned_text"].str.strip() != ""]

# features and target
X = data["cleaned_text"]
y = data["domain"]

# Vectorize text data using bi-grams
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

# Handle class imbalance using RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# classifiers
rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

with mlflow.start_run(run_name="Random_Forest_Classifier"):
    grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
    grid_search_rf.fit(X_train, y_train)

    # Best Random Forest model
    best_rf_classifier = grid_search_rf.best_estimator_

    # Log model and parameters
    mlflow.sklearn.log_model(best_rf_classifier, "best_rf_model")
    mlflow.log_params(grid_search_rf.best_params_)
    rf_predictions = best_rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_predictions)
    mlflow.log_metric("accuracy", rf_accuracy)
    mlflow.log_confusion_matrix("confusion_matrix", confusion_matrix(y_test, rf_predictions))
    print("Random Forest Accuracy:", rf_accuracy)

with mlflow.start_run(run_name="Gradient_Boosting_Classifier"):
    gbm_classifier.fit(X_train, y_train)
    mlflow.sklearn.log_model(gbm_classifier, "gbm_model")
    gbm_predictions = gbm_classifier.predict(X_test)
    gbm_accuracy = accuracy_score(y_test, gbm_predictions)
    mlflow.log_metric("accuracy", gbm_accuracy)
    mlflow.log_confusion_matrix("confusion_matrix", confusion_matrix(y_test, gbm_predictions))
    print("Gradient Boosting Machine Accuracy:", gbm_accuracy)

with mlflow.start_run(run_name="Support_Vector_Machine_Classifier"):
    svm_classifier.fit(X_train, y_train)
    mlflow.sklearn.log_model(svm_classifier, "svm_model")
    svm_predictions = svm_classifier.predict(X_test)
    svm_accuracy = accuracy_score(y_test, svm_predictions)
    mlflow.log_metric("accuracy", svm_accuracy)
    mlflow.log_confusion_matrix("confusion_matrix", confusion_matrix(y_test, svm_predictions))
    print("Support Vector Machine Accuracy:", svm_accuracy)

# Save the trained models and vectorizer
joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Predict domains for the original dataset
data["predicted_domain"] = best_rf_classifier.predict(vectorizer.transform(data["cleaned_text"]))

# Save the dataset with predictions to a new CSV file
data.to_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv", index=False)





ModuleNotFoundError: No module named 'mlflow'

In [3]:
pip install mlflow


Note: you may need to restart the kernel to use updated packages.




In [4]:
import pandas as pd
import json
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from nltk.stem import WordNetLemmatizer
import joblib
from fuzzywuzzy import fuzz
from collections import defaultdict
import mlflow
import mlflow.sklearn

# Initialize MLflow
mlflow.set_experiment("Job_Domain_Classification")

# Load dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

text_field = "role_description"

# Drop rows with missing values in the 'role_description' column
data.dropna(subset=[text_field], inplace=True)

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# lemmatizer
lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

# assign job domains based on skills
def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

# domains based on job descriptions
data["domain"] = data[text_field].apply(assign_domain)

# text cleaning
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(str(x)))

# Drop rows with empty cleaned_text
data = data[data["cleaned_text"].str.strip() != ""]

# features and target
X = data["cleaned_text"]
y = data["domain"]

# Vectorize text data using bi-grams
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

# Handle class imbalance using RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# classifiers
rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

with mlflow.start_run(run_name="Random_Forest_Classifier"):
    grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
    grid_search_rf.fit(X_train, y_train)

    # Best Random Forest model
    best_rf_classifier = grid_search_rf.best_estimator_

    # Log model and parameters
    mlflow.sklearn.log_model(best_rf_classifier, "best_rf_model")
    mlflow.log_params(grid_search_rf.best_params_)
    rf_predictions = best_rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_predictions)
    mlflow.log_metric("accuracy", rf_accuracy)
    mlflow.log_confusion_matrix("confusion_matrix", confusion_matrix(y_test, rf_predictions))
    print("Random Forest Accuracy:", rf_accuracy)

with mlflow.start_run(run_name="Gradient_Boosting_Classifier"):
    gbm_classifier.fit(X_train, y_train)
    mlflow.sklearn.log_model(gbm_classifier, "gbm_model")
    gbm_predictions = gbm_classifier.predict(X_test)
    gbm_accuracy = accuracy_score(y_test, gbm_predictions)
    mlflow.log_metric("accuracy", gbm_accuracy)
    mlflow.log_confusion_matrix("confusion_matrix", confusion_matrix(y_test, gbm_predictions))
    print("Gradient Boosting Machine Accuracy:", gbm_accuracy)

with mlflow.start_run(run_name="Support_Vector_Machine_Classifier"):
    svm_classifier.fit(X_train, y_train)
    mlflow.sklearn.log_model(svm_classifier, "svm_model")
    svm_predictions = svm_classifier.predict(X_test)
    svm_accuracy = accuracy_score(y_test, svm_predictions)
    mlflow.log_metric("accuracy", svm_accuracy)
    mlflow.log_confusion_matrix("confusion_matrix", confusion_matrix(y_test, svm_predictions))
    print("Support Vector Machine Accuracy:", svm_accuracy)

# Save the trained models and vectorizer
joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Predict domains for the original dataset
data["predicted_domain"] = best_rf_classifier.predict(vectorizer.transform(data["cleaned_text"]))

# Save the dataset with predictions to a new CSV file
data.to_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv", index=False)



2024/06/14 12:46:32 INFO mlflow.tracking.fluent: Experiment with name 'Job_Domain_Classification' does not exist. Creating a new experiment.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use

Fitting 5 folds for each of 36 candidates, totalling 180 fits




AttributeError: module 'mlflow' has no attribute 'log_confusion_matrix'

In [5]:
pip install matplotlib


Note: you may need to restart the kernel to use updated packages.




In [6]:
import pandas as pd
import json
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from nltk.stem import WordNetLemmatizer
import joblib
from fuzzywuzzy import fuzz
from collections import defaultdict
import mlflow
import mlflow.sklearn

# Initialize MLflow
mlflow.set_experiment("Job_Domain_Classification")

# Load dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

text_field = "role_description"

# Drop rows with missing values in the 'role_description' column
data.dropna(subset=[text_field], inplace=True)

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# lemmatizer
lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

# assign job domains based on skills
def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

# domains based on job descriptions
data["domain"] = data[text_field].apply(assign_domain)

# text cleaning
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(str(x)))

# Drop rows with empty cleaned_text
data = data[data["cleaned_text"].str.strip() != ""]

# features and target
X = data["cleaned_text"]
y = data["domain"]

# Vectorize text data using bi-grams
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

# Handle class imbalance using RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# classifiers
rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

with mlflow.start_run(run_name="Random_Forest_Classifier"):
    grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
    grid_search_rf.fit(X_train, y_train)

    # Best Random Forest model
    best_rf_classifier = grid_search_rf.best_estimator_

    # Log model and parameters
    mlflow.sklearn.log_model(best_rf_classifier, "best_rf_model")
    mlflow.log_params(grid_search_rf.best_params_)
    rf_predictions = best_rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_predictions)
    mlflow.log_metric("accuracy", rf_accuracy)
    mlflow.log_confusion_matrix("confusion_matrix", confusion_matrix(y_test, rf_predictions))
    print("Random Forest Accuracy:", rf_accuracy)

with mlflow.start_run(run_name="Gradient_Boosting_Classifier"):
    gbm_classifier.fit(X_train, y_train)
    mlflow.sklearn.log_model(gbm_classifier, "gbm_model")
    gbm_predictions = gbm_classifier.predict(X_test)
    gbm_accuracy = accuracy_score(y_test, gbm_predictions)
    mlflow.log_metric("accuracy", gbm_accuracy)
    mlflow.log_confusion_matrix("confusion_matrix", confusion_matrix(y_test, gbm_predictions))
    print("Gradient Boosting Machine Accuracy:", gbm_accuracy)

with mlflow.start_run(run_name="Support_Vector_Machine_Classifier"):
    svm_classifier.fit(X_train, y_train)
    mlflow.sklearn.log_model(svm_classifier, "svm_model")
    svm_predictions = svm_classifier.predict(X_test)
    svm_accuracy = accuracy_score(y_test, svm_predictions)
    mlflow.log_metric("accuracy", svm_accuracy)
    mlflow.log_confusion_matrix("confusion_matrix", confusion_matrix(y_test, svm_predictions))
    print("Support Vector Machine Accuracy:", svm_accuracy)

# Save the trained models and vectorizer
joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Predict domains for the original dataset
data["predicted_domain"] = best_rf_classifier.predict(vectorizer.transform(data["cleaned_text"]))

# Save the dataset with predictions to a new CSV file
data.to_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv", index=False)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 5 folds for each of 36 candidates, totalling 180 fits


AttributeError: module 'mlflow' has no attribute 'log_confusion_matrix'

In [3]:
import pandas as pd
import json
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import RandomOverSampler
from nltk.stem import WordNetLemmatizer
import joblib
from fuzzywuzzy import fuzz
from collections import defaultdict
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt

# Initialize MLflow
mlflow.set_experiment("Job_Domain_Classification")

# Load dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

text_field = "role_description"

# Drop rows with missing values in the 'role_description' column
data.dropna(subset=[text_field], inplace=True)

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# lemmatizer
lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

# assign job domains based on skills
def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

# domains based on job descriptions
data["domain"] = data[text_field].apply(assign_domain)

# text cleaning
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(str(x)))

# Drop rows with empty cleaned_text
data = data[data["cleaned_text"].str.strip() != ""]

# features and target
X = data["cleaned_text"]
y = data["domain"]

# Vectorize text data using bi-grams
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

# Handle class imbalance using RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# classifiers
rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

def log_confusion_matrix(y_true, y_pred, artifact_name):
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_rf_classifier.classes_)
    disp.plot()
    plt.savefig(artifact_name)
    plt.close()
    mlflow.log_artifact(artifact_name)

with mlflow.start_run(run_name="Random_Forest_Classifier"):
    grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
    grid_search_rf.fit(X_train, y_train)

    # Best Random Forest model
    best_rf_classifier = grid_search_rf.best_estimator_

    # Log model and parameters
    mlflow.sklearn.log_model(best_rf_classifier, "best_rf_model")
    mlflow.log_params(grid_search_rf.best_params_)
    rf_predictions = best_rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_predictions)
    mlflow.log_metric("accuracy", rf_accuracy)
    
    # Log confusion matrix as an image artifact
    log_confusion_matrix(y_test, rf_predictions, "confusion_matrix_rf.png")
    
    print("Random Forest Accuracy:", rf_accuracy)

with mlflow.start_run(run_name="Gradient_Boosting_Classifier"):
    gbm_classifier.fit(X_train, y_train)
    mlflow.sklearn.log_model(gbm_classifier, "gbm_model")
    gbm_predictions = gbm_classifier.predict(X_test)
    gbm_accuracy = accuracy_score(y_test, gbm_predictions)
    mlflow.log_metric("accuracy", gbm_accuracy)
    
    # Log confusion matrix as an image artifact
    log_confusion_matrix(y_test, gbm_predictions, "confusion_matrix_gbm.png")
    
    print("Gradient Boosting Machine Accuracy:", gbm_accuracy)

with mlflow.start_run(run_name="Support_Vector_Machine_Classifier"):
    svm_classifier.fit(X_train, y_train)
    mlflow.sklearn.log_model(svm_classifier, "svm_model")
    svm_predictions = svm_classifier.predict(X_test)
    svm_accuracy = accuracy_score(y_test, svm_predictions)
    mlflow.log_metric("accuracy", svm_accuracy)
    
    # Log confusion matrix as an image artifact
    log_confusion_matrix(y_test, svm_predictions, "confusion_matrix_svm.png")
    
    print("Support Vector Machine Accuracy:", svm_accuracy)

# Save the trained models and vectorizer
joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Predict domains for the original dataset
data["predicted_domain"] = best_rf_classifier.predict(vectorizer.transform(data["cleaned_text"]))

# Save the dataset with predictions to a new CSV file
data.to_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv", index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a r

Fitting 5 folds for each of 36 candidates, totalling 180 fits




Random Forest Accuracy: 0.9945054945054945
Gradient Boosting Machine Accuracy: 0.978021978021978
Support Vector Machine Accuracy: 0.9945054945054945


In [4]:
mlflow ui


SyntaxError: invalid syntax (2385425841.py, line 1)

In [5]:
!pip install mlflow





[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





In [8]:
!mlflow ui


^C


# Use of MLflow

Use of MLflow:
MLflow is used in this script to track and log experiments systematically:

Experiment Setup:

mlflow.set_experiment("Job_Domain_Classification") ensures all runs are grouped under a specific experiment.
Starting a Run:

mlflow.start_run(run_name="Random_Forest_Classifier"): Each model training is enclosed within an mlflow.start_run context. This starts a new run for logging.
Logging Models and Parameters:

mlflow.sklearn.log_model(best_rf_classifier, "best_rf_model"): Logs the trained model.
mlflow.log_params(grid_search_rf.best_params_): Logs the best hyperparameters found during grid search.
Logging Metrics:

mlflow.log_metric("accuracy", rf_accuracy): Logs the accuracy of the model on the test set.
Logging Artifacts:

mlflow.log_artifact("confusion_matrix_rf.png"): Logs the confusion matrix as an image artifact.
Benefits of Using MLflow:
Experiment Tracking:

MLflow provides a systematic way to track and organize experiments, including different runs, hyperparameters, metrics, and artifacts.
Reproducibility:

By logging all details of each run, it ensures that experiments are reproducible.
Model Management:

MLflow helps in managing different versions of models, making it easy to compare and select the best-performing models.
Visualization:

The MLflow UI provides a user-friendly interface to visualize and compare different runs, metrics, and artifacts.
By integrating MLflow into the machine learning pipeline, the code not only performs model training and evaluation but also systematically logs all relevant details, making it easier to manage and track the experiments.

In [10]:
import pandas as pd
import json
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import RandomOverSampler
from nltk.stem import WordNetLemmatizer
import joblib
from fuzzywuzzy import fuzz
from collections import defaultdict
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt

# Initialize MLflow
mlflow.set_experiment("Job_Domain_Classification")

# Load dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

text_field = "role_description"

# Drop rows with missing values in the 'role_description' column
data.dropna(subset=[text_field], inplace=True)

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# lemmatizer
lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

# assign job domains based on skills
def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

# domains based on job descriptions
data["domain"] = data[text_field].apply(assign_domain)

# text cleaning
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(str(x)))

# Drop rows with empty cleaned_text
data = data[data["cleaned_text"].str.strip() != ""]

# features and target
X = data["cleaned_text"]
y = data["domain"]

# Vectorize text data using bi-grams
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

# Handle class imbalance using RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# classifiers
rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

def log_confusion_matrix(y_true, y_pred, artifact_name):
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_rf_classifier.classes_)
    disp.plot()
    plt.savefig(artifact_name)
    plt.close()
    mlflow.log_artifact(artifact_name)

with mlflow.start_run(run_name="Random_Forest_Classifier"):
    grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
    grid_search_rf.fit(X_train, y_train)

    # Best Random Forest model
    best_rf_classifier = grid_search_rf.best_estimator_

    # Log model and parameters
    mlflow.sklearn.log_model(best_rf_classifier, "best_rf_model")
    mlflow.log_params(grid_search_rf.best_params_)
    rf_predictions = best_rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_predictions)
    mlflow.log_metric("accuracy", rf_accuracy)
    
    # Log confusion matrix as an image artifact
    log_confusion_matrix(y_test, rf_predictions, "confusion_matrix_rf.png")
    
    print("Random Forest Accuracy:", rf_accuracy)

with mlflow.start_run(run_name="Gradient_Boosting_Classifier"):
    gbm_classifier.fit(X_train, y_train)
    mlflow.sklearn.log_model(gbm_classifier, "gbm_model")
    gbm_predictions = gbm_classifier.predict(X_test)
    gbm_accuracy = accuracy_score(y_test, gbm_predictions)
    mlflow.log_metric("accuracy", gbm_accuracy)
    
    # Log confusion matrix as an image artifact
    log_confusion_matrix(y_test, gbm_predictions, "confusion_matrix_gbm.png")
    
    print("Gradient Boosting Machine Accuracy:", gbm_accuracy)

with mlflow.start_run(run_name="Support_Vector_Machine_Classifier"):
    svm_classifier.fit(X_train, y_train)
    mlflow.sklearn.log_model(svm_classifier, "svm_model")
    svm_predictions = svm_classifier.predict(X_test)
    svm_accuracy = accuracy_score(y_test, svm_predictions)
    mlflow.log_metric("accuracy", svm_accuracy)
    
    # Log confusion matrix as an image artifact
    log_confusion_matrix(y_test, svm_predictions, "confusion_matrix_svm.png")
    
    print("Support Vector Machine Accuracy:", svm_accuracy)

# Save the trained models and vectorizer
joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Random Forest Accuracy: 0.9945054945054945
Gradient Boosting Machine Accuracy: 0.978021978021978
Support Vector Machine Accuracy: 0.9945054945054945


['tfidf_vectorizer.joblib']