In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X = tfidf.fit_transform(data['cleaned_text'])

# Encode the skills
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['skills'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a multi-label classifier
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
hamming = hamming_loss(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
accuracy = accuracy_score(y_test, y_pred)

print(f'Hamming Loss: {hamming}')
print(f'F1 Score (Micro): {f1}')
print(f'Accuracy: {accuracy}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [4]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Remove rows with empty skill sets
data = data[data['skills'].map(len) > 0]

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X = tfidf.fit_transform(data['cleaned_text'])

# Encode the skills
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['skills'])

# Check if there are any classes with all zeros and remove them
y = y[:, y.sum(axis=0) > 0]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a multi-label classifier
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
hamming = hamming_loss(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
accuracy = accuracy_score(y_test, y_pred)

print(f'Hamming Loss: {hamming}')
print(f'F1 Score (Micro): {f1}')
print(f'Accuracy: {accuracy}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [5]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Print the first few rows to check the extracted skills
print(data[['cleaned_text', 'skills']].head())

# Remove rows with empty skill sets
data = data[data['skills'].map(len) > 0]

# Ensure there is more than one class in the target variable
if len(data) == 0:
    raise ValueError("No data available after skill extraction. Please check the skill extraction process.")

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X = tfidf.fit_transform(data['cleaned_text'])

# Encode the skills
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['skills'])

# Check class distribution
print("Class distribution (number of positive samples per class):", y.sum(axis=0))

# Ensure each class has more than one sample
if len(set(y.sum(axis=0))) == 1:
    raise ValueError("The data contains only one class. Please ensure there's variability in the skills.")

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a multi-label classifier
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
hamming = hamming_loss(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
accuracy = accuracy_score(y_test, y_pred)

print(f'Hamming Loss: {hamming}')
print(f'F1 Score (Micro): {f1}')
print(f'Accuracy: {accuracy}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                        cleaned_text  \
0  looking intern responsibilities responsibiliti...   
1  internship job description human resource inte...   
2  marketing intern lyskraft integral part market...   
3  job description good advanced excel vlookup pi...   
4  easyv seeking dynamic enthusiastic product man...   

                                              skills  
0  [kotlin, jetpack compose, android sdk, firebas...  
1  [excel, hr, communication, multitasking, micro...  
2  [marketing, social media, content creation, co...  
3  [excel, sales, hr, operations, communication, ...  
4  [rest, excel, marketing, product management, s...  
Class distribution (number of positive samples per class): [  7 124  16   1  13   7   1  13   1  23  99   8   3   6   1   2  87   1
  30   2  13  12   1   1   1   5   5   8  21  54  13   3  28   8   5  37
   1   8   3  13   8  21 170   9   6  49  29  37   2   4  22  18   1   1
   4   3]


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [6]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Remove rows with empty skill sets
data = data[data['skills'].map(len) > 0]

# Ensure there are at least two classes present
unique_classes = data['skills'].explode().unique()
if len(unique_classes) < 2:
    raise ValueError("There are not enough unique classes present in the data.")

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X = tfidf.fit_transform(data['cleaned_text'])

# Encode the skills
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['skills'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a multi-label classifier
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
hamming = hamming_loss(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
accuracy = accuracy_score(y_test, y_pred)

print(f'Hamming Loss: {hamming}')
print(f'F1 Score (Micro): {f1}')
print(f'Accuracy: {accuracy}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [7]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Remove rows with empty skill sets
data = data[data['skills'].map(len) > 0]

# Ensure there are at least two classes present
unique_classes = data['skills'].explode().unique()
if len(unique_classes) < 2:
    raise ValueError("There are not enough unique classes present in the data.")

# Print unique classes and shape of encoded labels
print("Unique Classes:", unique_classes)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['skills'])
print("Shape of encoded labels:", y.shape)

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X = tfidf.fit_transform(data['cleaned_text'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a multi-label classifier
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
hamming = hamming_loss(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
accuracy = accuracy_score(y_test, y_pred)

print(f'Hamming Loss: {hamming}')
print(f'F1 Score (Micro): {f1}')
print(f'Accuracy: {accuracy}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes: ['kotlin' 'jetpack compose' 'android sdk' 'firebase' 'rest' 'json' 'proto'
 'r' 'excel' 'hr' 'communication' 'multitasking' 'microsoft office' 'ai'
 'recruitment' 'marketing' 'social media' 'content creation' 'sales'
 'operations' 'product management' 'research' 'problem solving'
 'collaboration' 'market research' 'python' 'analytical skills'
 'organizational skills' 'inventory management' 'sql' 'data visualization'
 'b2b' 'java' 'javascript' 'product marketing' 'project management'
 'business development' 'lead generation' 'seo' 'aws' 'logistics'
 'procurement' 'supply chain management' 'machine learning' 'tableau'
 'power bi' 'react' 'big data' 'agile' 'email marketing' 'cloud computing'
 'scrum' 'statistical analysis' 'content strategy' 'tensorflow'
 'deep learning']
Shape of encoded labels: (170, 56)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [8]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Ensure there are at least two unique classes present after filtering
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data.")

# Print unique classes and shape of encoded labels
print("Unique Classes (Filtered):", unique_classes_filtered)
mlb = MultiLabelBinarizer()
y_filtered = mlb.fit_transform(data_filtered['skills'])
print("Shape of encoded labels (Filtered):", y_filtered.shape)

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X_filtered = tfidf.fit_transform(data_filtered['cleaned_text'])

# Split the filtered data into training and test sets
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train_filtered, y_train_filtered)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test_filtered)

# Calculate evaluation metrics for filtered data
hamming_filtered = hamming_loss(y_test_filtered, y_pred_filtered)
f1_filtered = f1_score(y_test_filtered, y_pred_filtered, average='micro')
accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)

print(f'Hamming Loss (Filtered): {hamming_filtered}')
print(f'F1 Score (Micro) (Filtered): {f1_filtered}')
print(f'Accuracy (Filtered): {accuracy_filtered}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['kotlin' 'jetpack compose' 'android sdk' 'firebase' 'rest' 'json' 'proto'
 'r' 'excel' 'hr' 'communication' 'multitasking' 'microsoft office' 'ai'
 'recruitment' 'marketing' 'social media' 'content creation' 'sales'
 'operations' 'product management' 'research' 'problem solving'
 'collaboration' 'market research' 'python' 'analytical skills'
 'organizational skills' 'inventory management' 'sql' 'data visualization'
 'b2b' 'java' 'javascript' 'product marketing' 'project management'
 'business development' 'lead generation' 'seo' 'aws' 'logistics'
 'procurement' 'supply chain management' 'machine learning' 'tableau'
 'power bi' 'react' 'big data' 'agile' 'email marketing' 'cloud computing'
 'scrum' 'statistical analysis' 'content strategy' 'tensorflow'
 'deep learning']
Shape of encoded labels (Filtered): (170, 56)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [9]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk
import numpy as np

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# List of skills
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function for text cleaning and skills extraction
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    extracted_skills = [skill for skill in skills_list if skill in text]
    return ' '.join(extracted_skills)

# Apply preprocessing
data['extracted_skills'] = data['job_description'].apply(preprocess_text)

# Filter out samples without extracted skills
data_filtered = data[data['extracted_skills'].apply(lambda x: len(x) > 0)]

# Count the number of samples per class
class_counts = data_filtered['extracted_skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data_filtered[data_filtered['extracted_skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Ensure there are at least two unique classes present after filtering
unique_classes_filtered = data_filtered['extracted_skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data.")

# Print unique classes and shape of encoded labels
print("Unique Classes (Filtered):", unique_classes_filtered)
mlb = MultiLabelBinarizer()
y_filtered = mlb.fit_transform(data_filtered['extracted_skills'])
print("Shape of encoded labels (Filtered):", y_filtered.shape)

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X_text_filtered = tfidf.fit_transform(data_filtered['extracted_skills'])

# Convert skills list into a binary vector representation
skills_vector = np.array([[1 if skill in skills else 0 for skill in skills_list] for skills in data_filtered['extracted_skills']])

# Concatenate TF-IDF vector and skills vector
X_filtered = np.hstack((X_text_filtered.toarray(), skills_vector))

# Split the filtered data into training and test sets
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train_filtered, y_train_filtered)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test_filtered)

# Calculate evaluation metrics for filtered data
hamming_filtered = hamming_loss(y_test_filtered, y_pred_filtered)
f1_filtered = f1_score(y_test_filtered, y_pred_filtered, average='micro')
accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)

print(f'Hamming Loss (Filtered): {hamming_filtered}')
print(f'F1 Score (Micro) (Filtered): {f1_filtered}')
print(f'Accuracy (Filtered): {accuracy_filtered}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['kotlin jetpack compose android sdk firebase rest json proto r'
 'excel hr communication multitasking microsoft office ai r recruitment'
 'marketing social media content creation communication ai r'
 'excel sales hr operations communication ai r'
 'rest excel marketing product management sales research problem solving communication collaboration ai r market research'
 'marketing product management research communication collaboration ai r market research'
 'python proto research communication ai r'
 'excel marketing sales research operations analytical skills communication organizational skills microsoft office ai r market research inventory management'
 'python sql excel data visualization marketing sales hr research problem solving communication ai r sql b2b'
 'sales research ai r' 'product management problem solving r'
 'excel hr problem solving communication r' 'excel marketing ai r'
 'rest excel hr operations communication organizational skills ai r rec

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [10]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk
import numpy as np

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# List of skills
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function for text cleaning and skills extraction
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    extracted_skills = [skill for skill in skills_list if skill in text]
    return ' '.join(extracted_skills)

# Apply preprocessing
data['extracted_skills'] = data['job_description'].apply(preprocess_text)

# Filter out samples without extracted skills
data_filtered = data[data['extracted_skills'].apply(lambda x: len(x) > 0)]

# Count the number of samples per class
class_counts = data_filtered['extracted_skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data_filtered[data_filtered['extracted_skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Ensure there are at least two unique classes present after filtering
unique_classes_filtered = data_filtered['extracted_skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data.")

# Print unique classes and shape of encoded labels
print("Unique Classes (Filtered):", unique_classes_filtered)
mlb = MultiLabelBinarizer()
y_filtered = mlb.fit_transform(data_filtered['extracted_skills'])
print("Shape of encoded labels (Filtered):", y_filtered.shape)

# Filter out samples with only one class
samples_with_multiple_classes = data_filtered[data_filtered['extracted_skills'].apply(lambda x: len(x.split()) > 1)]

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X_text_filtered = tfidf.fit_transform(samples_with_multiple_classes['extracted_skills'])

# Convert skills list into a binary vector representation
skills_vector = np.array([[1 if skill in skills else 0 for skill in skills_list] for skills in samples_with_multiple_classes['extracted_skills']])

# Concatenate TF-IDF vector and skills vector
X_filtered = np.hstack((X_text_filtered.toarray(), skills_vector))

# Split the filtered data into training and test sets
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train_filtered, y_train_filtered)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test_filtered)

# Calculate evaluation metrics for filtered data
hamming_filtered = hamming_loss(y_test_filtered, y_pred_filtered)
f1_filtered = f1_score(y_test_filtered, y_pred_filtered, average='micro')
accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)

print(f'Hamming Loss (Filtered): {hamming_filtered}')
print(f'F1 Score (Micro) (Filtered): {f1_filtered}')
print(f'Accuracy (Filtered): {accuracy_filtered}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['kotlin jetpack compose android sdk firebase rest json proto r'
 'excel hr communication multitasking microsoft office ai r recruitment'
 'marketing social media content creation communication ai r'
 'excel sales hr operations communication ai r'
 'rest excel marketing product management sales research problem solving communication collaboration ai r market research'
 'marketing product management research communication collaboration ai r market research'
 'python proto research communication ai r'
 'excel marketing sales research operations analytical skills communication organizational skills microsoft office ai r market research inventory management'
 'python sql excel data visualization marketing sales hr research problem solving communication ai r sql b2b'
 'sales research ai r' 'product management problem solving r'
 'excel hr problem solving communication r' 'excel marketing ai r'
 'rest excel hr operations communication organizational skills ai r rec

ValueError: Found input variables with inconsistent numbers of samples: [162, 170]

In [11]:
print("Shape of X_text_filtered:", X_text_filtered.shape)
print("Shape of skills_vector:", skills_vector.shape)
print("Shape of y_filtered:", y_filtered.shape)

Shape of X_text_filtered: (162, 70)
Shape of skills_vector: (162, 68)
Shape of y_filtered: (170, 28)


In [12]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Ensure there are at least two unique classes present after filtering
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data.")

# Print unique classes and shape of encoded labels
print("Unique Classes (Filtered):", unique_classes_filtered)
mlb = MultiLabelBinarizer()
y_filtered = mlb.fit_transform(data_filtered['skills'])
print("Shape of encoded labels (Filtered):", y_filtered.shape)

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X_text_filtered = tfidf.fit_transform(data_filtered['cleaned_text'])

# Concatenate TF-IDF features and skills vector
skills_vector = np.zeros((X_text_filtered.shape[0], len(skills_list)), dtype=int)
for i, skills in enumerate(data_filtered['skills']):
    for skill in skills:
        skills_vector[i, skills_list.index(skill)] = 1

X_filtered = np.hstack((X_text_filtered.toarray(), skills_vector))

# Split the filtered data into training and test sets
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train_filtered, y_train_filtered)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test_filtered)

# Calculate evaluation metrics for filtered data
hamming_filtered = hamming_loss(y_test_filtered, y_pred_filtered)
f1_filtered = f1_score(y_test_filtered, y_pred_filtered, average='micro')
accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)

print(f'Hamming Loss (Filtered): {hamming_filtered}')
print(f'F1 Score (Micro) (Filtered): {f1_filtered}')
print(f'Accuracy (Filtered): {accuracy_filtered}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['kotlin' 'jetpack compose' 'android sdk' 'firebase' 'rest' 'json' 'proto'
 'r' 'excel' 'hr' 'communication' 'multitasking' 'microsoft office' 'ai'
 'recruitment' 'marketing' 'social media' 'content creation' 'sales'
 'operations' 'product management' 'research' 'problem solving'
 'collaboration' 'market research' 'python' 'analytical skills'
 'organizational skills' 'inventory management' 'sql' 'data visualization'
 'b2b' 'java' 'javascript' 'product marketing' 'project management'
 'business development' 'lead generation' 'seo' 'aws' 'logistics'
 'procurement' 'supply chain management' 'machine learning' 'tableau'
 'power bi' 'react' 'big data' 'agile' 'email marketing' 'cloud computing'
 'scrum' 'statistical analysis' 'content strategy' 'tensorflow'
 'deep learning']
Shape of encoded labels (Filtered): (170, 56)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [13]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Ensure there are at least two unique classes present after filtering
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data.")

# Print unique classes and shape of encoded labels
print("Unique Classes (Filtered):", unique_classes_filtered)
mlb = MultiLabelBinarizer(classes=unique_classes_filtered)
y_filtered = mlb.fit_transform(data_filtered['skills'])
print("Shape of encoded labels (Filtered):", y_filtered.shape)

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X_text_filtered = tfidf.fit_transform(data_filtered['cleaned_text'])

# Concatenate TF-IDF features and skills vector
skills_vector = np.zeros((X_text_filtered.shape[0], len(unique_classes_filtered)), dtype=int)
for i, skills in enumerate(data_filtered['skills']):
    for skill in skills:
        skills_vector[i, np.where(unique_classes_filtered == skill)[0][0]] = 1

X_filtered = np.hstack((X_text_filtered.toarray(), skills_vector))

# Split the filtered data into training and test sets
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train_filtered, y_train_filtered)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test_filtered)

# Calculate evaluation metrics for filtered data
hamming_filtered = hamming_loss(y_test_filtered, y_pred_filtered)
f1_filtered = f1_score(y_test_filtered, y_pred_filtered, average='micro')
accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)

print(f'Hamming Loss (Filtered): {hamming_filtered}')
print(f'F1 Score (Micro) (Filtered): {f1_filtered}')
print(f'Accuracy (Filtered): {accuracy_filtered}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['kotlin' 'jetpack compose' 'android sdk' 'firebase' 'rest' 'json' 'proto'
 'r' 'excel' 'hr' 'communication' 'multitasking' 'microsoft office' 'ai'
 'recruitment' 'marketing' 'social media' 'content creation' 'sales'
 'operations' 'product management' 'research' 'problem solving'
 'collaboration' 'market research' 'python' 'analytical skills'
 'organizational skills' 'inventory management' 'sql' 'data visualization'
 'b2b' 'java' 'javascript' 'product marketing' 'project management'
 'business development' 'lead generation' 'seo' 'aws' 'logistics'
 'procurement' 'supply chain management' 'machine learning' 'tableau'
 'power bi' 'react' 'big data' 'agile' 'email marketing' 'cloud computing'
 'scrum' 'statistical analysis' 'content strategy' 'tensorflow'
 'deep learning']
Shape of encoded labels (Filtered): (170, 56)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [14]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Ensure each class has at least two samples
min_class_samples = 2
class_counts_filtered = data_filtered['skills'].explode().value_counts()
valid_classes_filtered = class_counts_filtered[class_counts_filtered >= min_class_samples].index

# Filter data again to keep only samples with valid classes
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: any(skill in valid_classes_filtered for skill in x))]

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")

# Print unique classes and shape of encoded labels again
print("Unique Classes (Filtered):", unique_classes_filtered)
mlb = MultiLabelBinarizer(classes=unique_classes_filtered)
y_filtered = mlb.fit_transform(data_filtered['skills'])
print("Shape of encoded labels (Filtered):", y_filtered.shape)

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X_text_filtered = tfidf.fit_transform(data_filtered['cleaned_text'])

# Concatenate TF-IDF features and skills vector
skills_vector = np.zeros((X_text_filtered.shape[0], len(skills_list)), dtype=int)
for i, skills in enumerate(data_filtered['skills']):
    for skill in skills:
        skills_vector[i, skills_list.index(skill)] = 1

X_filtered = np.hstack((X_text_filtered.toarray(), skills_vector))

# Split the filtered data into training and test sets
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train_filtered, y_train_filtered)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test_filtered)

# Calculate evaluation metrics for filtered data
hamming_filtered = hamming_loss(y_test_filtered, y_pred_filtered)
f1_filtered = f1_score(y_test_filtered, y_pred_filtered, average='micro')
accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)

print(f'Hamming Loss (Filtered): {hamming_filtered}')
print(f'F1 Score (Micro) (Filtered): {f1_filtered}')
print(f'Accuracy (Filtered): {accuracy_filtered}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['kotlin' 'jetpack compose' 'android sdk' 'firebase' 'rest' 'json' 'proto'
 'r' 'excel' 'hr' 'communication' 'multitasking' 'microsoft office' 'ai'
 'recruitment' 'marketing' 'social media' 'content creation' 'sales'
 'operations' 'product management' 'research' 'problem solving'
 'collaboration' 'market research' 'python' 'analytical skills'
 'organizational skills' 'inventory management' 'sql' 'data visualization'
 'b2b' 'java' 'javascript' 'product marketing' 'project management'
 'business development' 'lead generation' 'seo' 'aws' 'logistics'
 'procurement' 'supply chain management' 'machine learning' 'tableau'
 'power bi' 'react' 'big data' 'agile' 'email marketing' 'cloud computing'
 'scrum' 'statistical analysis' 'content strategy' 'tensorflow'
 'deep learning']
Shape of encoded labels (Filtered): (170, 56)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [15]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")

# Print unique classes and shape of encoded labels
print("Unique Classes (Filtered):", unique_classes_filtered)
mlb = MultiLabelBinarizer()
y_filtered = mlb.fit_transform(data_filtered['skills'])
print("Shape of encoded labels (Filtered):", y_filtered.shape)

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text
X_text_filtered = tfidf.fit_transform(data_filtered['cleaned_text'])

# Concatenate TF-IDF features and skills vector
skills_vector = np.zeros((X_text_filtered.shape[0], len(skills_list)), dtype=int)
for i, skills in enumerate(data_filtered['skills']):
    for skill in skills:
        skills_vector[i, skills_list.index(skill)] = 1

X_filtered = np.hstack((X_text_filtered.toarray(), skills_vector))

# Split the filtered data into training and test sets
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train_filtered, y_train_filtered)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test_filtered)

# Calculate evaluation metrics for filtered data
hamming_filtered = hamming_loss(y_test_filtered, y_pred_filtered)
f1_filtered = f1_score(y_test_filtered, y_pred_filtered, average='micro')
accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)

print(f'Hamming Loss (Filtered): {hamming_filtered}')
print(f'F1 Score (Micro) (Filtered): {f1_filtered}')
print(f'Accuracy (Filtered): {accuracy_filtered}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['kotlin' 'jetpack compose' 'android sdk' 'firebase' 'rest' 'json' 'proto'
 'r' 'excel' 'hr' 'communication' 'multitasking' 'microsoft office' 'ai'
 'recruitment' 'marketing' 'social media' 'content creation' 'sales'
 'operations' 'product management' 'research' 'problem solving'
 'collaboration' 'market research' 'python' 'analytical skills'
 'organizational skills' 'inventory management' 'sql' 'data visualization'
 'b2b' 'java' 'javascript' 'product marketing' 'project management'
 'business development' 'lead generation' 'seo' 'aws' 'logistics'
 'procurement' 'supply chain management' 'machine learning' 'tableau'
 'power bi' 'react' 'big data' 'agile' 'email marketing' 'cloud computing'
 'scrum' 'statistical analysis' 'content strategy' 'tensorflow'
 'deep learning']
Shape of encoded labels (Filtered): (161, 56)


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [16]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")

# Print unique classes and their counts
print("Unique Classes (Filtered):", unique_classes_filtered)
print("Counts per Class (Filtered):")
print(data_filtered['skills'].explode().value_counts())

# Remove samples belonging to classes with only one sample after the filtering process
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")

# Print unique classes and their
# Print unique classes and their counts
print("Unique Classes (Filtered):", unique_classes_filtered)
print("Counts per Class (Filtered):")
print(data_filtered['skills'].explode().value_counts())

# Transform skills into binary labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data_filtered['skills'])

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000, lowercase=False)
X = vectorizer.fit_transform(data_filtered['cleaned_text'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train, y_train)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred_filtered, average='micro')
hamming = hamming_loss(y_test, y_pred_filtered)
accuracy = accuracy_score(y_test, y_pred_filtered)

print("F1 Score (Micro):", f1)
print("Hamming Loss:", hamming)
print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['kotlin' 'jetpack compose' 'android sdk' 'firebase' 'rest' 'json' 'proto'
 'r' 'excel' 'hr' 'communication' 'multitasking' 'microsoft office' 'ai'
 'recruitment' 'marketing' 'social media' 'content creation' 'sales'
 'operations' 'product management' 'research' 'problem solving'
 'collaboration' 'market research' 'python' 'analytical skills'
 'organizational skills' 'inventory management' 'sql' 'data visualization'
 'b2b' 'java' 'javascript' 'product marketing' 'project management'
 'business development' 'lead generation' 'seo' 'aws' 'logistics'
 'procurement' 'supply chain management' 'machine learning' 'tableau'
 'power bi' 'react' 'big data' 'agile' 'email marketing' 'cloud computing'
 'scrum' 'statistical analysis' 'content strategy' 'tensorflow'
 'deep learning']
Counts per Class (Filtered):
r                          161
ai                         124
communication               99
excel                       87
marketing                   54
research

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [17]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")

# Print unique classes and their counts
print("Unique Classes (Filtered):", unique_classes_filtered)
print("Counts per Class (Filtered):")
print(data_filtered['skills'].explode().value_counts())

# Transform skills into binary labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data_filtered['skills'])

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000, lowercase=False)
X = vectorizer.fit_transform(data_filtered['cleaned_text'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train, y_train)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred_filtered, average='micro')
hamming = hamming_loss(y_test, y_pred_filtered)
accuracy = accuracy_score(y_test, y_pred_filtered)

print("F1 Score (Micro):", f1)
print("Hamming Loss:", hamming)
print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['kotlin' 'jetpack compose' 'android sdk' 'firebase' 'rest' 'json' 'proto'
 'r' 'excel' 'hr' 'communication' 'multitasking' 'microsoft office' 'ai'
 'recruitment' 'marketing' 'social media' 'content creation' 'sales'
 'operations' 'product management' 'research' 'problem solving'
 'collaboration' 'market research' 'python' 'analytical skills'
 'organizational skills' 'inventory management' 'sql' 'data visualization'
 'b2b' 'java' 'javascript' 'product marketing' 'project management'
 'business development' 'lead generation' 'seo' 'aws' 'logistics'
 'procurement' 'supply chain management' 'machine learning' 'tableau'
 'power bi' 'react' 'big data' 'agile' 'email marketing' 'cloud computing'
 'scrum' 'statistical analysis' 'content strategy' 'tensorflow'
 'deep learning']
Counts per Class (Filtered):
r                          161
ai                         124
communication               99
excel                       87
marketing                   54
research

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [18]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")

# Print unique classes and their counts
print("Unique Classes (Filtered):", unique_classes_filtered)
print("Counts per Class (Filtered):")
print(data_filtered['skills'].explode().value_counts())

# Transform skills into binary labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data_filtered['skills'])

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000, lowercase=False)
X = vectorizer.fit_transform(data_filtered['cleaned_text'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train, y_train)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred_filtered, average='micro')
hamming = hamming_loss(y_test, y_pred_filtered)
accuracy = accuracy_score(y_test, y_pred_filtered)

print("F1 Score (Micro):", f1)
print("Hamming Loss:", hamming)
print("Accuracy Score:", accuracy)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['kotlin' 'jetpack compose' 'android sdk' 'firebase' 'rest' 'json' 'proto'
 'r' 'excel' 'hr' 'communication' 'multitasking' 'microsoft office' 'ai'
 'recruitment' 'marketing' 'social media' 'content creation' 'sales'
 'operations' 'product management' 'research' 'problem solving'
 'collaboration' 'market research' 'python' 'analytical skills'
 'organizational skills' 'inventory management' 'sql' 'data visualization'
 'b2b' 'java' 'javascript' 'product marketing' 'project management'
 'business development' 'lead generation' 'seo' 'aws' 'logistics'
 'procurement' 'supply chain management' 'machine learning' 'tableau'
 'power bi' 'react' 'big data' 'agile' 'email marketing' 'cloud computing'
 'scrum' 'statistical analysis' 'content strategy' 'tensorflow'
 'deep learning']
Counts per Class (Filtered):
r                          161
ai                         124
communication               99
excel                       87
marketing                   54
research

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [19]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")

# Print unique classes and their counts
print("Unique Classes (Filtered):", unique_classes_filtered)
print("Counts per Class (Filtered):")
print(data_filtered['skills'].explode().value_counts())

# Transform skills into binary labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data_filtered['skills'])

# Filter out outlier classes
class_counts_filtered = np.sum(labels, axis=0)
valid_class_indices = np.where(class_counts_filtered >= 2)[0]

# Filter the labels and keep only valid classes
filtered_labels = labels[:, valid_class_indices]

# Filter the TF-IDF features based on valid classes
X_filtered = X[:, valid_class_indices]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000, lowercase=False)
X = vectorizer.fit_transform(data_filtered['cleaned_text'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_filtered, filtered_labels, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train, y_train)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred_filtered, average='micro')
hamming = hamming_loss(y_test, y_pred_filtered)
accuracy = accuracy_score(y_test, y_pred_filtered)

print("F1 Score (Micro):", f1)
print("Hamming Loss:", hamming)
print("Accuracy Score:", accuracy)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['kotlin' 'jetpack compose' 'android sdk' 'firebase' 'rest' 'json' 'proto'
 'r' 'excel' 'hr' 'communication' 'multitasking' 'microsoft office' 'ai'
 'recruitment' 'marketing' 'social media' 'content creation' 'sales'
 'operations' 'product management' 'research' 'problem solving'
 'collaboration' 'market research' 'python' 'analytical skills'
 'organizational skills' 'inventory management' 'sql' 'data visualization'
 'b2b' 'java' 'javascript' 'product marketing' 'project management'
 'business development' 'lead generation' 'seo' 'aws' 'logistics'
 'procurement' 'supply chain management' 'machine learning' 'tableau'
 'power bi' 'react' 'big data' 'agile' 'email marketing' 'cloud computing'
 'scrum' 'statistical analysis' 'content strategy' 'tensorflow'
 'deep learning']
Counts per Class (Filtered):
r                          161
ai                         124
communication               99
excel                       87
marketing                   54
research

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [20]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Filter out outliers and delete them
outlier_classes = class_counts[class_counts < 2].index
data_filtered['skills'] = data_filtered['skills'].apply(lambda x: [skill for skill in x if skill not in outlier_classes])

# Reset index after deleting rows
data_filtered.reset_index(drop=True, inplace=True)

# Convert filtered data to features and labels
X_filtered = data_filtered['cleaned_text']
y_filtered = data_filtered.drop(columns=['cleaned_text'])

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    raise ValueError("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")

# Print unique classes and their counts
print("Unique Classes (Filtered):", unique_classes_filtered)
print("Counts per Class (Filtered):")
print(data_filtered['skills'].explode().value_counts())

# Transform skills into binary labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data_filtered['skills'])

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000, lowercase=False)
X = vectorizer.fit_transform(data_filtered['cleaned_text'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train, y_train)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred_filtered, average='micro')
hamming = hamming_loss(y_test, y_pred_filtered)
accuracy = accuracy_score(y_test, y_pred_filtered)

print("F1 Score (Micro):", f1)
print("Hamming Loss:", hamming)
print("Accuracy Score:", accuracy)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['rest' 'proto' 'r' 'excel' 'hr' 'communication' 'multitasking'
 'microsoft office' 'ai' 'recruitment' 'marketing' 'social media'
 'content creation' 'sales' 'operations' 'product management' 'research'
 'problem solving' 'collaboration' 'market research' 'python'
 'analytical skills' 'organizational skills' 'inventory management' 'sql'
 'data visualization' 'b2b' 'java' 'javascript' 'product marketing'
 'project management' 'business development' 'lead generation' 'seo' 'aws'
 'logistics' 'machine learning' 'tableau' 'power bi' 'react' 'agile'
 'email marketing' 'scrum' 'content strategy' 'tensorflow']
Counts per Class (Filtered):
r                        161
ai                       124
communication             99
excel                     87
marketing                 54
research                  49
problem solving           37
sales                     37
sql                       36
hr                        30
rest                      29
operations    

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [21]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Combine the 'role_description' and 'description' columns
data['job_description'] = data['role_description'] + ' ' + data['description']

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Filter out outliers and delete them
outlier_classes = class_counts[class_counts < 2].index
data_filtered['skills'] = data_filtered['skills'].apply(lambda x: [skill for skill in x if skill not in outlier_classes])

# Reset index after deleting rows
data_filtered.reset_index(drop=True, inplace=True)

# Convert filtered data to features and labels
X_filtered = data_filtered['cleaned_text']
y_filtered = data_filtered.drop(columns=['cleaned_text'])

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    print("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")
else:
    # Print unique classes and their counts
    print("Unique Classes (Filtered):", unique_classes_filtered)
    print("Counts per Class (Filtered):")
    print(data_filtered['skills'].explode().value_counts())

    # Transform skills into binary labels
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(data_filtered['skills'])

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_features=1000, lowercase=False)
    X = vectorizer.fit_transform(data_filtered['cleaned_text'])

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    # Train a multi-label classifier on filtered data
    model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
    model_filtered.fit(X_train, y_train)

    # Predict on the test set
    y_pred_filtered = model_filtered.predict(X_test)

    # Evaluate the model
    f1 = f1_score(y_test, y_pred_filtered, average='micro')
    hamming = hamming_loss(y_test, y_pred_filtered)
    accuracy = accuracy_score(y_test, y_pred_filtered)

    print("F1 Score (Micro):", f1)
    print("Hamming Loss:", hamming)
    print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['rest' 'proto' 'r' 'excel' 'hr' 'communication' 'multitasking'
 'microsoft office' 'ai' 'recruitment' 'marketing' 'social media'
 'content creation' 'sales' 'operations' 'product management' 'research'
 'problem solving' 'collaboration' 'market research' 'python'
 'analytical skills' 'organizational skills' 'inventory management' 'sql'
 'data visualization' 'b2b' 'java' 'javascript' 'product marketing'
 'project management' 'business development' 'lead generation' 'seo' 'aws'
 'logistics' 'machine learning' 'tableau' 'power bi' 'react' 'agile'
 'email marketing' 'scrum' 'content strategy' 'tensorflow']
Counts per Class (Filtered):
r                        161
ai                       124
communication             99
excel                     87
marketing                 54
research                  49
problem solving           37
sales                     37
sql                       36
hr                        30
rest                      29
operations    

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [22]:
print("Shape of y_train:", y_train.shape)


Shape of y_train: (128, 45)


In [23]:
print("Unique values in y_train:", np.unique(y_train))


Unique values in y_train: [0 1]


In [24]:
class_counts_train = np.sum(y_train, axis=0)
print("Counts per class in y_train:")
print(class_counts_train)


Counts per class in y_train:
[  5  97  12   8   5  12  18  76   5   1   3   2  64  25   2  10   9   3
   3   6  17  44  10   1  20   7   4  27   7   0  11   7  16 128   8   4
  40  23  32   2   2  18  12   3   3]


In [25]:
unique_counts_train = np.unique(class_counts_train)
print("Unique counts of class labels in y_train:", unique_counts_train)


Unique counts of class labels in y_train: [  0   1   2   3   4   5   6   7   8   9  10  11  12  16  17  18  20  23
  25  27  32  40  44  64  76  97 128]


In [26]:
# Find indices of classes with at least one sample
indices = np.where(class_counts_train > 0)[0]

# Filter X_train and y_train
X_train_filtered = X_train[:, indices]
y_train_filtered = y_train[:, indices]

# Print the shape of filtered data
print("Shape of X_train_filtered:", X_train_filtered.shape)
print("Shape of y_train_filtered:", y_train_filtered.shape)


Shape of X_train_filtered: (128, 44)
Shape of y_train_filtered: (128, 44)


In [27]:
# Train a multi-label classifier on filtered data
model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_filtered.fit(X_train_filtered, y_train_filtered)

# Predict on the test set
y_pred_filtered = model_filtered.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred_filtered, average='micro')
hamming = hamming_loss(y_test, y_pred_filtered)
accuracy = accuracy_score(y_test, y_pred_filtered)

print("F1 Score (Micro):", f1)
print("Hamming Loss:", hamming)
print("Accuracy Score:", accuracy)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [28]:
print("Unique values in y_train_filtered:", np.unique(y_train_filtered))


Unique values in y_train_filtered: [0 1]


In [29]:
from sklearn.linear_model import LogisticRegression

# Initialize a list to store individual models
models = []

# Train a logistic regression model for each label
for i in range(y_train_filtered.shape[1]):
    # Initialize a logistic regression model
    model = LogisticRegression(max_iter=1000)
    # Fit the model to the corresponding label column
    model.fit(X_train_filtered, y_train_filtered[:, i])
    # Append the trained model to the list
    models.append(model)

# Predict on the test set for each label
y_pred_filtered = np.array([model.predict(X_test) for model in models]).T

# Evaluate the model
f1 = f1_score(y_test, y_pred_filtered, average='micro')
hamming = hamming_loss(y_test, y_pred_filtered)
accuracy = accuracy_score(y_test, y_pred_filtered)

print("F1 Score (Micro):", f1)
print("Hamming Loss:", hamming)
print("Accuracy Score:", accuracy)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [30]:
print("Shape of y_train_filtered:", y_train_filtered.shape)
print("Sample of y_train_filtered:", y_train_filtered[:5])

Shape of y_train_filtered: (128, 44)
Sample of y_train_filtered: [[0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0
  0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
  0 0 0 0 0 0 0 0]
 [0 1 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1
  0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1
  0 1 0 0 0 0 0 0]]


In [31]:
# Check for samples with only zeros
zero_samples = np.where(~y_train_filtered.any(axis=1))[0]
print("Indices of samples with only zeros:", zero_samples)

# Check for columns with only zeros
zero_columns = np.where(~y_train_filtered.any(axis=0))[0]
print("Indices of columns with only zeros:", zero_columns)


Indices of samples with only zeros: []
Indices of columns with only zeros: []


In [32]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Function to combine role_description and description, handling empty values
def combine_descriptions(row):
    if row['role_description'] and row['description']:  # If both columns are non-empty
        return row['role_description'] + ' ' + row['description']
    elif row['role_description']:  # If only role_description is non-empty
        return row['role_description']
    elif row['description']:  # If only description is non-empty
        return row['description']
    else:  # If both are empty, handle as needed (e.g., assign a placeholder)
        return 'No job description available'

# Apply the function to combine descriptions
data['job_description'] = data.apply(combine_descriptions, axis=1)

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Filter out outliers and delete them
outlier_classes = class_counts[class_counts < 2].index
data_filtered['skills'] = data_filtered['skills'].apply(lambda x: [skill for skill in x if skill not in outlier_classes])

# Reset index after deleting rows
data_filtered.reset_index(drop=True, inplace=True)

# Convert filtered data to features and labels
X_filtered = data_filtered['cleaned_text']
y_filtered = data_filtered.drop(columns=['cleaned_text'])

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    print("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")
else:
    # Print unique classes and their counts
    print("Unique Classes (Filtered):", unique_classes_filtered)
    print("Counts per Class (Filtered):")
    print(data_filtered['skills'].explode().value_counts())

    # Transform skills into binary labels
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(data_filtered['skills'])

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_features=1000, lowercase=False)
    X = vectorizer.fit_transform(data_filtered['cleaned_text'])

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    # Train a multi-label classifier on filtered data
    model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
    model_filtered.fit(X_train, y_train)

    # Predict on the test set
    y_pred_filtered = model_filtered.predict(X_test)

    # Evaluate the model
    f1 = f1_score(y_test, y_pred_filtered, average='micro')
    hamming = hamming_loss(y_test, y_pred_filtered)
    accuracy = accuracy_score(y_test, y_pred_filtered)

    print("F1 Score (Micro):", f1)
    print("Hamming Loss:", hamming)
    print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['rest' 'proto' 'r' 'excel' 'hr' 'communication' 'multitasking'
 'microsoft office' 'ai' 'recruitment' 'marketing' 'social media'
 'content creation' 'sales' 'operations' 'product management' 'research'
 'problem solving' 'collaboration' 'market research' 'python'
 'analytical skills' 'organizational skills' 'inventory management' 'sql'
 'data visualization' 'b2b' 'java' 'javascript' 'product marketing'
 'project management' 'business development' 'lead generation' 'seo' 'aws'
 'logistics' 'machine learning' 'tableau' 'power bi' 'react' 'agile'
 'email marketing' 'scrum' 'content strategy' 'tensorflow']
Counts per Class (Filtered):
r                        161
ai                       124
communication             99
excel                     87
marketing                 54
research                  49
problem solving           37
sales                     37
sql                       36
hr                        30
rest                      29
operations    

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [33]:
# Shape of y_train_filtered
print("Shape of y_train_filtered:", y_train_filtered.shape)

# Unique values in y_train_filtered
unique_values = np.unique(y_train_filtered)
print("Unique values in y_train_filtered:", unique_values)

# Sample of y_train_filtered
print("Sample of y_train_filtered:", y_train_filtered[:5])

Shape of y_train_filtered: (128, 44)
Unique values in y_train_filtered: [0 1]
Sample of y_train_filtered: [[0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0
  0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
  0 0 0 0 0 0 0 0]
 [0 1 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1
  0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1
  0 1 0 0 0 0 0 0]]


In [34]:
missing_class_columns = np.where((y_train_filtered.sum(axis=0) == 0) | (y_train_filtered.sum(axis=0) == len(y_train_filtered)))[0]
print("Indices of columns with only one class:", missing_class_columns)

Indices of columns with only one class: [32]


In [35]:
column_32_data = y_train_filtered[:, 32]
print("Sample of data in column 32 of y_train_filtered:", column_32_data[:5])

Sample of data in column 32 of y_train_filtered: [1 1 1 1 1]


In [36]:
# Remove column 32 from y_train_filtered
y_train_filtered = np.delete(y_train_filtered, 32, axis=1)

# Verify the new shape of y_train_filtered
print("Shape of y_train_filtered after removing column 32:", y_train_filtered.shape)

Shape of y_train_filtered after removing column 32: (128, 43)


In [37]:
# Ensure there are at least two unique classes present after removing the column
unique_classes_filtered = np.unique(y_train_filtered)
if len(unique_classes_filtered) < 2:
    print("There are not enough unique classes present in the filtered data after removing the column.")
else:
    # Print unique classes and their counts
    print("Unique Classes (Filtered):", unique_classes_filtered)
    print("Counts per Class (Filtered):")
    print(pd.Series(y_train_filtered.flatten()).value_counts())

    # Transform skills into binary labels
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(data_filtered['skills'])

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_features=1000, lowercase=False)
    X = vectorizer.fit_transform(data_filtered['cleaned_text'])

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    # Train a multi-label classifier on filtered data
    model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
    model_filtered.fit(X_train, y_train)

    # Predict on the test set
    y_pred_filtered = model_filtered.predict(X_test)

    # Evaluate the model
    f1 = f1_score(y_test, y_pred_filtered, average='micro')
    hamming = hamming_loss(y_test, y_pred_filtered)
    accuracy = accuracy_score(y_test, y_pred_filtered)

    print("F1 Score (Micro):", f1)
    print("Hamming Loss:", hamming)
    print("Accuracy Score:", accuracy)

Unique Classes (Filtered): [0 1]
Counts per Class (Filtered):
0    4820
1     684
dtype: int64


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)
data['description'].fillna('', inplace=True)

# Function to combine role_description and description, handling empty values
def combine_descriptions(row):
    if row['role_description'] and row['description']:  # If both columns are non-empty
        return row['role_description'] + ' ' + row['description']
    elif row['role_description']:  # If only role_description is non-empty
        return row['role_description']
    elif row['description']:  # If only description is non-empty
        return row['description']
    else:  # If both are empty, handle as needed (e.g., assign a placeholder)
        return 'No job description available'

# Apply the function to combine descriptions
data['job_description'] = data.apply(combine_descriptions, axis=1)

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['job_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Filter out outliers and delete them
outlier_classes = class_counts[class_counts < 2].index
data_filtered['skills'] = data_filtered['skills'].apply(lambda x: [skill for skill in x if skill not in outlier_classes])

# Reset index after deleting rows
data_filtered.reset_index(drop=True, inplace=True)

# Convert filtered data to features and labels
X_filtered = data_filtered['cleaned_text']
y_filtered = data_filtered.drop(columns=['cleaned_text'])

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    print("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")
else:
    # Print unique classes and their counts
    print("Unique Classes (Filtered):", unique_classes_filtered)
    print("Counts per Class (Filtered):")
    print(data_filtered['skills'].explode().value_counts())

    # Transform skills into binary labels
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(data_filtered['skills'])

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_features=1000, lowercase=False)
    X = vectorizer.fit_transform(data_filtered['cleaned_text'])

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    # Ensure at least two classes in y_train and y_test
    y_train_valid = y_train[:, y_train.sum(axis=0) > 0]
    y_test_valid = y_test[:, y_train.sum(axis=0) > 0]

    # Train a multi-label classifier on filtered data
    model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
    model_filtered.fit(X_train, y_train_valid)

    # Predict on the test set
    y_pred_filtered = model_filtered.predict(X_test)

    # Evaluate the model
    f1 = f1_score(y_test_valid, y_pred_filtered, average='micro')
    hamming = hamming_loss(y_test_valid, y_pred_filtered)
    accuracy = accuracy_score(y_test_valid, y_pred_filtered)

    print("F1 Score (Micro):", f1)
    print("Hamming Loss:", hamming)
    print("Accuracy Score:", accuracy)


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Merge role_description and description columns to form job_description
df['job_description'] = df['role_description'].fillna('') + ' ' + df['description'].fillna('')

# Assume 'skills' column contains the required skills as a list (e.g., ["skill1", "skill2"])
# For example: df['skills'] = [["skill1", "skill2"], ["skill3"], ...]

# Convert skills to binary vector
mlb = MultiLabelBinarizer()
skills_encoded = mlb.fit_transform(df['skills'])

# Split the data into training, validation, and test sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(df['job_description'], skills_encoded, test_size=0.2, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

# Define a custom dataset class
class JobDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_len = 512

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(text, return_tensors='pt', max_length=self.max_len, truncation=True, padding='max_length')
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': torch.tensor(label, dtype=torch.float)}

# Create DataLoader objects
train_dataset = JobDataset(train_texts, train_labels)
val_dataset = JobDataset(val_texts, val_labels)
test_dataset = JobDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


ModuleNotFoundError: No module named 'torch'

In [4]:
pip install torch torchvision torchaudio







Collecting torch
  Downloading torch-2.3.0-cp310-cp310-win_amd64.whl.metadata (26 kB)
Collecting torchvision
  Downloading torchvision-0.18.0-cp310-cp310-win_amd64.whl.metadata (6.6 kB)
Collecting torchaudio
  Downloading torchaudio-2.3.0-cp310-cp310-win_amd64.whl.metadata (6.4 kB)
Collecting filelock (from torch)
  Downloading filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy (from torch)
  Downloading sympy-1.12.1-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch)
  Downloading mkl-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  Downloading intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.2 kB)
Collecting tbb==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch)
  D

In [5]:
pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Note: you may need to restart the kernel to use updated packages.




In [6]:
pip install transformers


Collecting transformersNote: you may need to restart the kernel to use updated packages.





  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.8 kB ? eta -:--:--
     ---------------------------------------- 43.8/43.8 kB 2.1 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.2-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp310-none-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
   ---------------------------------------- 0.0/9.1 MB ? eta -:--:--
    --------------------------------------- 0.2/9.1 MB 5.9 MB/s eta 0:00:02
   - -------------------------------------- 0.4/9.1 MB 5.0 MB/s eta 0:00:02
   -- ------------------------------------- 0.6/9.1 MB 4.8 MB/s eta 0:00:02
   --- --------------------------------

In [7]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.




In [8]:
pip install pandas


Note: you may need to restart the kernel to use updated packages.




In [9]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from transformers import AdamW
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import accuracy_score, recall_score, ndcg_score

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Merge role_description and description columns to form job_description
df['job_description'] = df['role_description'].fillna('') + ' ' + df['description'].fillna('')

# Convert skills to binary vector
mlb = MultiLabelBinarizer()
skills_encoded = mlb.fit_transform(df['skills'])

# Split the data into training, validation, and test sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(df['job_description'], skills_encoded, test_size=0.2, random_state=42)
val_texts, test_texts, val_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

# Define a custom dataset class
class JobDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_len = 512

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(text, return_tensors='pt', max_length=self.max_len, truncation=True, padding='max_length')
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': torch.tensor(label, dtype=torch.float)}

# Create DataLoader objects
train_dataset = JobDataset(train_texts, train_labels)
val_dataset = JobDataset(val_texts, val_labels)
test_dataset = JobDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

class BERTXMLC(nn.Module):
    def __init__(self, num_labels):
        super(BERTXMLC, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.bottleneck = nn.Linear(768, 256)
        self.tanh = nn.Tanh()
        self.classifier = nn.Linear(256, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_output)
        x = self.tanh(self.bottleneck(x))
        x = self.classifier(x)
        return x

num_labels = skills_encoded.shape[1]
model = BERTXMLC(num_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = BCEWithLogitsLoss()

# Training loop
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Training Loss: {avg_train_loss}')

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}, Validation Loss: {avg_val_loss}')

# Evaluation
model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask)
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(outputs.cpu().numpy())

# Convert predictions to binary
threshold = 0.5
binary_preds = (all_preds > threshold).astype(int)

# Calculate metrics
accuracy = accuracy_score(all_labels, binary_preds)
recall_at_5 = recall_score(all_labels, binary_preds, average='samples', k=5)
ndcg_at_5 = ndcg_score(all_labels, binary_preds, k=5)

print(f'Accuracy: {accuracy}')
print(f'Recall@5: {recall_at_5}')
print(f'nDCG@5: {ndcg_at_5}')


NameError: name 'df' is not defined

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from transformers import AdamW
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import accuracy_score, recall_score, ndcg_score

# Load the dataset
data = pd.read_csv('C:/Users/hsahn/Downloads/job_details.csv')

# Merge role_description and description columns to form job_description
data['job_description'] = data['role_description'].fillna('') + ' ' + data['description'].fillna('')

# Here, assuming 'skills' column exists and is a list of skills. You might need to adjust this depending on your actual dataset format.
# Convert skills to binary vector
mlb = MultiLabelBinarizer()
skills_encoded = mlb.fit_transform(data['skills'].apply(lambda x: x.split(',')))  # Adjust the split method based on actual data format

# Split the data into training, validation, and test sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(data['job_description'], skills_encoded, test_size=0.2, random_state=42)
val_texts, test_texts, val_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

# Define a custom dataset class
class JobDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_len = 512

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(text, return_tensors='pt', max_length=self.max_len, truncation=True, padding='max_length')
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': torch.tensor(label, dtype=torch.float)}

# Create DataLoader objects
train_dataset = JobDataset(train_texts, train_labels)
val_dataset = JobDataset(val_texts, val_labels)
test_dataset = JobDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

class BERTXMLC(nn.Module):
    def __init__(self, num_labels):
        super(BERTXMLC, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.bottleneck = nn.Linear(768, 256)
        self.tanh = nn.Tanh()
        self.classifier = nn.Linear(256, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_output)
        x = self.tanh(self.bottleneck(x))
        x = self.classifier(x)
        return x

num_labels = skills_encoded.shape[1]
model = BERTXMLC(num_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = BCEWithLogitsLoss()

# Training loop
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Training Loss: {avg_train_loss}')

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}, Validation Loss: {avg_val_loss}')

# Evaluation
model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask)
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(outputs.cpu().numpy())

# Convert predictions to binary
threshold = 0.5
binary_preds = (all_preds > threshold).astype(int)

# Calculate metrics
accuracy = accuracy_score(all_labels, binary_preds)
recall_at_5 = recall_score(all_labels, binary_preds, average='samples')
ndcg_at_5 = ndcg_score(all_labels, binary_preds, k=5)

print(f'Accuracy: {accuracy}')
print(f'Recall@5: {recall_at_5}')
print(f'nDCG@5: {ndcg_at_5}')


KeyError: 'skills'

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from transformers import AdamW
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import accuracy_score, recall_score, ndcg_score

# Vectorized skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Load the dataset
data = pd.read_csv('C:/Users/hsahn/Downloads/job_details.csv')

# Merge role_description and description columns to form job_description
data['job_description'] = data['role_description'].fillna('') + ' ' + data['description'].fillna('')

# Convert skills to binary vector
mlb = MultiLabelBinarizer(classes=skills_list)
skills_encoded = mlb.fit_transform(your_skills_data)  # Replace your_skills_data with your actual skills data

# Split the data into training, validation, and test sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(data['job_description'], skills_encoded, test_size=0.2, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

# Define a custom dataset class
class JobDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_len = 512

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(text, return_tensors='pt', max_length=self.max_len, truncation=True, padding='max_length')
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': torch.tensor(label, dtype=torch.float)}

# Create DataLoader objects
train_dataset = JobDataset(train_texts, train_labels)
val_dataset = JobDataset(val_texts, val_labels)
test_dataset = JobDataset(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

class BERTXMLC(nn.Module):
    def __init__(self, num_labels):
        super(BERTXMLC, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.bottleneck = nn.Linear(768, 256)
        self.tanh = nn.Tanh()
        self.classifier = nn.Linear(256, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls_output)
        x = self.tanh(self.bottleneck(x))
        x = self.classifier(x)
        return x

num_labels = len(skills_list)
model = BERTXMLC(num_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = BCEWithLogitsLoss()

# Training loop
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Training Loss: {avg_train_loss}')

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}, Validation Loss: {avg_val_loss}')

# Evaluation
model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask)
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(outputs.cpu().numpy())

# Convert predictions to binary
threshold = 0.5
binary_preds = (torch.sigmoid(torch.tensor(all_preds)) > threshold).int().cpu().numpy()

# Calculate metrics
accuracy = accuracy_score(all_labels, binary_preds)
recall_at_5 = recall_score(all_labels, binary_preds, average='samples')
ndcg_at_5 = ndcg_score(all_labels, binary_preds, k=5)

print(f'Accuracy: {accuracy}')
print(f'Recall@5: {recall_at_5}')
print(f'nDCG@5: {ndcg_at_5}')


NameError: name 'your_skills_data' is not defined

In [14]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings
data['role_description'].fillna('', inplace=True)

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Filter out outliers and delete them
outlier_classes = class_counts[class_counts < 2].index
data_filtered['skills'] = data_filtered['skills'].apply(lambda x: [skill for skill in x if skill not in outlier_classes])

# Reset index after deleting rows
data_filtered.reset_index(drop=True, inplace=True)

# Convert filtered data to features and labels
X_filtered = data_filtered['cleaned_text']
y_filtered = data_filtered.drop(columns=['cleaned_text'])

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    print("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")
else:
    # Print unique classes and their counts
    print("Unique Classes (Filtered):", unique_classes_filtered)
    print("Counts per Class (Filtered):")
    print(data_filtered['skills'].explode().value_counts())

    # Transform skills into binary labels
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(data_filtered['skills'])

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_features=1000, lowercase=False)
    X = vectorizer.fit_transform(data_filtered['cleaned_text'])

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    # Train a multi-label classifier on filtered data
    model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
    model_filtered.fit(X_train, y_train)

    # Predict on the test set
    y_pred_filtered = model_filtered.predict(X_test)

    # Evaluate the model
    f1 = f1_score(y_test, y_pred_filtered, average='micro')
    hamming = hamming_loss(y_test, y_pred_filtered)
    accuracy = accuracy_score(y_test, y_pred_filtered)

    print("F1 Score (Micro):", f1)
    print("Hamming Loss:", hamming)
    print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['hr' 'ai' 'r' 'recruitment' 'marketing' 'social media' 'content creation'
 'excel' 'sales' 'product management' 'research' 'collaboration'
 'market research' 'proto' 'operations' 'product marketing'
 'project management' 'business development' 'lead generation' 'seo'
 'communication' 'aws' 'logistics' 'sql' 'rest' 'problem solving'
 'data visualization' 'python' 'react' 'tableau' 'power bi' 'java'
 'javascript' 'b2b' 'machine learning' 'agile' 'content strategy'
 'organizational skills']
Counts per Class (Filtered):
r                        147
ai                       110
research                  44
marketing                 42
sales                     31
excel                     27
communication             27
operations                23
hr                        19
market research           19
social media              17
collaboration             16
rest                      15
sql                       14
business development      10
aws            

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [15]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings in the relevant column
data['role_description'].fillna('', inplace=True)

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)  # Join tokens back into a single string

# Apply preprocessing to the role_description column
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Define the skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction to the cleaned_text column
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class (skill)
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Filter out outliers and delete them
outlier_classes = class_counts[class_counts < 2].index
data_filtered['skills'] = data_filtered['skills'].apply(lambda x: [skill for skill in x if skill not in outlier_classes])

# Reset index after deleting rows
data_filtered.reset_index(drop=True, inplace=True)

# Convert filtered data to features and labels
X_filtered = data_filtered['cleaned_text']
y_filtered = data_filtered.drop(columns=['cleaned_text'])

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    print("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")
else:
    # Print unique classes and their counts
    print("Unique Classes (Filtered):", unique_classes_filtered)
    print("Counts per Class (Filtered):")
    print(data_filtered['skills'].explode().value_counts())

    # Transform skills into binary labels
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(data_filtered['skills'])

    # TF-IDF Vectorization with bigrams
    vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2), lowercase=False)
    X = vectorizer.fit_transform(data_filtered['cleaned_text'])

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    # Train a multi-label classifier on filtered data
    model_filtered = MultiOutputClassifier(LogisticRegression(max_iter=1000))
    model_filtered.fit(X_train, y_train)

    # Predict on the test set
    y_pred_filtered = model_filtered.predict(X_test)

    # Evaluate the model
    f1 = f1_score(y_test, y_pred_filtered, average='micro')
    hamming = hamming_loss(y_test, y_pred_filtered)
    accuracy = accuracy_score(y_test, y_pred_filtered)

    print("F1 Score (Micro):", f1)
    print("Hamming Loss:", hamming)
    print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['hr' 'ai' 'r' 'recruitment' 'marketing' 'social media' 'content creation'
 'excel' 'sales' 'product management' 'research' 'collaboration'
 'market research' 'proto' 'operations' 'product marketing'
 'project management' 'business development' 'lead generation' 'seo'
 'communication' 'aws' 'logistics' 'sql' 'rest' 'problem solving'
 'data visualization' 'python' 'react' 'tableau' 'power bi' 'java'
 'javascript' 'b2b' 'machine learning' 'agile' 'content strategy'
 'organizational skills']
Counts per Class (Filtered):
r                        147
ai                       110
research                  44
marketing                 42
sales                     31
excel                     27
communication             27
operations                23
hr                        19
market research           19
social media              17
collaboration             16
rest                      15
sql                       14
business development      10
aws            

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [16]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings in the relevant column
data['role_description'].fillna('', inplace=True)

# Function for text cleaning
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)  # Join tokens back into a single string

# Apply preprocessing to the role_description column
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Define the skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction to the cleaned_text column
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class (skill)
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Filter out outliers and delete them
outlier_classes = class_counts[class_counts < 2].index
data_filtered['skills'] = data_filtered['skills'].apply(lambda x: [skill for skill in x if skill not in outlier_classes])

# Reset index after deleting rows
data_filtered.reset_index(drop=True, inplace=True)

# Convert filtered data to features and labels
X_filtered = data_filtered['cleaned_text']
y_filtered = data_filtered.drop(columns=['cleaned_text'])

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    print("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")
else:
    # Print unique classes and their counts
    print("Unique Classes (Filtered):", unique_classes_filtered)
    print("Counts per Class (Filtered):")
    print(data_filtered['skills'].explode().value_counts())

    # Transform skills into binary labels
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(data_filtered['skills'])

    # TF-IDF Vectorization with bigrams
    vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2), lowercase=False)
    X = vectorizer.fit_transform(data_filtered['cleaned_text'])

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    # Train a multi-label classifier on filtered data using RandomForestClassifier
    model_filtered = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
    model_filtered.fit(X_train, y_train)

    # Predict on the test set
    y_pred_filtered = model_filtered.predict(X_test)

    # Evaluate the model
    f1 = f1_score(y_test, y_pred_filtered, average='micro')
    hamming = hamming_loss(y_test, y_pred_filtered)
    accuracy = accuracy_score(y_test, y_pred_filtered)

    print("F1 Score (Micro):", f1)
    print("Hamming Loss:", hamming)
    print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique Classes (Filtered): ['hr' 'ai' 'r' 'recruitment' 'marketing' 'social media' 'content creation'
 'excel' 'sales' 'product management' 'research' 'collaboration'
 'market research' 'proto' 'operations' 'product marketing'
 'project management' 'business development' 'lead generation' 'seo'
 'communication' 'aws' 'logistics' 'sql' 'rest' 'problem solving'
 'data visualization' 'python' 'react' 'tableau' 'power bi' 'java'
 'javascript' 'b2b' 'machine learning' 'agile' 'content strategy'
 'organizational skills']
Counts per Class (Filtered):
r                        147
ai                       110
research                  44
marketing                 42
sales                     31
excel                     27
communication             27
operations                23
hr                        19
market research           19
social media              17
collaboration             16
rest                      15
sql                       14
business development      10
aws            

In [17]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings in the relevant column
data['role_description'].fillna('', inplace=True)

# Function for text cleaning with lemmatization
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]  # Lemmatize and remove stopwords
    return ' '.join(tokens)  # Join tokens back into a single string

# Apply preprocessing to the role_description column
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Define the skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction to the cleaned_text column
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class (skill)
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Filter out outliers and delete them
outlier_classes = class_counts[class_counts < 2].index
data_filtered['skills'] = data_filtered['skills'].apply(lambda x: [skill for skill in x if skill not in outlier_classes])

# Reset index after deleting rows
data_filtered.reset_index(drop=True, inplace=True)

# Convert filtered data to features and labels
X_filtered = data_filtered['cleaned_text']
y_filtered = data_filtered.drop(columns=['cleaned_text'])

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    print("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")
else:
    # Print unique classes and their counts
    print("Unique Classes (Filtered):", unique_classes_filtered)
    print("Counts per Class (Filtered):")
    print(data_filtered['skills'].explode().value_counts())

    # Transform skills into binary labels
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(data_filtered['skills'])

    # TF-IDF Vectorization with trigrams
    vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 3), lowercase=False)
    X = vectorizer.fit_transform(data_filtered['cleaned_text'])

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    # Define the model with RandomForestClassifier
    rf = RandomForestClassifier()

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Perform grid search
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # Best model
    best_rf = grid_search.best_estimator_

    # Train a multi-label classifier on filtered data using the best RandomForestClassifier
    model_filtered = MultiOutputClassifier(best_rf)
    model_filtered.fit(X_train, y_train)

    # Predict on the test set
    y_pred_filtered = model_filtered.predict(X_test)

    # Evaluate the model
    f1 = f1_score(y_test, y_pred_filtered, average='micro')
    hamming = hamming_loss(y_test, y_pred_filtered)
    accuracy = accuracy_score(y_test, y_pred_filtered)

    print("F1 Score (Micro):", f1)
    print("Hamming Loss:", hamming)
    print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unique Classes (Filtered): ['hr' 'ai' 'r' 'recruitment' 'marketing' 'content creation' 'excel'
 'product management' 'research' 'collaboration' 'market research' 'proto'
 'product marketing' 'project management' 'business development'
 'lead generation' 'seo' 'communication' 'aws' 'logistics' 'sql' 'rest'
 'problem solving' 'data visualization' 'python' 'react' 'tableau'
 'power bi' 'java' 'javascript' 'b2b' 'machine learning' 'agile'
 'content strategy']
Counts per Class (Filtered):
r                       146
ai                      110
research                 44
marketing                42
communication            27
excel                    27
market research          19
hr                       19
collaboration            16
rest                     15
sql                      14
business development     10
aws                       8
project management        8
python                    7
content creation          6
recruitment               6
react                     5
product

In [18]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
from transformers import BertTokenizer, BertModel
import torch

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings in the relevant column
data['role_description'].fillna('', inplace=True)

# Function for text cleaning with stemming and lemmatization
def preprocess_text(text):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word not in stopwords.words('english')]  # Stem and lemmatize, remove stopwords
    return ' '.join(tokens)  # Join tokens back into a single string

# Apply preprocessing to the role_description column
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Define the skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction to the cleaned_text column
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class (skill)
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Filter out outliers and delete them
outlier_classes = class_counts[class_counts < 2].index
data_filtered['skills'] = data_filtered['skills'].apply(lambda x: [skill for skill in x if skill not in outlier_classes])

# Reset index after deleting rows
data_filtered.reset_index(drop=True, inplace=True)

# Convert filtered data to features and labels
X_filtered = data_filtered['cleaned_text']
y_filtered = data_filtered.drop(columns=['cleaned_text'])

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    print("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")
else:
    # Print unique classes and their counts
    print("Unique Classes (Filtered):", unique_classes_filtered)
    print("Counts per Class (Filtered):")
    print(data_filtered['skills'].explode().value_counts())

    # Transform skills into binary labels
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(data_filtered['skills'])

    # TF-IDF Vectorization with trigrams
    vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 3), lowercase=False)
    X_tfidf = vectorizer.fit_transform(data_filtered['cleaned_text'])

    # Load BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')

    # Function to extract BERT embeddings
    def get_bert_embeddings(text):
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        outputs = bert_model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
        return embeddings

    # Apply BERT embeddings to the cleaned_text column
    X_bert = np.array([get_bert_embeddings(text) for text in data_filtered['cleaned_text']])
    X_bert = np.squeeze(X_bert)

    # Combine TF-IDF and BERT embeddings
    X_combined = np.hstack((X_tfidf.toarray(), X_bert))

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_combined, labels, test_size=0.2, random_state=42)

    # Define the model with RandomForestClassifier
    rf = RandomForestClassifier()

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Perform grid search
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # Best model
    best_rf = grid_search.best_estimator_

    # Train a multi-label classifier on filtered data using the best RandomForestClassifier
    model_rf = MultiOutputClassifier(best_rf)
    model_rf.fit(X_train, y_train)

    # Predict on the test set using RandomForestClassifier
    y_pred_rf = model_rf.predict(X_test)

    # Evaluate the RandomForestClassifier model
    f1_rf = f1_score(y_test, y_pred_rf, average='micro')
    hamming_rf = hamming_loss(y_test, y_pred_rf)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)

    print("RandomForestClassifier")
    print("F1 Score (Micro):", f1_rf)
    print("Hamming Loss:", hamming_rf)
    print("Accuracy Score:", accuracy_rf)

    # Train a multi-label classifier on filtered data using XGBoost
    xgb = XGBClassifier(objective='multi:softprob', eval_metric='mlogloss')
    model_xgb = MultiOutputClassifier(xgb)
    model_xgb.fit(X_train, y_train)

    # Predict on the test set using XGBoost
    y_pred_xgb = model_xgb.predict(X_test)

    # Evaluate the XGBoost model
    f1_xgb = f1_score(y_test, y_pred_xgb, average='micro')
    hamming_xgb = hamming_loss(y_test, y_pred_xgb)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

    print("XGBoost")
    print("F1 Score (Micro):", f1_xgb)
    print("Hamming Loss:", hamming_xgb)
    print("Accuracy Score:", accuracy_xgb)


ModuleNotFoundError: No module named 'xgboost'

In [20]:
pip install xgboost




Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB 991.0 kB/s eta 0:01:41
   ---------------------------------------- 0.1/99.8 MB 787.7 kB/s eta 0:02:07
   ---------------------------------------- 0.1/99.8 MB 939.4 kB/s eta 0:01:47
   ---------------------------------------- 0.2/99.8 MB 706.2 kB/s eta 0:02:22
   ---------------------------------------- 0.2/99.8 MB 737.3 kB/s eta 0:02:16
   ---------------------------------------- 0.2/99.8 MB 737.3 kB/s eta 0:02:16
   ---------------------------------------- 0.3/99.8 MB 787.7 kB/s eta 0:02:07
   ---------------------------------------- 0.3/99.8 MB 776.2 kB/s eta 0:02:09
   --------

In [22]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
from transformers import BertTokenizer, BertModel
import torch

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Fill NaN values with empty strings in the relevant column
data['role_description'].fillna('', inplace=True)

# Function for text cleaning with stemming and lemmatization
def preprocess_text(text):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word not in stopwords.words('english')]  # Stem and lemmatize, remove stopwords
    return ' '.join(tokens)  # Join tokens back into a single string

# Apply preprocessing to the role_description column
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Define the skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction to the cleaned_text column
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Count the number of samples per class (skill)
class_counts = data['skills'].explode().value_counts()

# Filter classes with at least two samples
valid_classes = class_counts[class_counts >= 2].index

# Filter data to keep only samples with valid classes
data_filtered = data[data['skills'].apply(lambda x: any(skill in valid_classes for skill in x))]

# Remove samples belonging to classes with only one sample
data_filtered = data_filtered[data_filtered['skills'].apply(lambda x: len(x) > 1)]

# Filter out outliers and delete them
outlier_classes = class_counts[class_counts < 2].index
data_filtered['skills'] = data_filtered['skills'].apply(lambda x: [skill for skill in x if skill not in outlier_classes])

# Reset index after deleting rows
data_filtered.reset_index(drop=True, inplace=True)

# Convert filtered data to features and labels
X_filtered = data_filtered['cleaned_text']
y_filtered = data_filtered.drop(columns=['cleaned_text'])

# Ensure there are at least two unique classes present after filtering again
unique_classes_filtered = data_filtered['skills'].explode().unique()
if len(unique_classes_filtered) < 2:
    print("There are not enough unique classes present in the filtered data after removing classes with fewer than two samples.")
else:
    # Print unique classes and their counts
    print("Unique Classes (Filtered):", unique_classes_filtered)
    print("Counts per Class (Filtered):")
    print(data_filtered['skills'].explode().value_counts())

    # Transform skills into binary labels
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(data_filtered['skills'])

    # TF-IDF Vectorization with trigrams
    vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 3), lowercase=False)
    X_tfidf = vectorizer.fit_transform(data_filtered['cleaned_text'])

    # Load BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')

    # Function to extract BERT embeddings
    def get_bert_embeddings(text):
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        outputs = bert_model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
        return embeddings

    # Apply BERT embeddings to the cleaned_text column
    X_bert = np.array([get_bert_embeddings(text) for text in data_filtered['cleaned_text']])
    X_bert = np.squeeze(X_bert)

    # Combine TF-IDF and BERT embeddings
    X_combined = np.hstack((X_tfidf.toarray(), X_bert))

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_combined, labels, test_size=0.2, random_state=42)

    # Define the model with RandomForestClassifier
    rf = RandomForestClassifier()

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Perform grid search
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # Best model
    best_rf = grid_search.best_estimator_

    # Train a multi-label classifier on filtered data using the best RandomForestClassifier
    model_rf = MultiOutputClassifier(best_rf)
    model_rf.fit(X_train, y_train)

    # Predict on the test set using RandomForestClassifier
    y_pred_rf = model_rf.predict(X_test)

    # Evaluate the RandomForestClassifier model
    f1_rf = f1_score(y_test, y_pred_rf, average='micro')
    hamming_rf = hamming_loss(y_test, y_pred_rf)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)

    print("RandomForestClassifier")
    print("F1 Score (Micro):", f1_rf)
    print("Hamming Loss:", hamming_rf)
    print("Accuracy Score:", accuracy_rf)

    # Train a multi-label classifier on filtered data using XGBoost
    xgb = XGBClassifier(objective='multi:softprob', eval_metric='mlogloss')
    model_xgb = MultiOutputClassifier(xgb)
    model_xgb.fit(X_train, y_train)

    # Predict on the test set using XGBoost
    y_pred_xgb = model_xgb.predict(X_test)

    # Evaluate the XGBoost model
    f1_xgb = f1_score(y_test, y_pred_xgb, average='micro')
    hamming_xgb = hamming_loss(y_test, y_pred_xgb)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

    print("XGBoost")
    print("F1 Score (Micro):", f1_xgb)
    print("Hamming Loss:", hamming_xgb)
    print("Accuracy Score:", accuracy_xgb)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unique Classes (Filtered): ['hr' 'ai' 'r' 'content creation' 'excel' 'research' 'market research'
 'proto' 'seo' 'aws' 'sql' 'rest' 'python' 'react' 'tableau' 'power bi'
 'java' 'javascript' 'b2b']
Counts per Class (Filtered):
r                   136
ai                  110
research             44
excel                27
hr                   19
market research      19
rest                 15
sql                  14
python                7
aws                   7
content creation      6
proto                 5
react                 5
java                  4
javascript            4
b2b                   3
seo                   2
tableau               2
power bi              2
Name: skills, dtype: int64


model.safetensors:   2%|2         | 10.5M/440M [00:00<?, ?B/s]

Fitting 3 folds for each of 108 candidates, totalling 324 fits
RandomForestClassifier
F1 Score (Micro): 0.7283950617283951
Hamming Loss: 0.08270676691729323
Accuracy Score: 0.25


XGBoostError: value 0 for Parameter num_class should be greater equal to 1
num_class: Number of output class in the multi-class classification.

In [24]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
from transformers import BertTokenizer, BertModel
import torch

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Function for text cleaning with stemming and lemmatization
def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
        text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
        text = text.lower()  # Convert to lowercase
        tokens = word_tokenize(text)  # Tokenize the text
        tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word not in stopwords.words('english')]  # Stem and lemmatize, remove stopwords
        return ' '.join(tokens)  # Join tokens back into a single string
    else:
        return ""  # Return an empty string if the input is not a string

# Apply preprocessing to the role_description column
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Define the skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction to the cleaned_text column
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Convert skills into binary labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data['skills'])

# TF-IDF Vectorization with trigrams
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 3), lowercase=False)
X_tfidf = vectorizer.fit_transform(data['cleaned_text'])

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Function to extract BERT embeddings
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings

# Apply BERT embeddings to the cleaned_text column
X_bert = np.array([get_bert_embeddings(text) for text in data['cleaned_text']])
X_bert = np.squeeze(X_bert)

# Combine TF-IDF and BERT embeddings
X_combined = np.hstack((X_tfidf.toarray(), X_bert))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, labels, test_size=0.2, random_state=42)

# Define the model with RandomForestClassifier
rf = RandomForestClassifier()

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model
best_rf = grid_search.best_estimator_

# Train a multi-label classifier on filtered data using the best RandomForestClassifier
model_rf = MultiOutputClassifier(best_rf)
model_rf.fit(X_train, y_train)

# Predict on the test set using RandomForestClassifier
y_pred_rf = model_rf.predict(X_test)

# Evaluate the RandomForestClassifier model
f1_rf = f1_score(y_test, y_pred_rf, average='micro')
hamming_rf = hamming_loss(y_test, y_pred_rf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print("RandomForestClassifier")
print("F1 Score (Micro):", f1_rf)
print("Hamming Loss:", hamming_rf)
print("Accuracy Score:", accuracy_rf)

# Train a multi-label classifier on filtered data using XGBoost
xgb = XGBClassifier(objective='binary:logistic')
model_xgb = MultiOutputClassifier(xgb)
model_xgb.fit(X_train, y_train)

# Predict on the test set using XGBoost
y_pred_xgb = model_xgb.predict(X_test)

# Evaluate the XGBoost model
f1_xgb = f1_score(y_test, y_pred_xgb, average='micro')
hamming_xgb = hamming_loss(y_test, y_pred_xgb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print("XGBoost")
print("F1 Score (Micro):", f1_xgb)
print("Hamming Loss:", hamming_xgb)
print("Accuracy Score:", accuracy_xgb)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 3 folds for each of 108 candidates, totalling 324 fits
RandomForestClassifier
F1 Score (Micro): 0.804733727810651
Hamming Loss: 0.040993788819875775
Accuracy Score: 0.4
XGBoost
F1 Score (Micro): 0.9060773480662984
Hamming Loss: 0.02111801242236025
Accuracy Score: 0.6285714285714286


In [25]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Function for text cleaning with stemming and lemmatization
def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
        text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
        text = text.lower()  # Convert to lowercase
        tokens = word_tokenize(text)  # Tokenize the text
        tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word not in stopwords.words('english')]  # Stem and lemmatize, remove stopwords
        return ' '.join(tokens)  # Join tokens back into a single string
    else:
        return ""  # Return an empty string if the input is not a string

# Apply preprocessing to the role_description column
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Define the skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction to the cleaned_text column
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Convert skills into binary labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data['skills'])

# TF-IDF Vectorization with trigrams
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 3), lowercase=False)
X_tfidf = vectorizer.fit_transform(data['cleaned_text'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=42)

# Define the XGBoost classifier
xgb = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3]
}

# Perform grid search with stratified k-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model
best_xgb = grid_search.best_estimator_

# Train the best XGBoost model
best_xgb.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = best_xgb.predict(X_test)

# Evaluate the XGBoost model
f1_xgb = f1_score(y_test, y_pred_xgb, average='micro')
hamming_xgb = hamming_loss(y_test, y_pred_xgb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print("XGBoost (with Grid Search and Stratified K-Fold CV)")
print("F1 Score (Micro):", f1_xgb)
print("Hamming Loss:", hamming_xgb)
print("Accuracy Score:", accuracy_xgb)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 5 folds for each of 27 candidates, totalling 135 fits


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.

In [26]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
from xgboost import XGBClassifier

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Function for text cleaning with stemming and lemmatization
def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
        text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
        text = text.lower()  # Convert to lowercase
        tokens = word_tokenize(text)  # Tokenize the text
        tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word not in stopwords.words('english')]  # Stem and lemmatize, remove stopwords
        return ' '.join(tokens)  # Join tokens back into a single string
    else:
        return ""  # Return an empty string if the input is not a string

# Apply preprocessing to the role_description column
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Define the skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction to the cleaned_text column
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Convert skills into binary labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data['skills'])

# TF-IDF Vectorization with trigrams
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 3), lowercase=False)
X_tfidf = vectorizer.fit_transform(data['cleaned_text'])

# Convert multilabel indicator matrix to separate binary classification tasks
binary_labels = np.zeros_like(labels)

for i in range(labels.shape[1]):
    col_labels = labels[:, i]
    unique_labels = np.unique(col_labels)
    if len(unique_labels) > 1:  # Skip columns with only one class
        binary_labels[:, i] = col_labels

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, binary_labels, test_size=0.2, random_state=42)

# Define the XGBoost classifier for binary classification
xgb = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3]
}

# Perform grid search with stratified k-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model for binary classification
best_xgb = grid_search.best_estimator_

# Train the best XGBoost model for binary classification
best_xgb.fit(X_train, y_train)

# Predict probabilities for each label for binary classification
y_pred_proba = best_xgb.predict_proba(X_test)

# Convert probabilities to binary predictions
y_pred = (y_pred_proba > 0.5).astype(int)

# Evaluate the binary XGBoost model
f1_micro = f1_score(y_test, y_pred, average='micro')
hamming_loss_score = hamming_loss(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Binary XGBoost (with Grid Search and Stratified K-Fold CV)")
print("F1 Score (Micro):", f1_micro)
print("Hamming Loss:", hamming_loss_score)
print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 5 folds for each of 27 candidates, totalling 135 fits


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.

In [27]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
from xgboost import XGBClassifier

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Function for text cleaning with stemming and lemmatization
def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
        text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
        text = text.lower()  # Convert to lowercase
        tokens = word_tokenize(text)  # Tokenize the text
        tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word not in stopwords.words('english')]  # Stem and lemmatize, remove stopwords
        return ' '.join(tokens)  # Join tokens back into a single string
    else:
        return ""  # Return an empty string if the input is not a string

# Apply preprocessing to the role_description column
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Define the skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction to the cleaned_text column
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Convert skills into binary labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data['skills'])

# TF-IDF Vectorization with trigrams
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 3), lowercase=False)
X_tfidf = vectorizer.fit_transform(data['cleaned_text'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=42)

# Define the XGBoost classifier
xgb = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)

# Define the MultiOutputClassifier
multi_target_xgb = MultiOutputClassifier(xgb, n_jobs=-1)

# Fit the classifier
multi_target_xgb.fit(X_train, y_train)

# Predict the labels
y_pred = multi_target_xgb.predict(X_test)

# Evaluate the model
f1_micro = f1_score(y_test, y_pred, average='micro')
hamming_loss_score = hamming_loss(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("XGBoost Multi-Label Classification")
print("F1 Score (Micro):", f1_micro)
print("Hamming Loss:", hamming_loss_score)
print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


XGBoost Multi-Label Classification
F1 Score (Micro): 0.88268156424581
Hamming Loss: 0.02608695652173913
Accuracy Score: 0.5142857142857142


In [28]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Function for text cleaning with stemming and lemmatization
def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
        text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
        text = text.lower()  # Convert to lowercase
        tokens = word_tokenize(text)  # Tokenize the text
        tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word not in stopwords.words('english')]  # Stem and lemmatize, remove stopwords
        return ' '.join(tokens)  # Join tokens back into a single string
    else:
        return ""  # Return an empty string if the input is not a string

# Apply preprocessing to the role_description column
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Define the skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction to the cleaned_text column
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Convert skills into binary labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data['skills'])

# TF-IDF Vectorization with trigrams
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 3), lowercase=False)
X_tfidf = vectorizer.fit_transform(data['cleaned_text'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=42)

# Define the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Define the MultiOutputClassifier
multi_target_rf = MultiOutputClassifier(rf, n_jobs=-1)

# Fit the classifier
multi_target_rf.fit(X_train, y_train)

# Predict the labels
y_pred = multi_target_rf.predict(X_test)

# Evaluate the model
f1_micro = f1_score(y_test, y_pred, average='micro')
hamming_loss_score = hamming_loss(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Random Forest Multi-Label Classification")
print("F1 Score (Micro):", f1_micro)
print("Hamming Loss:", hamming_loss_score)
print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Random Forest Multi-Label Classification
F1 Score (Micro): 0.783132530120482
Hamming Loss: 0.04472049689440994
Accuracy Score: 0.4


In [29]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

# Function for text cleaning with stemming and lemmatization
def preprocess_text(text):
    if isinstance(text, str):  # Check if the input is a string
        stemmer = PorterStemmer()
        lemmatizer = WordNetLemmatizer()
        text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
        text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with single space
        text = text.lower()  # Convert to lowercase
        tokens = word_tokenize(text)  # Tokenize the text
        tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word not in stopwords.words('english')]  # Stem and lemmatize, remove stopwords
        return ' '.join(tokens)  # Join tokens back into a single string
    else:
        return ""  # Return an empty string if the input is not a string

# Apply preprocessing to the role_description column
data['cleaned_text'] = data['role_description'].apply(preprocess_text)

# Define the skills list
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]

# Function to extract skills from job descriptions
def extract_skills(text, skills_list):
    text = text.lower()
    extracted_skills = [skill for skill in skills_list if skill in text]
    return extracted_skills

# Apply skills extraction to the cleaned_text column
data['skills'] = data['cleaned_text'].apply(lambda x: extract_skills(x, skills_list))

# Convert skills into binary labels
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data['skills'])

# TF-IDF Vectorization with trigrams
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 3), lowercase=False)
X_tfidf = vectorizer.fit_transform(data['cleaned_text'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=42)

# Define the Gradient Boosting classifier
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Define the MultiOutputClassifier
multi_target_gb = MultiOutputClassifier(gb, n_jobs=-1)

# Fit the classifier
multi_target_gb.fit(X_train, y_train)

# Predict the labels
y_pred = multi_target_gb.predict(X_test)

# Evaluate the model
f1_micro = f1_score(y_test, y_pred, average='micro')
hamming_loss_score = hamming_loss(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Gradient Boosting Multi-Label Classification")
print("F1 Score (Micro):", f1_micro)
print("Hamming Loss:", hamming_loss_score)
print("Accuracy Score:", accuracy)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ValueError: y contains 1 class after sample_weight trimmed classes with zero weights, while a minimum of 2 classes are required.