In [1]:
import pandas as pd
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")


In [2]:
job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}



In [3]:
skills_list = [
    "python", "java", "kotlin", "jetpack compose", "android sdk", "firebase",
    "rest", "json", "proto", "sql", "javascript", "cloud computing", "aws",
    "excel", "data visualization", "react", "node.js", "marketing", "social media",
    "seo", "content creation", "product management", "sales", "business development",
    "hr", "research", "operations", "analytical skills", "problem solving",
    "communication", "collaboration", "organizational skills", "multitasking",
    "microsoft office", "ai", "machine learning", "big data", "deep learning",
    "neural networks", "statistical analysis", "pandas", "numpy", "scikit-learn",
    "tensorflow", "keras", "r", "sas", "sql", "tableau", "power bi",
    "lead generation", "b2b", "b2c", "market research", "product marketing",
    "email marketing", "content strategy", "creative writing", "employee engagement",
    "talent management", "recruitment", "project management", "agile", "scrum",
    "supply chain management", "logistics", "procurement", "inventory management"
]


In [4]:

text_fields = ["role_description", "requirement", "description"]


In [5]:

if len(text_fields) > 1:
    data["combined_text"] = data[text_fields].fillna("").apply(lambda x: " ".join(x), axis=1)
    text_field = "combined_text"  # Use the combined field for further processing
else:
    text_field = text_fields[0]  # Use the single chosen field


In [6]:
import nltk
nltk.download('stopwords')

def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [word for word in text.split() if word not in stopwords]
    return " ".join(words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
data["cleaned_text"] = data[text_field].apply(clean_text)  # Apply cleaning (optional)


In [8]:
# function to filter keywords based on the skill list
def filter_keywords(text):
    tokens = text.split()
    return [word.lower() for word in tokens if word.lower() in skills_list]

# Extract keywords
data["keywords"] = data["cleaned_text"].apply(filter_keywords)


In [9]:
# Function to create one-hot encoded feature vectors
def create_one_hot_vector(keywords, vocabulary):
    vector = [0] * len(vocabulary)
    for keyword in keywords:
        if keyword in vocabulary:
            vector[vocabulary.index(keyword)] = 1
    return vector

# Create vocabulary
vocabulary = list(set([word for skill in skills_list for word in skill.split(" ")]))

# Create one-hot encoded feature vectors
data["features"] = data["keywords"].apply(lambda x: create_one_hot_vector(x, vocabulary))


In [10]:
def assign_domain(job_title):
    for domain, titles in job_domains.items():
        if job_title in titles:
            return domain
    return "Other"

# Assign domains to each job
data["domain"] = data["role_title"].apply(assign_domain)


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

feature_df = pd.DataFrame(data["features"].to_list())

X_train, X_test, y_train, y_test = train_test_split(feature_df, data["domain"], test_size=0.2, random_state=42)

# Classification (Naive Bayes example)
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print(confusion_matrix(y_test, predictions))


Accuracy: 1.0
[[35]]


In [12]:
# Classify new job details (example)
new_text = """Human Resource Intern Currently pursuing a degree in Human Resources Business Administration or a related field Strong interpersonal and communication skills Ability to maintain confidentiality and handle sensitive information with discretion Detail oriented with excellent organizational and multitasking abilities Proficient in Microsoft Office Suite Word Excel PowerPoint Enthusiastic and eager to learn about various aspects of human resources Currently pursuing a degree in Human Resources Business Administration or a related field Strong interpersonal and communication skills Ability to maintain confidentiality and handle sensitive information with discretion Detail oriented with excellent organizational and multitasking abilities Proficient in Microsoft Office Suite Word Excel PowerPoint Enthusiastic and eager to learn about various aspects of human resources Internship Job Description Human Resource Intern Internship Job Description Human Resource Intern About the Role About the Role As a Human Resource Intern at Nexus Grove you will have the unique opportunity to gain hands on experience in the dynamic field of human resources You will work closely with our HR team to support various aspects of the recruitment and employee lifecycle processes This internship is designed to provide exposure to the diverse facets of HR allowing you to develop key skills and knowledge that will lay a solid foundation for a successful career in human resources Key Responsibilities Key Responsibilities Assist in the recruitment process by sourcing and screening candidates conducting initial interviews and coordinating interview schedules Support the onboarding process for new hires ensuring a smooth transition into the organization Maintain accurate and up to date employee records including documentation related to personnel changes promotions and terminations Collaborate with the HR team to create and update HR policies and procedures Handle administrative tasks such as preparing HR related documents managing calendars and responding to internal and external inquiries Assist in the recruitment process by sourcing and screening candidates conducting initial interviews and coordinating interview schedules Support the onboarding process for new hires ensuring a smooth transition into the organization Maintain accurate and up to date employee records including documentation related to personnel changes promotions and terminations Collaborate with the HR team to create and update HR policies and procedures Handle administrative tasks such as preparing HR related documents managing calendars and responding to internal and external inquiries Benefits Benefits Gain practical experience in a professional HR environment Mentorship opportunities with seasoned HR professionals Exposure to diverse HR functions providing a comprehensive understanding of the field Networking opportunities within the HR industry Gain practical experience in a professional HR environment Mentorship opportunities with seasoned HR professionals Exposure to diverse HR functions providing a comprehensive understanding of the field Networking opportunities within the HR industry.
"""
new_vector = create_one_hot_vector(filter_keywords(clean_text(new_text)), vocabulary)
predicted_domain = classifier.predict([new_vector])[0]
print("Predicted domain for new job:", predicted_domain)


Predicted domain for new job: Other


In [13]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv") # Replace "your_dataset.csv" with the path to your CSV file

text_field = "role_description"  # Assuming "role_description" contains the job descriptions

# Download NLTK stopwords corpus
nltk.download('stopwords')

def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [word for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(clean_text)

# Define job domains
job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

# Assign domains based on job descriptions and keywords
def assign_domain(text):
    text = text.lower()
    for domain, keywords in job_domains.items():
        if any(keyword in text for keyword in keywords):
            return domain
    return "Other"

# Assign domains based on job descriptions
data["domain"] = data["cleaned_text"].apply(assign_domain)

X_train, X_test, y_train, y_test = train_test_split(data["cleaned_text"], data["domain"], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)

predictions = classifier.predict(X_test_vec)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print(confusion_matrix(y_test, predictions))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: 'float' object has no attribute 'lower'

In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv") # Replace "your_dataset.csv" with the path to your CSV file

text_field = "role_description"  # Assuming "role_description" contains the job descriptions

nltk.download('stopwords')

def clean_text(text):
    if pd.isnull(text):  # Check if the text is NaN
        return ""
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [word for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(clean_text)

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}


def assign_domain(text):
    text = text.lower()
    for domain, keywords in job_domains.items():
        if any(keyword in text for keyword in keywords):
            return domain
    return "Other"

data["domain"] = data["cleaned_text"].apply(assign_domain)

X_train, X_test, y_train, y_test = train_test_split(data["cleaned_text"], data["domain"], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)

predictions = classifier.predict(X_test_vec)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print(confusion_matrix(y_test, predictions))


In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv") # Replace "your_dataset.csv" with the path to your CSV file

text_field = "role_description"  # Assuming "role_description" contains the job descriptions

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Initialize PorterStemmer for stemming
stemmer = PorterStemmer()

# function for text cleaning with stemming and modified stopwords handling
def clean_text(text):
    if pd.isnull(text):  # Check if the text is NaN
        return ""
    text = text.lower()
    words = nltk.word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(clean_text)

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

def assign_domain(text):
    text = text.lower()
    for domain, keywords in job_domains.items():
        if any(keyword in text for keyword in keywords):
            return domain
    return "Other"

data["domain"] = data["cleaned_text"].apply(assign_domain)

X_train, X_test, y_train, y_test = train_test_split(data["cleaned_text"], data["domain"], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))  # Using unigrams and bigrams
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)

predictions = classifier.predict(X_test_vec)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print(confusion_matrix(y_test, predictions))


In [None]:
import pandas as pd
import nltk
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv") # Replace "your_dataset.csv" with the path to your CSV file

text_field = "role_description"  # Assuming "role_description" contains the job descriptions

nltk.download('punkt')

# Load pre-trained Word2Vec model
word2vec_model = api.load("word2vec-google-news-300")

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}


def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

data["domain"] = data[text_field].apply(assign_domain)

# function to calculate word count and sentence length
def extract_features(text):
    words = nltk.word_tokenize(text)
    word_count = len(words)
    sentence_length = len(nltk.sent_tokenize(text))
    return word_count, sentence_length

# feature extraction
data["word_count"], data["sentence_length"] = zip(*data[text_field].apply(extract_features))

# Concatenate Word2Vec embeddings for each word in the job description
def get_word_embeddings(text):
    words = nltk.word_tokenize(text)
    embeddings = []
    for word in words:
        if word in word2vec_model:
            embeddings.append(word2vec_model[word])
    return embeddings

# Apply Word2Vec embeddings
data["word_embeddings"] = data[text_field].apply(get_word_embeddings)

# Flatten the list of word embeddings into a single feature vector
data["word_embeddings"] = data["word_embeddings"].apply(lambda x: [item for sublist in x for item in sublist])

# Prepare features and target
X = data[["word_count", "sentence_length", "word_embeddings"]]
y = data["domain"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[["word_count", "sentence_length"]])
X_test_scaled = scaler.transform(X_test[["word_count", "sentence_length"]])

# Combine scaled numerical features with Word2Vec embeddings
X_train_final = [list(X_train_scaled[i]) + X_train["word_embeddings"].iloc[i] for i in range(len(X_train_scaled))]
X_test_final = [list(X_test_scaled[i]) + X_test["word_embeddings"].iloc[i] for i in range(len(X_test_scaled))]

classifier = MultinomialNB()
classifier.fit(X_train_final, y_train)

predictions = classifier.predict(X_test_final)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print(confusion_matrix(y_test, predictions))


In [None]:
pip install gensim


In [None]:
import pandas as pd
import nltk
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv") 

text_field = "role_description" 

nltk.download('punkt')

# Load pre-trained Word2Vec model
word2vec_model = api.load("word2vec-google-news-300")

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}


def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

data["domain"] = data[text_field].apply(assign_domain)

def extract_features(text):
    words = nltk.word_tokenize(text)
    word_count = len(words)
    sentence_length = len(nltk.sent_tokenize(text))
    return word_count, sentence_length

data["word_count"], data["sentence_length"] = zip(*data[text_field].apply(extract_features))

# Concatenate Word2Vec embeddings for each word in the job description
def get_word_embeddings(text):
    words = nltk.word_tokenize(text)
    embeddings = []
    for word in words:
        if word in word2vec_model:
            embeddings.append(word2vec_model[word])
    return embeddings

# Apply Word2Vec embeddings
data["word_embeddings"] = data[text_field].apply(get_word_embeddings)

# Flatten the list of word embeddings into a single feature vector
data["word_embeddings"] = data["word_embeddings"].apply(lambda x: [item for sublist in x for item in sublist])

# Prepare features and target
X = data[["word_count", "sentence_length", "word_embeddings"]]
y = data["domain"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[["word_count", "sentence_length"]])
X_test_scaled = scaler.transform(X_test[["word_count", "sentence_length"]])

X_train_final = [list(X_train_scaled[i]) + X_train["word_embeddings"].iloc[i] for i in range(len(X_train_scaled))]
X_test_final = [list(X_test_scaled[i]) + X_test["word_embeddings"].iloc[i] for i in range(len(X_test_scaled))]

classifier = MultinomialNB()
classifier.fit(X_train_final, y_train)

predictions = classifier.predict(X_test_final)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
print(confusion_matrix(y_test, predictions))


In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv") 

text_field = "role_description"

# Drop rows with missing values in the 'role_description' column
data.dropna(subset=[text_field], inplace=True)

nltk.download('punkt')

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

data["domain"] = data[text_field].apply(assign_domain)

def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [word for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(clean_text)

X = data["cleaned_text"]
y = data["domain"]

vectorizer = TfidfVectorizer(max_features=1000)
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

svm_classifier = SVC()
rf_classifier = RandomForestClassifier()
gbm_classifier = GradientBoostingClassifier()

svm_classifier.fit(X_train, y_train)
rf_classifier.fit(X_train, y_train)
gbm_classifier.fit(X_train, y_train)

svm_predictions = svm_classifier.predict(X_test)
rf_predictions = rf_classifier.predict(X_test)
gbm_predictions = gbm_classifier.predict(X_test)

svm_accuracy = accuracy_score(y_test, svm_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)
gbm_accuracy = accuracy_score(y_test, gbm_predictions)

print("Support Vector Machine Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)
print("Gradient Boosting Machine Accuracy:", gbm_accuracy)

print("Confusion Matrix for Support Vector Machine:")
print(confusion_matrix(y_test, svm_predictions))

print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_predictions))

print("Confusion Matrix for Gradient Boosting Machine:")
print(confusion_matrix(y_test, gbm_predictions))


In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import joblib

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv") 

text_field = "role_description"  

data.dropna(subset=[text_field], inplace=True)

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

data["domain"] = data[text_field].apply(assign_domain)

def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(clean_text)

data = data.dropna(subset=['cleaned_text'])

X = data["cleaned_text"]
y = data["domain"]

# Vectorize text data using bi-grams
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier()

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_rf_classifier = grid_search.best_estimator_

rf_predictions = best_rf_classifier.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_predictions))

joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Load the model and vectorizer (for future use)
# best_rf_classifier = joblib.load('best_random_forest_classifier.joblib')
# vectorizer = joblib.load('tfidf_vectorizer.joblib')


In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.stem import WordNetLemmatizer
import joblib

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv") 
text_field = "role_description"  

data.dropna(subset=[text_field], inplace=True)

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

data["domain"] = data[text_field].apply(assign_domain)

def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(x) if isinstance(x, str) else "")

data = data[data["cleaned_text"].str.strip() != ""]

X = data["cleaned_text"]
y = data["domain"]

vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

best_rf_classifier = grid_search_rf.best_estimator_

gbm_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)

rf_predictions = best_rf_classifier.predict(X_test)
gbm_predictions = gbm_classifier.predict(X_test)
svm_predictions = svm_classifier.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_predictions)
gbm_accuracy = accuracy_score(y_test, gbm_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Gradient Boosting Machine Accuracy:", gbm_accuracy)
print("Support Vector Machine Accuracy:", svm_accuracy)

print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_predictions))

print("Confusion Matrix for Gradient Boosting Machine:")
print(confusion_matrix(y_test, gbm_predictions))

print("Confusion Matrix for Support Vector Machine:")
print(confusion_matrix(y_test, svm_predictions))

joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Load the models and vectorizer (for future use)
# best_rf_classifier = joblib.load('best_random_forest_classifier.joblib')
# gbm_classifier = joblib.load('gbm_classifier.joblib')
# svm_classifier = joblib.load('svm_classifier.joblib')
# vectorizer = joblib.load('tfidf_vectorizer.joblib')


In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from nltk.stem import WordNetLemmatizer
import joblib

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")  # Replace with your dataset path

text_field = "role_description"  # Assuming "role_description" contains the job descriptions

data.dropna(subset=[text_field], inplace=True)

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

data["domain"] = data[text_field].apply(assign_domain)

def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(x) if isinstance(x, str) else "")

data = data[data["cleaned_text"].str.strip() != ""]

X = data["cleaned_text"]
y = data["domain"]

vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

# Handling class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

best_rf_classifier = grid_search_rf.best_estimator_

gbm_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)

rf_predictions = best_rf_classifier.predict(X_test)
gbm_predictions = gbm_classifier.predict(X_test)
svm_predictions = svm_classifier.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_predictions)
gbm_accuracy = accuracy_score(y_test, gbm_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Gradient Boosting Machine Accuracy:", gbm_accuracy)
print("Support Vector Machine Accuracy:", svm_accuracy)

print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_predictions))

print("Confusion Matrix for Gradient Boosting Machine:")
print(confusion_matrix(y_test, gbm_predictions))

print("Confusion Matrix for Support Vector Machine:")
print(confusion_matrix(y_test, svm_predictions))

joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Load the models and vectorizer (for future use)
# best_rf_classifier = joblib.load('best_random_forest_classifier.joblib')
# gbm_classifier = joblib.load('gbm_classifier.joblib')
# svm_classifier = joblib.load('svm_classifier.joblib')
# vectorizer = joblib.load('tfidf_vectorizer.joblib')


In [None]:
pip install imbalanced-learn

In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from nltk.stem import WordNetLemmatizer
import joblib

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")  

text_field = "role_description"  

data.dropna(subset=[text_field], inplace=True)

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

data["domain"] = data[text_field].apply(assign_domain)

def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(x) if isinstance(x, str) else "")

data = data[data["cleaned_text"].str.strip() != ""]

X = data["cleaned_text"]
y = data["domain"]

vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

best_rf_classifier = grid_search_rf.best_estimator_

gbm_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)

rf_predictions = best_rf_classifier.predict(X_test)
gbm_predictions = gbm_classifier.predict(X_test)
svm_predictions = svm_classifier.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_predictions)
gbm_accuracy = accuracy_score(y_test, gbm_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Gradient Boosting Machine Accuracy:", gbm_accuracy)
print("Support Vector Machine Accuracy:", svm_accuracy)

print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_predictions))

print("Confusion Matrix for Gradient Boosting Machine:")
print(confusion_matrix(y_test, gbm_predictions))

print("Confusion Matrix for Support Vector Machine:")
print(confusion_matrix(y_test, svm_predictions))

joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Load the models and vectorizer (for future use)
# best_rf_classifier = joblib.load('best_random_forest_classifier.joblib')
# gbm_classifier = joblib.load('gbm_classifier.joblib')
# svm_classifier = joblib.load('svm_classifier.joblib')
# vectorizer = joblib.load('tfidf_vectorizer.joblib')

In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from nltk.stem import WordNetLemmatizer
import joblib

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")  

text_field = "role_description"  

data.dropna(subset=[text_field], inplace=True)

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

data["domain"] = data[text_field].apply(assign_domain)

def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(str(x)))

data = data[data["cleaned_text"].str.strip() != ""]

X = data["cleaned_text"]
y = data["domain"]

vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

smote = SMOTE(random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

best_rf_classifier = grid_search_rf.best_estimator_

gbm_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)

rf_predictions = best_rf_classifier.predict(X_test)
gbm_predictions = gbm_classifier.predict(X_test)
svm_predictions = svm_classifier.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_predictions)
gbm_accuracy = accuracy_score(y_test, gbm_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Gradient Boosting Machine Accuracy:", gbm_accuracy)
print("Support Vector Machine Accuracy:", svm_accuracy)

print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_predictions))

print("Confusion Matrix for Gradient Boosting Machine:")
print(confusion_matrix(y_test, gbm_predictions))

print("Confusion Matrix for Support Vector Machine:")
print(confusion_matrix(y_test, svm_predictions))

joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Load the models and vectorizer (for future use)
# best_rf_classifier = joblib.load('best_random_forest_classifier.joblib')
# gbm_classifier = joblib.load('gbm_classifier.joblib')
# svm_classifier = joblib.load('svm_classifier.joblib')
# vectorizer = joblib.load('tfidf_vectorizer.joblib')


In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer
import joblib

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv") 

text_field = "role_description"  

data.dropna(subset=[text_field], inplace=True)

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

data["domain"] = data[text_field].apply(assign_domain)

def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(str(x)))

data = data[data["cleaned_text"].str.strip() != ""]

X = data["cleaned_text"]
y = data["domain"]

vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

# Handling class imbalance using SMOTE
# Adjust k_neighbors to be smaller or use SMOTE with k_neighbors=1 to avoid errors with very small classes
smote = SMOTE(random_state=42, k_neighbors=1)
X_resampled, y_resampled = smote.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

best_rf_classifier = grid_search_rf.best_estimator_

gbm_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)

rf_predictions = best_rf_classifier.predict(X_test)
gbm_predictions = gbm_classifier.predict(X_test)
svm_predictions = svm_classifier.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_predictions)
gbm_accuracy = accuracy_score(y_test, gbm_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Gradient Boosting Machine Accuracy:", gbm_accuracy)
print("Support Vector Machine Accuracy:", svm_accuracy)

print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_predictions))

print("Confusion Matrix for Gradient Boosting Machine:")
print(confusion_matrix(y_test, gbm_predictions))

print("Confusion Matrix for Support Vector Machine:")
print(confusion_matrix(y_test, svm_predictions))

joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Load the models and vectorizer (for future use)
# best_rf_classifier = joblib.load('best_random_forest_classifier.joblib')
# gbm_classifier = joblib.load('gbm_classifier.joblib')
# svm_classifier = joblib.load('svm_classifier.joblib')
# vectorizer = joblib.load('tfidf_vectorizer.joblib')


# Model Details-

Data Preparation
Loading Data: The data is loaded from a CSV file containing job details.
Dropping Missing Values: Rows with missing values in the 'role_description' column are removed to ensure data quality.
NLTK Resources: NLTK resources like stopwords, punkt, and wordnet are downloaded for text processing tasks.
Lemmatizer Initialization: A WordNetLemmatizer is initialized for reducing words to their base form.

Domain Assignment
Job Domains Dictionary: A dictionary maps job domains to associated keywords.
Assign Domain Function: This function assigns a domain to each job description by checking if any of the keywords in the job domains dictionary appear in the text.

Text Cleaning
Clean Text Function: This function cleans the text by:
Converting it to lowercase.
Removing non-alphanumeric characters.
Removing stopwords.
Lemmatizing the remaining words.
Applying Text Cleaning: The cleaning function is applied to the 'role_description' column to create a new 'cleaned_text' column.

Feature Extraction and Class Balancing
Vectorization: The cleaned text data is vectorized using TF-IDF with bi-grams and a maximum of 1000 features.
Handling Class Imbalance: RandomOverSampler is used to oversample the minority classes to balance the dataset.

Train-Test Split
The resampled data is split into training and testing sets with an 80-20 ratio.

Model Training and Hyperparameter Tuning
Random Forest Classifier:
Hyperparameter Tuning: GridSearchCV is used to find the best hyperparameters for the RandomForestClassifier.
Parameters Tuned: Number of estimators, maximum depth, and minimum samples split.
Gradient Boosting Classifier: Trained with default parameters.
Support Vector Machine (SVM): Trained with default parameters.

Predictions and Evaluation
Making Predictions: Each classifier makes predictions on the test set.
Evaluating Performance:
Accuracy: The accuracy score is computed for each classifier.
Confusion Matrix: The confusion matrix is printed for each classifier to understand the distribution of true positives, true negatives, false positives, and false negatives.


Explanation of Models
Random Forest Classifier:
    Model: An ensemble learning method that builds multiple decision trees and merges them to get a more accurate and stable prediction.
    Hyperparameters Tuned:
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of the tree.
        min_samples_split: Minimum number of samples required to split an internal node.
Gradient Boosting Classifier:
    Model: An ensemble technique that builds models sequentially, each new model correcting errors made by the previous ones.
    Hyperparameters: Used default parameters.
Support Vector Machine (SVM):
    Model: A supervised learning model that finds the hyperplane that best separates the classes in the feature space.
    Hyperparameters: Used default parameters.

In [14]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer
import joblib

data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv") 

text_field = "role_description"  

# Drop rows with missing values in the 'role_description' column
data.dropna(subset=[text_field], inplace=True)

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# lemmatizer
lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

# assign job domains based on skills
def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

# domains based on job descriptions
data["domain"] = data[text_field].apply(assign_domain)

# text cleaning
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(str(x)))

# Drop rows with empty cleaned_text
data = data[data["cleaned_text"].str.strip() != ""]

# features and target
X = data["cleaned_text"]
y = data["domain"]

# Vectorize text data using bi-grams
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

# Handle class imbalance using RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# classifiers
rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

# Best Random Forest model
best_rf_classifier = grid_search_rf.best_estimator_

# Train other classifiers without hyperparameter tuning
gbm_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)

# predictions
rf_predictions = best_rf_classifier.predict(X_test)
gbm_predictions = gbm_classifier.predict(X_test)
svm_predictions = svm_classifier.predict(X_test)

# model performance
rf_accuracy = accuracy_score(y_test, rf_predictions)
gbm_accuracy = accuracy_score(y_test, gbm_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Gradient Boosting Machine Accuracy:", gbm_accuracy)
print("Support Vector Machine Accuracy:", svm_accuracy)

print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_predictions))

print("Confusion Matrix for Gradient Boosting Machine:")
print(confusion_matrix(y_test, gbm_predictions))

print("Confusion Matrix for Support Vector Machine:")
print(confusion_matrix(y_test, svm_predictions))

# Save the trained models and vectorizer
joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Load the models and vectorizer (for future use)
# best_rf_classifier = joblib.load('best_random_forest_classifier.joblib')
# gbm_classifier = joblib.load('gbm_classifier.joblib')
# svm_classifier = joblib.load('svm_classifier.joblib')
# vectorizer = joblib.load('tfidf_vectorizer.joblib')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Random Forest Accuracy: 0.9945054945054945
Gradient Boosting Machine Accuracy: 0.978021978021978
Support Vector Machine Accuracy: 0.9945054945054945
Confusion Matrix for Random Forest:
[[20  0  0  0  0  0  0  0  1]
 [ 0 19  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0]
 [ 0  0  0 16  0  0  0  0  0]
 [ 0  0  0  0 18  0  0  0  0]
 [ 0  0  0  0  0 19  0  0  0]
 [ 0  0  0  0  0  0 20  0  0]
 [ 0  0  0  0  0  0  0 17  0]
 [ 0  0  0  0  0  0  0  0 20]]
Confusion Matrix for Gradient Boosting Machine:
[[17  0  0  1  2  0  0  0  1]
 [ 0 19  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0]
 [ 0  0  0 16  0  0  0  0  0]
 [ 0  0  0  0 18  0  0  0  0]
 [ 0  0  0  0  0 19  0  0  0]
 [ 0  0  0  0  0  0 20  0  0]
 [ 0  0  0  0  0  0  0 17  0]
 [ 0  0  0  0  0  0  0  0 20]]
Confusion Matrix for Support Vector Machine:
[[20  0  0  0  0  0  0  0  1]
 [ 0 19  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0]
 [ 0  0  0 16  0  0  0  0  0

['tfidf_vectorizer.joblib']

# Model Performance -

Based on the accuracy scores and confusion matrices provided for the Random Forest, Gradient Boosting Machine (GBM), and Support Vector Machine (SVM) classifiers, we can make several observations about their performance.

Accuracy
    Random Forest Accuracy: 99.45%
    Gradient Boosting Machine Accuracy: 97.80%
    Support Vector Machine Accuracy: 99.45%
    
The Random Forest and SVM models have the same high accuracy, both outperforming the GBM model slightly.

In [1]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from nltk.stem import WordNetLemmatizer
import joblib

# Load dataset
data = pd.read_csv("C:/Users/hsahn/Downloads/job_details.csv")

text_field = "role_description"

# Drop rows with missing values in the 'role_description' column
data.dropna(subset=[text_field], inplace=True)

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# lemmatizer
lemmatizer = WordNetLemmatizer()

job_domains = {
    "Software Development": [
        "android", "backend", "full stack", "node.js", "python", "web developer",
        "elixir", "phoenix", "sde", "react native", "software developer", "java", 
        "kotlin", "jetpack compose", "sdk", "firebase"
    ],
    "Data Science": [
        "data analyst", "data scientist", "big data", "machine learning",
        "data analytics", "prompt engineer", "mlops", "data analysis",
        "ai", "artificial intelligence", "statistical modeling", "deep learning"
    ],
    "Marketing": [
        "marketing", "brand marketing", "digital marketing", "social media",
        "influencer", "content creation", "seo", "email marketing",
        "product marketing", "advertising", "market research"
    ],
    "Human Resources": [
        "human resource", "hr", "recruitment", "talent acquisition",
        "comp & benefits", "employee relations", "training", "development"
    ],
    "Sales": [
        "sales", "business development", "inside sales", "account manager",
        "lead generation", "sales executive", "territory manager"
    ],
    "Operations": [
        "operations", "business operations", "supply chain", "logistics",
        "inventory management", "procurement", "project management"
    ],
    "Research": [
        "research", "insights", "data analysis", "market research",
        "academic research", "clinical research", "r&d"
    ],
    "Product Management": [
        "product management", "product solution", "product architect",
        "product owner", "product strategy", "product development"
    ],
    "Engineering": [
        "robotics", "unity", "climate", "ai", "automotive",
        "mechanical", "steering", "suspension", "brakes",
        "civil engineering", "electrical engineering", "chemical engineering"
    ]
}

# assign job domains based on skills
def assign_domain(text):
    text = text.lower()
    for domain, skills in job_domains.items():
        if any(skill in text for skill in skills):
            return domain
    return "Other"

# domains based on job descriptions
data["domain"] = data[text_field].apply(assign_domain)

# text cleaning
def clean_text(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.lower()
    text = "".join([char for char in text if char.isalnum() or char in " "])
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords]
    return " ".join(words)

data["cleaned_text"] = data[text_field].apply(lambda x: clean_text(str(x)))

# Drop rows with empty cleaned_text
data = data[data["cleaned_text"].str.strip() != ""]

# features and target
X = data["cleaned_text"]
y = data["domain"]

# Vectorize text data using bi-grams
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_vec = vectorizer.fit_transform(X)

# Handle class imbalance using RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_vec, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# classifiers
rf_classifier = RandomForestClassifier(random_state=42)
gbm_classifier = GradientBoostingClassifier(random_state=42)
svm_classifier = SVC(random_state=42)

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(rf_classifier, param_grid_rf, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

# Best Random Forest model
best_rf_classifier = grid_search_rf.best_estimator_

# Train other classifiers without hyperparameter tuning
gbm_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)

# predictions
rf_predictions = best_rf_classifier.predict(X_test)
gbm_predictions = gbm_classifier.predict(X_test)
svm_predictions = svm_classifier.predict(X_test)

# model performance
rf_accuracy = accuracy_score(y_test, rf_predictions)
gbm_accuracy = accuracy_score(y_test, gbm_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Gradient Boosting Machine Accuracy:", gbm_accuracy)
print("Support Vector Machine Accuracy:", svm_accuracy)

print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rf_predictions))

print("Confusion Matrix for Gradient Boosting Machine:")
print(confusion_matrix(y_test, gbm_predictions))

print("Confusion Matrix for Support Vector Machine:")
print(confusion_matrix(y_test, svm_predictions))

# Save the trained models and vectorizer
joblib.dump(best_rf_classifier, 'best_random_forest_classifier.joblib')
joblib.dump(gbm_classifier, 'gbm_classifier.joblib')
joblib.dump(svm_classifier, 'svm_classifier.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Load the models and vectorizer (for future use)
# best_rf_classifier = joblib.load('best_random_forest_classifier.joblib')
# gbm_classifier = joblib.load('gbm_classifier.joblib')
# svm_classifier = joblib.load('svm_classifier.joblib')
# vectorizer = joblib.load('tfidf_vectorizer.joblib')

# Predict domains for the original dataset
data["predicted_domain"] = best_rf_classifier.predict(vectorizer.transform(data["cleaned_text"]))

# Save the dataset with predictions to a new CSV file
data.to_csv("C:/Users/hsahn/Downloads/job_details_with_predictions.csv", index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hsahn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Random Forest Accuracy: 0.9945054945054945
Gradient Boosting Machine Accuracy: 0.978021978021978
Support Vector Machine Accuracy: 0.9945054945054945
Confusion Matrix for Random Forest:
[[20  0  0  0  0  0  0  0  1]
 [ 0 19  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0]
 [ 0  0  0 16  0  0  0  0  0]
 [ 0  0  0  0 18  0  0  0  0]
 [ 0  0  0  0  0 19  0  0  0]
 [ 0  0  0  0  0  0 20  0  0]
 [ 0  0  0  0  0  0  0 17  0]
 [ 0  0  0  0  0  0  0  0 20]]
Confusion Matrix for Gradient Boosting Machine:
[[17  0  0  1  2  0  0  0  1]
 [ 0 19  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0]
 [ 0  0  0 16  0  0  0  0  0]
 [ 0  0  0  0 18  0  0  0  0]
 [ 0  0  0  0  0 19  0  0  0]
 [ 0  0  0  0  0  0 20  0  0]
 [ 0  0  0  0  0  0  0 17  0]
 [ 0  0  0  0  0  0  0  0 20]]
Confusion Matrix for Support Vector Machine:
[[20  0  0  0  0  0  0  0  1]
 [ 0 19  0  0  0  0  0  0  0]
 [ 0  0 32  0  0  0  0  0  0]
 [ 0  0  0 16  0  0  0  0  0