In [1]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import re

In [2]:
# Load the data
data = pd.read_csv('NLP_project_dataset_updated.csv')
data

Unnamed: 0,Skills,Category
0,"JavaScript, Angular, Vue, CSS",Frontend Engineer
1,"Bootstrap, React, HTML, CSS",Frontend Engineer
2,"CSS, Bootstrap, JavaScript, React",Frontend Engineer
3,"JavaScript, HTML, Vue, Tailwind CSS",Frontend Engineer
4,"React, HTML, Vue, Tailwind CSS",Frontend Engineer
...,...,...
995,"REST API, SQL, Java, Maven",Java Developer
996,"JUnit, Hibernate, Spring, Maven",Java Developer
997,"Microservices, Spring, REST API, JUnit",Java Developer
998,"Spring, Hibernate, Microservices, SQL",Java Developer


In [3]:
# Assuming 'df' is your DataFrame and 'column_name' is the column
categories = data['Category'].unique()
print(f"Categories ({len(categories)}): {categories}")

Categories (10): ['Frontend Engineer' 'Backend Engineer' 'Full Stack Engineer'
 'Data Scientist' 'Machine Learning Engineer' 'DevOps Engineer'
 'Mobile App Developer' 'Cloud Engineer' 'QA Engineer' 'Java Developer']


In [4]:
data = pd.read_csv('NLP_project_dataset_updated.csv')

# Predefined Skills
skill_keywords = [
    'Agile', 'Analytics', 'Angular', 'AWS', 'Azure', 'Big Data', 'CSS',
    'Data Analysis', 'Data Science', 'Django', 'Docker', 'ETL', 'Excel',
    'Flask', 'Git', 'HTML', 'Java', 'JavaScript', 'JIRA', 'Keras', 'Linux',
    'Machine Learning', 'MATLAB', 'MongoDB', 'Node.js', 'PostgreSQL',
    'Power BI', 'Python', 'R', 'React', 'SAS', 'Scrum', 'Spark', 'SQL',
    'Tableau', 'TensorFlow','html','Terraform', 'AWS', 'Bash', 'CI/CD',
    'Bug Tracking', 'JUnit', 'Selenium', 'Manual Testing'
]

In [83]:
data

Unnamed: 0,Skills,Category
0,"JavaScript, Angular, Vue, CSS",Frontend Engineer
1,"Bootstrap, React, HTML, CSS",Frontend Engineer
2,"CSS, Bootstrap, JavaScript, React",Frontend Engineer
3,"JavaScript, HTML, Vue, Tailwind CSS",Frontend Engineer
4,"React, HTML, Vue, Tailwind CSS",Frontend Engineer
...,...,...
995,"REST API, SQL, Java, Maven",Java Developer
996,"JUnit, Hibernate, Spring, Maven",Java Developer
997,"Microservices, Spring, REST API, JUnit",Java Developer
998,"Spring, Hibernate, Microservices, SQL",Java Developer


In [5]:

# Text preprocessing function
def preprocess_text(text):
    text = re.sub(r"[^a-zA-Z0-9\s,]", "", str(text))  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text


In [6]:

# Apply preprocessing
data["Cleaned_skills"] = data["Skills"].apply(preprocess_text)


In [7]:
# Skill extraction function
def extract_skills(resume_text):
    return [skill for skill in skill_keywords if skill.lower() in resume_text]


In [8]:

# Apply skill extraction
data["Extracted_Skills"] = data["Cleaned_skills"].apply(extract_skills)
data['Extracted_Skills_Str'] = data['Extracted_Skills'].apply(lambda x: ' '.join(x))


In [9]:

# Encode target labels (job categories)
le = LabelEncoder()
data["Category_Encoded"] = le.fit_transform(data["Category"])


In [10]:

# Feature extraction with TF-IDF
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(data['Extracted_Skills_Str'])
y = data["Category_Encoded"]


In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Hyper parameter tuning

Grid search

In [91]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score

# # Define the parameter grid for Random Forest
# param_grid = {
#     'n_estimators': [50, 100, 200],  # Number of trees
#     'max_depth': [None, 10, 20, 30],  # Maximum depth of trees
#     'min_samples_split': [2, 5, 10],  # Minimum samples to split a node
#     'min_samples_leaf': [1, 2, 4],  # Minimum samples at a leaf node
#     'bootstrap': [True, False],  # Whether to use bootstrap sampling
#     'max_features':['sqrt','log2']
# }

# # Initialize Random Forest Classifier
# rf = RandomForestClassifier()

# # Perform Grid Search
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
#                            cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# # Fit the model on training data
# grid_search.fit(X_train, y_train)

In [92]:
# print(grid_search.best_params_)
# print(grid_search.best_score_)


In [12]:

# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100,criterion='entropy',max_features='log2',max_depth=None,min_samples_leaf=1,min_samples_split=5,bootstrap=True)
model.fit(X_train, y_train)

In [13]:
# Evaluate model
y_pred = model.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred)}")

Model Accuracy: 0.87


# Predicting single output

In [14]:
# Function for predicting a single output from input skills
def predict_category_from_skills(input_skills):
    # Convert the input list of skills to a single string
    skills_str = ' '.join(skill.lower() for skill in input_skills)
    
    # Transform using TF-IDF
    input_tfidf = tfidf.transform([skills_str])
    
    # Predict category
    predicted_category = model.predict(input_tfidf)
    return le.inverse_transform(predicted_category)[0]


In [19]:
import joblib

joblib.dump(model, "random_forest_model.pkl")
print("Model saved as random_forest_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
joblib.dump(le, "label_encoder.pkl")

Model saved as random_forest_model.pkl


['label_encoder.pkl']

In [16]:

# Example input
input_data = {
    "skills": [
    "Java",
    "Javascript",
    "Html",
    "Web development",
    "Vs code",
    "Github",
    "Postman",
    "Reactjs",
    "Nodejs",
    "Mongodb",
    "Expressjs",
    "Nextjs",
    "German speaking",
    "Public speaking",
    "Time management",
    "Discipline"
    ]
}

# Predict the category
predicted_category = predict_category_from_skills(input_data["skills"])
print(f"Predicted Category: {predicted_category}")

Predicted Category: Full Stack Engineer
