## Step 1: Preprocessing the Profiles Dataset

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import re
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('Demo Profiles.csv')

In [4]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(word) for word in tokens]
    return ' '.join(stemmed)

df['processed_position'] = df['position'].apply(preprocess_text)

tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['processed_position'])

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['industry'])

## Step 2: Split Data Into Training and Testing

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

## Step 3: Define Models

In [7]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier()
}

## Step 4: Define Hyperparameter Tuning

In [8]:
param_grid = {
    "Naive Bayes": {'alpha': [0.1, 1, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10]},
    "SVM": {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    "Random Forest": {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}
}

## Step 5: Train and Evaluate Models

In [9]:
best_models = {}
model_scores = {}

for name, model in models.items():
    grid = GridSearchCV(model, param_grid[name], cv=4)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    best_models[name] = best_model
    scores = cross_val_score(best_model, X_train, y_train, cv=4)
    avg_score = np.mean(scores)
    model_scores[name] = avg_score
    print(f"{name}: Best Params: {grid.best_params_}, Cross-Val Score: {avg_score}")

Naive Bayes: Best Params: {'alpha': 0.1}, Cross-Val Score: 0.7875
Logistic Regression: Best Params: {'C': 10}, Cross-Val Score: 0.85
SVM: Best Params: {'C': 10, 'kernel': 'linear'}, Cross-Val Score: 0.8750000000000001
Random Forest: Best Params: {'max_depth': None, 'n_estimators': 10}, Cross-Val Score: 0.85


## Step 6: Identify the Best Model

In [10]:
best_model_name = max(model_scores, key=model_scores.get)
print(f"The best model is: {best_model_name} with a Cross-Val Score of: {model_scores[best_model_name]}")

The best model is: SVM with a Cross-Val Score of: 0.8750000000000001


## Step 7: Evaluate the Best Model

In [11]:
y_pred = best_models[best_model_name].predict(X_test)
print(f"{best_model_name} Test Set Report:\n{classification_report(y_test, y_pred)}\n")

SVM Test Set Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00         1
           3       0.33      1.00      0.50         1
           4       1.00      1.00      1.00         3
           5       1.00      1.00      1.00         3
           6       1.00      0.33      0.50         3
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         4

    accuracy                           0.90        20
   macro avg       0.93      0.93      0.89        20
weighted avg       0.97      0.90      0.90        20




## Step 8: Predict the industry category

In [16]:
digital_example = "Digital Marketing Specialist"

processed_example = preprocess_text(digital_example)

In [17]:
vectorized_example = tfidf_vectorizer.transform([processed_example])

In [18]:
predicted_category_index = best_models["SVM"].predict(vectorized_example)
predicted_category = label_encoder.inverse_transform(predicted_category_index)

print(f"The predicted industry for {digital_example} is: {predicted_category[0]}")

The predicted industry for Digital Marketing Specialist is: E-commerce
