In [47]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import textstat
from scipy.stats import randint as sp_randint
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.base import TransformerMixin


In [5]:
df = pd.read_csv('Train_rev1.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244768 entries, 0 to 244767
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Id                  244768 non-null  int64 
 1   Title               244767 non-null  object
 2   FullDescription     244768 non-null  object
 3   LocationRaw         244768 non-null  object
 4   LocationNormalized  244768 non-null  object
 5   ContractType        65442 non-null   object
 6   ContractTime        180863 non-null  object
 7   Company             212338 non-null  object
 8   Category            244768 non-null  object
 9   SalaryRaw           244768 non-null  object
 10  SalaryNormalized    244768 non-null  int64 
 11  SourceName          244767 non-null  object
dtypes: int64(2), object(10)
memory usage: 22.4+ MB


In [6]:
df_sample = df.sample(2500, random_state=42)

In [8]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=5000)

salary_threshold = np.percentile(df_sample['SalaryNormalized'], 75)

df_sample['SalaryLabel'] = (df_sample['SalaryNormalized'] >= salary_threshold).astype(int)

In [11]:
X = df_sample['FullDescription']
y = df_sample['SalaryLabel']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### basemodel

In [13]:
model = make_pipeline(TfidfVectorizer(stop_words=stopwords.words('english'), max_features=5000), MultinomialNB())

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', conf_matrix)

Accuracy: 0.758
Confusion Matrix:
 [[365   6]
 [115  14]]


# model Refine

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

### apply random search

In [35]:
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('clf', MultinomialNB()),
])

# Define the parameter space for Random Search
parameters = {
    'tfidf__max_df': uniform(0.5, 0.5),  # Uniform distribution between 0.5 and 1.0
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],  # Test unigrams, bigrams, or both
    'tfidf__min_df': [1, 2, 3, 4, 5],
    'clf__alpha': uniform(0.001, 0.1),  # Uniform distribution between 0.001 and 0.101
}

# Set up Random Search
random_search = RandomizedSearchCV(pipeline, parameters, n_iter=50, cv=5, random_state=42, n_jobs=-1)

# Fit Random Search to the data
random_search.fit(X_train, y_train)

In [36]:
best_model = random_search.best_estimator_
y_pred_rs = best_model.predict(X_test)

In [37]:
print(random_search.best_estimator_)
print('----------------------------------------------------------------------------------------------------------')
print(random_search.best_params_)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.8005575058716043, min_df=3,
                                 ngram_range=(1, 2),
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('clf', MultinomialNB(alpha=0.08761761457749352))])
----------------------------------------------------------------------------------------------------------
{'clf__alpha': 0.08761761457749352, 'tfidf__max_df': 0

In [38]:
rs_accuracy = accuracy_score(y_test, y_pred)
print(f'New Accuracy: {rs_accuracy}')

rs_conf_matrix = confusion_matrix(y_test, y_pred)
print('New Confusion Matrix:\n', rs_conf_matrix)

New Accuracy: 0.77
New Confusion Matrix:
 [[322  49]
 [ 66  63]]


### apply ensamble with random forest

In [30]:
pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Parameter space to explore
param_dist = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__min_df': [1, 2, 3],
    'clf__n_estimators': sp_randint(100, 500),
    'clf__max_depth': [None] + list(sp_randint(3, 10).rvs(3)),
    'clf__min_samples_split': sp_randint(2, 11),
    'clf__min_samples_leaf': sp_randint(1, 11),
    'clf__bootstrap': [True, False]
}

n_iter_search = 20

random_search = RandomizedSearchCV(pipeline_rf, param_distributions=param_dist, 
                                   n_iter=n_iter_search, cv=5, random_state=42)

random_search.fit(X_train, y_train)

In [31]:
# Predict with the best found parameters
y_pred_rf_tuned = random_search.predict(X_test)

# Calculate accuracy
accuracy_rf_tuned = accuracy_score(y_test, y_pred_rf_tuned)
print(f'RandomForest Tuned Accuracy: {accuracy_rf_tuned}')

conf_matrix_rf_tuned = confusion_matrix(y_test, y_pred_rf_tuned)
print("Confusion Matrix for the Tuned RandomForest Model:")
print(conf_matrix_rf_tuned)

RandomForest Tuned Accuracy: 0.764
Confusion Matrix for the Tuned RandomForest Model:
[[367   4]
 [114  15]]


In [39]:
# Assuming y_pred_nb is available
y_pred_ensemble = (y_pred_rf_tuned + y_pred_rs) / 2
y_pred_ensemble = np.round(y_pred_ensemble).astype(int)

# Calculate ensemble accuracy
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print(f'Ensemble Accuracy: {accuracy_ensemble}')

Ensemble Accuracy: 0.764


### Apply Feature Engineering and test the best model we get above

In [45]:
def combine_text_columns(data_frame, to_combine=['FullDescription', 'Title', 'Company']):
    return data_frame[to_combine].apply(lambda x: ' '.join(x.dropna()), axis=1)

text_pipeline_nb = Pipeline([
    ('text_preprocessor', FunctionTransformer(lambda x: combine_text_columns(x, ['FullDescription', 'Title', 'Company']), validate=False)), 
    ('tfidf', TfidfVectorizer(stop_words=stopwords.words('english'), 
                              max_df=0.8005575058716043, 
                              min_df=3, 
                              ngram_range=(1, 2))),
    ('classifier', MultinomialNB(alpha=0.08761761457749352))
])

In [46]:
X_text_train = X_train[['FullDescription', 'Title', 'Company']]  
X_text_test = X_test[['FullDescription', 'Title', 'Company']]

# Train the model on text data
text_pipeline_nb.fit(X_text_train, y_train)

# Evaluate the model
y_pred_text_nb = text_pipeline_nb.predict(X_text_test)
accuracy_text_nb = accuracy_score(y_test, y_pred_text_nb)
conf_matrix_text_nb = confusion_matrix(y_test, y_pred_text_nb)

print(f'Text-based NB Model Accuracy: {accuracy_text_nb}')
print('Text-based NB Model Confusion Matrix:\n', conf_matrix_text_nb)

Text-based NB Model Accuracy: 0.782
Text-based NB Model Confusion Matrix:
 [[325  46]
 [ 63  66]]


In [55]:
model_params = text_pipeline_nb.get_params()

for param_name, param_value in model_params.items():
    print(f"{param_name}: {param_value}")

memory: None
steps: [('text_preprocessor', FunctionTransformer(func=<function <lambda> at 0x00000296A085B2E0>)), ('tfidf', TfidfVectorizer(max_df=0.8005575058716043, min_df=3, ngram_range=(1, 2),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])), ('classifier', MultinomialNB(alpha=0.08761761457749352))]
verbose: False
text_preprocessor: FunctionTransformer(func=<function <lambda> at 0x00000296A085B2E0>)
tfidf: TfidfVectorizer(max_df=0.8005575058716043, min_df=3, ngram_range=(1, 2),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", 

In [53]:
tfidf_vect = text_pipeline_nb.named_steps['tfidf']

clf = text_pipeline_nb.named_steps['classifier']


In [54]:
feature_names = tfidf_vect.get_feature_names_out()

log_prob = clf.feature_log_prob_

top_n = 10

# Class 0 (Low Salary), Class 1 (High Salary)
for i, class_label in enumerate(['Low Salary', 'High Salary']):
    top_features_indices = log_prob[i].argsort()[-top_n:][::-1]
    top_features = [feature_names[j] for j in top_features_indices]
    
    print(f"Top 10 indicative words for {class_label}:")
    for feature in top_features:
        print(feature)
    print("\n")


Top 10 indicative words for Low Salary:
sales
experience
manager
work
role
team
business
skills
working
care


Top 10 indicative words for High Salary:
business
experience
project
manager
development
management
team
senior
role
client


