In [None]:
# Rebuild CombinedText to include JobTitle, JobDescription, and KeyDuties (but not SearchKeywords)
df['CombinedText'] = (
    df['JobTitle'].fillna('') + ' ' +
    df['JobDescription'].fillna('') + ' ' +
    df['KeyDuties'].fillna('')
)

# Metadata features (DataBuyerScore is deliberately left out) and the target column.
META_COLUMNS = ['AgencySize', 'Industry', 'IsSeniorRole']
TARGET_COLUMN = 'IsLikelyDataBuyer'
HOLDOUT_FRACTION = 0.2  # share of each class held out of training for evaluation
SEED = 42


def _fit_feature_transformers(frame):
    """Fit the TF-IDF vectorizer and the metadata transformer on `frame` only.

    Returns a `(text_vectorizer, metadata_transformer)` tuple of fitted objects.
    """
    text_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), max_features=5000)
    text_vectorizer.fit(frame['CombinedText'])

    metadata_transformer = ColumnTransformer(transformers=[
        ('onehot_agency', OneHotEncoder(handle_unknown='ignore'), ['AgencySize']),
        ('onehot_industry', OneHotEncoder(handle_unknown='ignore'), ['Industry']),
        ('scale_senior', StandardScaler(), ['IsSeniorRole'])
    ])
    metadata_transformer.fit(frame[META_COLUMNS])
    return text_vectorizer, metadata_transformer


def _build_features(frame, text_vectorizer, metadata_transformer):
    """Return the text features and metadata features of `frame`, stacked column-wise."""
    return hstack([
        text_vectorizer.transform(frame['CombinedText']),
        metadata_transformer.transform(frame[META_COLUMNS]),
    ])


def _fit_classifier(features, labels):
    """Oversample with SMOTE, then fit a logistic regression on the balanced data.

    Returns `(classifier, features_resampled, labels_resampled)`.
    """
    features_resampled, labels_resampled = SMOTE(random_state=SEED).fit_resample(features, labels)
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(features_resampled, labels_resampled)
    return classifier, features_resampled, labels_resampled


# --- Evaluation on a stratified hold-out -------------------------------------
# Scoring the model on the rows it was trained on (and on rows SMOTE interpolated
# from) reports optimistic metrics. Hold out a fraction of every class, fit the
# transformers, SMOTE and the classifier on the remaining rows only, and score on
# the untouched hold-out rows.
# Requires enough rows per class for SMOTE to run on the training split.
holdout_index = (
    df.groupby(TARGET_COLUMN)
      .sample(frac=HOLDOUT_FRACTION, random_state=SEED)
      .index
)
is_holdout = df.index.isin(holdout_index)
train_df, holdout_df = df[~is_holdout], df[is_holdout]

eval_vectorizer, eval_meta_transformer = _fit_feature_transformers(train_df)
eval_model = _fit_classifier(
    _build_features(train_df, eval_vectorizer, eval_meta_transformer),
    train_df[TARGET_COLUMN],
)[0]
y_holdout_pred = eval_model.predict(
    _build_features(holdout_df, eval_vectorizer, eval_meta_transformer)
)
# Same dict layout as before, but now computed on rows the model never saw.
classification_report_result = classification_report(
    holdout_df[TARGET_COLUMN], y_holdout_pred, output_dict=True
)

# --- Final model fitted on every row -----------------------------------------
# All names the rest of the notebook may rely on are kept.
meta_with_industry = df[META_COLUMNS]
vectorizer, meta_transformer_with_industry = _fit_feature_transformers(df)

X_text = vectorizer.transform(df['CombinedText'])
X_meta_with_industry = meta_transformer_with_industry.transform(meta_with_industry)

# Combine text and metadata
X_combined_with_industry = hstack([X_text, X_meta_with_industry])
y = df[TARGET_COLUMN]

# Apply SMOTE and train the final model
model, X_resampled, y_resampled = _fit_classifier(X_combined_with_industry, y)

# Row-aligned predictions for every row of df. These are IN-SAMPLE (the final model
# was trained on these rows); use classification_report_result for quality metrics.
y_pred = model.predict(X_combined_with_industry)
y_pred_proba = model.predict_proba(X_combined_with_industry)[:, 1]


In [None]:
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from scipy.sparse import hstack

# Build the preprocessing + classifier pipeline. SMOTE is intentionally not a
# pipeline step; it is applied by hand between the two stages further down.
feature_transformers = [
    ('onehot_agency', OneHotEncoder(handle_unknown='ignore'), ['AgencySize']),
    ('onehot_industry', OneHotEncoder(handle_unknown='ignore'), ['Industry']),
    ('scale_senior', StandardScaler(), ['IsSeniorRole']),
    (
        'text_vectorizer',
        TfidfVectorizer(stop_words='english', ngram_range=(1, 3), max_features=5000),
        'CombinedText',
    ),
]
pipeline = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=feature_transformers)),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Handles to the two stages; these are the very objects stored inside `pipeline`,
# so fitting them fits the pipeline that gets saved.
pipeline_preprocessor = pipeline.named_steps['preprocessor']
pipeline_classifier = pipeline.named_steps['classifier']

# df is expected to hold the feature columns below plus the IsLikelyDataBuyer target.
feature_columns = ['CombinedText', 'AgencySize', 'Industry', 'IsSeniorRole']
X = df[feature_columns]
y = df['IsLikelyDataBuyer']

# Stage 1: fit the preprocessor and turn the raw columns into a feature matrix.
X_transformed = pipeline_preprocessor.fit_transform(X)

# Stage 2: oversample the transformed features with SMOTE.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_transformed, y)

# Stage 3: fit only the classifier step on the resampled data.
pipeline_classifier.fit(X_resampled, y_resampled)

# Persist the pipeline (fitted preprocessor + fitted classifier).
joblib.dump(pipeline, 'nlp_pipeline_with_smote.joblib')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

# Create a combined text field
df['text'] = df['JobTitle'].fillna('') + ' ' + df['JobDescription'].fillna('') + ' ' + df['SearchKeywords'].fillna('')

# Split into two groups
buyers = df.loc[df['IsLikelyDataBuyer'] == 1, 'text']
non_buyers = df.loc[df['IsLikelyDataBuyer'] == 0, 'text']

# Vectorize using TF-IDF.
# The vocabulary and IDF weights are learned from BOTH groups. Fitting on buyers
# alone would give the lowest IDF to terms that are common among buyers, which
# suppresses exactly the terms this analysis is trying to surface.
# A dedicated name is used so this analysis vectorizer does not overwrite the
# `vectorizer` variable that belongs to the trained classifier.
keyword_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
keyword_vectorizer.fit(pd.concat([buyers, non_buyers]))
X_buyers = keyword_vectorizer.transform(buyers)
X_non_buyers = keyword_vectorizer.transform(non_buyers)

# Compute average TF-IDF per term within each group
buyer_mean = np.asarray(X_buyers.mean(axis=0)).ravel()
non_buyer_mean = np.asarray(X_non_buyers.mean(axis=0)).ravel()
terms = keyword_vectorizer.get_feature_names_out()

# Positive values mean the term carries more average weight in buyer postings
diff = buyer_mean - non_buyer_mean

# Get top positive (data buyer-associated) terms, largest difference first
top_indices = diff.argsort()[::-1][:30]
top_keywords = pd.DataFrame({
    'Keyword': terms[top_indices],
    'ScoreDifference': diff[top_indices]
})

In [None]:
import joblib

# Save the model together with BOTH fitted feature transformers.
# `model` was trained on TF-IDF text features stacked with the encoded metadata
# features, so the text vectorizer alone is not enough to score new rows.

# Cheap consistency check before writing anything: the two transformers must
# produce exactly as many columns as the model was trained on. This catches
# `vectorizer` having been re-bound to a different vectorizer, unless both happen
# to produce the same number of features.
n_text_features = len(vectorizer.get_feature_names_out())
n_meta_features = len(meta_transformer_with_industry.get_feature_names_out())
n_model_features = model.coef_.shape[1]
if n_text_features + n_meta_features != n_model_features:
    raise ValueError(
        f"Feature width mismatch: vectorizer ({n_text_features}) + metadata transformer "
        f"({n_meta_features}) != model ({n_model_features}). "
        "`vectorizer` is not the one `model` was trained with."
    )

joblib.dump(model, 'nlp_model.joblib')
joblib.dump(vectorizer, 'vectorizer.joblib')
joblib.dump(meta_transformer_with_industry, 'meta_transformer.joblib')