In [7]:
import pandas as pd
import spacy

# Preprocesamiento

In [8]:
# Cargar el dataset
df = pd.read_csv('fake_job_postings.csv')
df = df.drop(columns=['salary_range', 'department','benefits', 'telecommuting', 'has_company_logo', 'has_questions','industry'])
df.fillna('', inplace=True)

In [9]:
#Normalizar la educacion requerida
df['required_education'] = df['required_education'].fillna('Unspecified')
df.loc[df['required_education'] == 'High School or equivalent', 'required_education'] = 'High School or less'
df.loc[df['required_education'] == 'Some High School Coursework', 'required_education'] = 'High School or less'
df.loc[df['required_education'] == 'Vocational - HS Diploma', 'required_education'] = 'High School or less'
df.loc[df['required_education'] == 'Vocational - Degree', 'required_education'] = 'Bachelor\'s Degree'

df.head()

Unnamed: 0,job_id,title,location,company_profile,description,requirements,employment_type,required_experience,required_education,function,fraudulent
0,1,Marketing Intern,"US, NY, New York","We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Other,Internship,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland","90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,Full-time,Not Applicable,,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Full-time,Mid-Senior level,Bachelor's Degree,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full-time,Mid-Senior level,Bachelor's Degree,Health Care Provider,0


In [10]:
df['description'] = df['description'] + ' ' + df['requirements'] + ' ' + df['required_education']
df = df.drop(columns=['requirements', 'required_education'])

In [11]:
# Selección de variables categóricas y numéricas
categorical_features = ['title', 'location', 'company_profile', 'description', 'employment_type', 'required_experience', 'function']


In [12]:
# Inicializar SpaCy y descargar el modelo de lenguaje
nlp = spacy.load('en_core_web_sm')

In [13]:
def preprocess_text(text):
    doc = nlp(text.lower())  # Convertir a minúsculas y procesar con SpaCy
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]  # Lematización, eliminación de stopwords y caracteres no alfabéticos
    return ' '.join(tokens)

# Aplicar preprocesamiento a las variables categóricas
for feature in categorical_features:
    df[feature] = df[feature].apply(preprocess_text)

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# One-Hot Encoding de variables categóricas
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_features = encoder.fit_transform(df[categorical_features])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))

# Combinar las características codificadas con las numéricas
X = encoded_df
y = df['fraudulent']

# Dividir el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree

In [15]:
from sklearn.tree import DecisionTreeClassifier

# Entrenar el modelo de árbol de decisión
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluar el modelo
y_pred = clf.predict(X_test)
print("DecisionTreeClassifier Report:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


DecisionTreeClassifier Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3395
           1       0.96      0.69      0.80       181

    accuracy                           0.98      3576
   macro avg       0.97      0.84      0.90      3576
weighted avg       0.98      0.98      0.98      3576

[[3390    5]
 [  57  124]]


In [16]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from joblib import dump, load

# One-Hot Encoding de variables categóricas
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_features = encoder.fit_transform(df[categorical_features])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))

# Combinar las características codificadas con las numéricas
X = encoded_df
y = df['fraudulent']

# Dividir el dataset en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el modelo de árbol de decisión
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluar el modelo
y_pred = clf.predict(X_test)

# Guardar el modelo y el codificador
dump(clf, 'clf.joblib')
dump(encoder, 'encoder.joblib')

print("DecisionTreeClassifier Report:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

DecisionTreeClassifier Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3395
           1       0.96      0.69      0.80       181

    accuracy                           0.98      3576
   macro avg       0.97      0.84      0.90      3576
weighted avg       0.98      0.98      0.98      3576

[[3390    5]
 [  57  124]]


In [None]:
# Función para predecir nuevas entradas
def predict_new_entry(title, location, company_profile, description, employment_type, required_experience, function):
    new_data = {
        'title': [title],
        'location': [location],
        'company_profile': [company_profile],
        'description': [description],
        'employment_type': [employment_type],
        'required_experience': [required_experience],
        'function': [function]
    }
    new_df = pd.DataFrame(new_data)

    # Aplicar preprocesamiento
    for feature in categorical_features:
        new_df[feature] = new_df[feature].apply(preprocess_text)

    # One-Hot Encoding
    encoded_new_features = encoder.transform(new_df[categorical_features])
    encoded_new_df = pd.DataFrame(encoded_new_features, columns=encoder.get_feature_names_out(categorical_features))

    # Predecir con el modelo entrenado
    prediction = clf.predict(encoded_new_df)
    return prediction[0]

# Solicitar entrada del usuario
title = input("Ingrese el título: ")
location = input("Ingrese la ubicación: ")
company_profile = input("Ingrese el perfil de la compañía: ")
description = input("Ingrese la descripción: ")
employment_type = input("Ingrese el tipo de empleo: ")
required_experience = input("Ingrese la experiencia requerida: ")
function = input("Ingrese la función: ")

# Realizar predicción
result = predict_new_entry(title, location, company_profile, description, employment_type, required_experience, function)
print("Predicción (0: No Fraudulento, 1: Fraudulento):", result)

In [None]:
from joblib import load

# Cargar el modelo preentrenado
clf = load('./clf.joblib')

# Función para predecir nuevas entradas
def predict_new_entry(title, location, company_profile, description, employment_type, required_experience, function):
    new_data = {
        'title': [title],
        'location': [location],
        'company_profile': [company_profile],
        'description': [description],
        'employment_type': [employment_type],
        'required_experience': [required_experience],
        'function': [function]
    }
    new_df = pd.DataFrame(new_data)

    # Aplicar preprocesamiento
    for feature in categorical_features:
        new_df[feature] = new_df[feature].apply(preprocess_text)

    # One-Hot Encoding
    encoded_new_features = encoder.transform(new_df[categorical_features])
    encoded_new_df = pd.DataFrame(encoded_new_features, columns=encoder.get_feature_names_out(categorical_features))

    # Predecir con el modelo entrenado
    prediction = clf.predict(encoded_new_df)
    return prediction[0]

# Solicitar entrada del usuario
title = input("Ingrese el título: ")
location = input("Ingrese la ubicación: ")
company_profile = input("Ingrese el perfil de la compañía: ")
description = input("Ingrese la descripción: ")
employment_type = input("Ingrese el tipo de empleo: ")
required_experience = input("Ingrese la experiencia requerida: ")
function = input("Ingrese la función: ")

# Realizar predicción
result = predict_new_entry(title, location, company_profile, description, employment_type, required_experience, function)
print("Predicción (0: No Fraudulento, 1: Fraudulento):", result)