In [1]:
pip install pycaret



In [2]:
from pycaret.classification import *
import pandas as pd

In [3]:
# Load the job-postings table. The path ends in .xlsx but is parsed with
# read_csv; the recorded output shows the parse succeeding, so the content is
# presumably CSV despite the extension — TODO confirm.
df = pd.read_csv('/content/NqndMEyZakuimmFI.xlsx')
# Only the last expression of a cell is rendered, so the next two calls compute
# their results without displaying them; df.columns is what gets shown.
df.head()
df.describe()
df.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [4]:
# Load the embedding table from CSV and preview its first rows.
# NOTE(review): presumably one embedding row per posting, in the same row
# order as df — confirm against the code that produced the file.
embeddings_df = pd.read_csv('/content/embeddings.csv')
embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.104674,-0.009502,0.033692,-0.061758,-0.0259,0.032283,-0.052686,0.012121,-0.031696,-0.078312,...,0.018982,0.058401,-0.018068,-0.071319,-0.04337,0.044209,-0.047659,-0.046638,0.030361,-0.002944
1,-0.096968,-0.061156,0.03274,-0.014845,-0.136236,0.039445,0.031455,0.033295,-0.030588,-0.07752,...,-0.065498,0.018353,0.114014,0.008116,0.040426,-0.008272,-0.123749,-0.042179,0.01309,-0.00374
2,-0.072214,-0.047441,-0.068538,0.059089,0.088164,-0.048801,0.022871,0.077242,-0.046167,0.037751,...,0.069768,0.048513,0.055176,-0.024959,-0.008842,0.151043,0.001052,-0.118857,0.059079,-0.004182
3,-0.068159,0.050683,0.02476,-0.030802,-0.054053,0.003545,-0.026478,0.024037,-0.013642,-0.05377,...,-0.021017,0.045145,-0.024822,-0.05691,0.165516,0.012954,-0.077619,-0.03213,-0.008404,0.069118
4,-0.017301,-0.011286,0.014259,0.024312,0.015674,-0.062632,0.031688,0.055835,-0.067045,-0.034193,...,0.028637,-0.052495,0.030603,0.050783,0.008792,0.05794,0.043163,-0.052096,0.046847,-0.004541


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


# Categorical columns of the postings table that are replaced by numeric encodings.
categorical_features = [
    'telecommuting', 'has_company_logo', 'has_questions',
    'employment_type', 'required_experience', 'required_education',
    'industry', 'function', 'department', 'location'
]

# One-hot encode the three flag columns (one output column per observed value).
onehot_features = ['telecommuting', 'has_company_logo', 'has_questions']
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = onehot_encoder.fit_transform(df[onehot_features])
# Reuse df's index: the axis=1 concat below aligns on index labels, so a frame
# carrying a fresh 0..n-1 index would be NaN-padded whenever df is indexed
# differently.
onehot_df = pd.DataFrame(
    onehot_encoded,
    columns=onehot_encoder.get_feature_names_out(onehot_features),
    index=df.index,
)

# Integer-code every remaining categorical column. The list is derived from
# categorical_features so the two lists cannot drift apart.
label_features = [name for name in categorical_features if name not in onehot_features]
# Values are cast to str first, so missing values become their own category.
# NOTE(review): each LabelEncoder is created inside the lambda and discarded, so
# the fitted mapping cannot be re-applied to new data from these objects.
label_df = df[label_features].apply(lambda col: LabelEncoder().fit_transform(col.astype(str)))

# Combine all encoded features column-wise.
encoded_df = pd.concat([onehot_df, label_df], axis=1)

# Persist the encoded table in three formats (CSV, pickle, raw NumPy array).
encoded_df.to_csv("encoded_features_latest.csv", index=False)
encoded_df.to_pickle("encoded_features_latest.pkl")
np.save("encoded_features_latest.npy", encoded_df.values)

print("Categorical features encoded and saved successfully.")


Categorical features encoded and saved successfully.


In [6]:
# Read the CSV written by the encoding cell back in and preview it.
# NOTE(review): the file was written to a relative path but is read from
# /content/, so this assumes the working directory is /content — confirm.
# The frame is bound to `encodings_df`, a different name from the in-memory `encoded_df`.
encodings_df = pd.read_csv('/content/encoded_features_latest.csv')
encodings_df.head()

Unnamed: 0,telecommuting_0,telecommuting_1,has_company_logo_0,has_company_logo_1,has_questions_0,has_questions_1,employment_type,required_experience,required_education,industry,function,department,location
0,1.0,0.0,0.0,1.0,1.0,0.0,1,2,9,38,7,1149,2633
1,1.0,0.0,0.0,1.0,1.0,0.0,1,2,4,118,7,1149,2519
2,1.0,0.0,0.0,1.0,0.0,1.0,1,5,13,57,37,785,933
3,1.0,0.0,0.0,1.0,1.0,0.0,1,7,13,10,31,1149,2083
4,1.0,0.0,0.0,1.0,1.0,0.0,1,5,1,60,22,653,1294


In [7]:
# Remove the job_id column from df in place (raises KeyError if the cell is re-run).
df.drop(columns=['job_id'], inplace=True)


In [8]:
# List the column labels currently in df.
df.columns

Index(['title', 'location', 'department', 'salary_range', 'company_profile',
       'description', 'requirements', 'benefits', 'telecommuting',
       'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [9]:
# Every entry of label_features is also in categorical_features, so that single
# list already names each encoded column; concatenating both only repeated labels.
df = df.drop(columns=categorical_features)

In [10]:
# List the column labels left in df after the categorical columns were dropped.
df.columns

Index(['title', 'salary_range', 'company_profile', 'description',
       'requirements', 'benefits', 'fraudulent'],
      dtype='object')

In [11]:
# Text columns to remove from the tabular frame.
columns_to_drop = ['title', 'company_profile', 'description', 'requirements', 'benefits']

# errors='ignore' skips any listed column that is not present instead of raising.
df = df.drop(columns=columns_to_drop, errors='ignore')


In [12]:
# List the column labels left in df after the text columns were dropped.
df.columns

Index(['salary_range', 'fraudulent'], dtype='object')

In [13]:
# 1 where salary_range is missing, 0 otherwise.
df['salary_range_missing'] = df['salary_range'].isna().astype(int)


In [14]:
# Confirm the salary_range_missing indicator column was added.
df.columns

Index(['salary_range', 'fraudulent', 'salary_range_missing'], dtype='object')

In [15]:
# Drop the raw salary_range column; only its missing-value indicator is kept.
df = df.drop(columns=['salary_range'])

In [16]:
# List the column labels left in df after salary_range was dropped.
df.columns

Index(['fraudulent', 'salary_range_missing'], dtype='object')

In [17]:
# pd.concat(axis=1) aligns rows on index labels and pads any mismatch with NaN,
# so verify that the three feature tables share one index before joining them.
feature_frames = [df, embeddings_df, encoded_df]
for frame in feature_frames[1:]:
    if not frame.index.equals(df.index):
        raise ValueError(
            f"Row index mismatch: df has {len(df)} rows, "
            f"but a feature table has {len(frame)} rows or a different index"
        )

# Target + salary flag, embedding columns, and encoded categoricals, side by side.
combined_df = pd.concat(feature_frames, axis=1)

In [18]:
# List the column labels of the combined feature table.
combined_df.columns

Index(['fraudulent', 'salary_range_missing', '0', '1', '2', '3', '4', '5', '6',
       '7',
       ...
       'has_company_logo_1', 'has_questions_0', 'has_questions_1',
       'employment_type', 'required_experience', 'required_education',
       'industry', 'function', 'department', 'location'],
      dtype='object', length=399)

In [19]:
# Re-check df's own columns; the concat call returned a new frame and left df as it was.
df.columns

Index(['fraudulent', 'salary_range_missing'], dtype='object')

In [20]:
# List the column labels of the encoded categorical table.
encoded_df.columns

Index(['telecommuting_0', 'telecommuting_1', 'has_company_logo_0',
       'has_company_logo_1', 'has_questions_0', 'has_questions_1',
       'employment_type', 'required_experience', 'required_education',
       'industry', 'function', 'department', 'location'],
      dtype='object')

In [None]:
# Initialise the PyCaret classification experiment on the combined feature table.
# session_id fixes the experiment's random seed so that re-running the notebook
# reproduces the same results.
# NOTE(review): polynomial_features=True is applied to roughly 400 input columns;
# the expanded feature count grows quadratically — confirm time/memory budget.
clf = setup(
    data=combined_df,
    target='fraudulent',
    normalize=True,
    remove_multicollinearity=True,
    polynomial_features=True,
    fix_imbalance=True,
    session_id=42,
)


In [None]:


# Step 2: Compare models using F1 score
# (Step 1 is the setup() call in the previous cell.)
best_model = compare_models(sort='F1')

# Step 3: Tune the best model for better performance
tuned_model = tune_model(best_model, optimize='F1')

# Step 4: Optionally blend and stack models
# Both ensembles are built from best_model and its tuned variant.
# NOTE(review): `blended` and `stacked` are not referenced by any later cell
# visible here; only tuned_model is evaluated and saved.
blended = blend_models(estimator_list=[best_model, tuned_model], optimize='F1')
stacked = stack_models(estimator_list=[best_model, tuned_model], meta_model=best_model)

# Step 5: Evaluate the tuned model
evaluate_model(tuned_model)



In [None]:

# Step 6: Persist the tuned model under a fixed base name, then report completion.
model_name = 'final_fraud_detection_model'
save_model(tuned_model, model_name)

print("✅ Model training and evaluation complete. Final model saved as 'final_fraud_detection_model.pkl'.")

In [None]:

from pycaret.classification import load_model, predict_model
import pandas as pd

# Load the saved model
model = load_model('final_fraud_detection_model')

# Load your test data
# NOTE(review): the training frame (combined_df) holds embedding and encoded
# columns, while this file is read as-is; confirm the test file carries the
# columns the saved model expects before calling predict_model.
test_df = pd.read_csv('/content//0tkf3jUGLYjCEJGz.csv')


In [None]:

# Score the test frame with the loaded model.
predictions = predict_model(model, data=test_df)

# View predictions
print(predictions.head())


In [None]:

# Write the scored test frame to CSV without the row index.
predictions.to_csv('test_predictions.csv', index=False)


In [None]:

from pycaret.classification import interpret_model
# Produce PyCaret's interpretation plot for the loaded model.
# NOTE(review): `model` is the object returned by load_model, not the in-session
# tuned_model — confirm interpret_model accepts it.
interpret_model(model)


In [None]:
from pycaret.classification import load_model, plot_model

# Reload the persisted model from disk.
model = load_model('final_fraud_detection_model')

# Render, in order: confusion matrix, precision-recall curve, feature importance.
for plot_kind in ('confusion_matrix', 'pr', 'feature'):
    plot_model(model, plot=plot_kind)


In [None]:

# Print the parameter dictionary of the loaded model object.
print(model.get_params())


In [None]:
pip install sweetviz ydata-profiling autoviz


In [None]:

import sweetviz as sv
import pandas as pd

# Load your dataset
# NOTE(review): this is a relative .csv path, whereas the first load cell reads
# '/content/NqndMEyZakuimmFI.xlsx' — confirm both refer to the same data.
# This rebinds `df`, replacing the frame prepared by the earlier cells.
df = pd.read_csv("NqndMEyZakuimmFI.csv")

# Generate and save the report
report = sv.analyze(df)
report.show_html("sweetviz_report.html")


In [None]:

from ydata_profiling import ProfileReport
import pandas as pd

# Re-read the dataset and write a ydata-profiling HTML report for it.
df = pd.read_csv("NqndMEyZakuimmFI.csv")
profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)
profile.to_file("profiling_report.html")


In [None]:

from autoviz.AutoViz_Class import AutoViz_Class
import pandas as pd

# Run AutoViz directly on the CSV file.
AV = AutoViz_Class()
# `df` is rebound to whatever AutoViz returns, replacing the previous frame.
df = AV.AutoViz("NqndMEyZakuimmFI.csv")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re

# Load the dataset
df = pd.read_csv("NqndMEyZakuimmFI.csv")

# Set your target column
target_column = 'fraudulent'

# Strip '$' from every object-dtype column.
# NOTE(review): presumably because matplotlib treats '$' as a math-text
# delimiter in labels — confirm. astype(str) also turns missing values into
# ordinary strings, so they are plotted as a category of their own.
dollar_sign = re.compile(r'[$]')
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str).apply(lambda x: dollar_sign.sub('', x))

# Create a directory to save plots
os.makedirs("visualizations", exist_ok=True)

# One figure per non-target column: count plot for object columns, box plot otherwise.
for column in df.columns:
    if column == target_column:
        continue
    fig = plt.figure(figsize=(10, 6))
    try:
        if df[column].dtype == 'object':
            sns.countplot(data=df, x=column, hue=target_column)
        else:
            sns.boxplot(data=df, x=target_column, y=column)
        plt.title(f'{column} vs {target_column}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f'visualizations/{column}_vs_{target_column}.png')
    except Exception as e:
        # Best-effort: report the failing column and move on to the next one.
        print(f"Skipping {column} due to error: {e}")
    finally:
        # Close the figure on failure too, so skipped columns do not leave
        # open figures accumulating in memory.
        plt.close(fig)


In [None]:
import pandas as pd

# Reload the postings table (same path and read_csv call as the first load cell).
df = pd.read_csv('/content/NqndMEyZakuimmFI.xlsx')

In [None]:
# List the column labels of the reloaded table.
df.columns

In [None]:
# Summary statistics for the reloaded table.
df.describe()

In [3]:
import pandas as pd

# Read the embeddings CSV.
file_path = '/content/embeddings.csv'
df = pd.read_csv(file_path)

# Size of each of the first two parts; the third part takes the remainder.
total_rows = len(df)
part_size = total_rows // 3

# Three consecutive row slices covering the whole frame.
df1, df2, df3 = (
    df.iloc[:part_size],
    df.iloc[part_size:2*part_size],
    df.iloc[2*part_size:],
)

# Write each slice to its own Excel workbook, without the row index.
part_files = ('embeddings_part1.xlsx', 'embeddings_part2.xlsx', 'embeddings_part3.xlsx')
for part_frame, part_file in zip((df1, df2, df3), part_files):
    part_frame.to_excel(part_file, index=False)


In [4]:
# Load the three workbooks back, in part order.
part_files = ('embeddings_part1.xlsx', 'embeddings_part2.xlsx', 'embeddings_part3.xlsx')
df1, df2, df3 = map(pd.read_excel, part_files)

# Stack them into one frame with a fresh 0..n-1 row index.
merged_df = pd.concat([df1, df2, df3], ignore_index=True)

# Element-wise comparison against the frame that was split.
df == merged_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14299,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
14300,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
14301,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
14302,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [5]:
# Exact whole-frame equality check; the recorded result is False.
# NOTE(review): the cause is not visible here — possibly a dtype difference or
# a tiny float change from the Excel round trip; confirm.
df.equals(merged_df)


False

In [6]:
# Rule out a row/column count difference between the two frames.
print("Original shape:", df.shape)
print("Merged shape:  ", merged_df.shape)


Original shape: (14304, 384)
Merged shape:   (14304, 384)


In [7]:
# Rule out a row-index difference between the two frames.
print(df.index.equals(merged_df.index))  # Should be True


True


In [8]:
# Print both column-label lists so they can be compared by eye.
print(list(df.columns))
print(list(merged_df.columns))


['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '15

In [9]:
# Use this to compare numerically (allows small tolerance)
import numpy as np

# Only numeric columns are compared; equal_nan=True counts NaNs in matching
# positions as equal. The recorded result is True.
np.allclose(df.select_dtypes(include='number'),
            merged_df.select_dtypes(include='number'),
            equal_nan=True)


True

In [10]:
# Give both frames a fresh 0..n-1 row index and put the merged frame's columns
# in the same order as the original's.
df_reset = df.reset_index(drop=True)
merged_reset = merged_df.reset_index(drop=True)[df_reset.columns]

# Exact equality check on the normalised frames.
print(df_reset.equals(merged_reset))


False


In [11]:
# Load the saved predictions file; this rebinds `df` to the scored test frame.
df = pd.read_csv('/content/test_predictions.csv')

In [12]:
# Preview the first rows of the predictions frame.
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,prediction_label
0,16996,EXCELLENT ER RN Opportunity Available Now,"US, IL, Urbana",,,,"Our client, located in Urban, IL, is looking f...",,,0,1,0,Full-time,,,Hospital & Health Care,Health Care Provider,1
1,9358,Scrum Master / Website Development Project Man...,"US, FL, Tampa",,,352 Inc. is a full-service digital agency crea...,Other agencies may call this job “Project Mana...,Qualifications2-10 years of experience in webs...,What You’ll GetFreedom: We trust you to do you...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Information Technology and Services,Project Management,0
2,11562,HR Assistant - Contract,"AU, NSW, Sydney",People & Culture,,Squiz is one of the world's leading web soluti...,Squiz is an Australian owned and now multinati...,You could be a graduate or have many years of ...,,0,1,0,Contract,Not Applicable,Unspecified,Information Technology and Services,Human Resources,0
3,1106,Regional Sales Director South Africa,"ZA, GT, Johannesburg",Sales,,Upstream’s mission is to revolutionise the way...,The Regional Sales Director SA will help deriv...,Knowledge/Skills/ExperienceProven sales and ac...,"Includes attractive competitive base salary, c...",0,1,1,Full-time,Director,,Telecommunications,Sales,0
4,1981,Petrophysicist,"US, OK, Oklahoma City",,,Valor Services provides Workforce Solutions th...,About the CompanyThis is an amazing job opport...,"Education: Bachelor’s degree in Geology, Geoph...","Industry, Location and CompensationIndustry: E...",0,1,0,Full-time,Associate,Bachelor's Degree,Oil & Energy,,0


In [13]:

pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [14]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [15]:

# Load the 'all-MiniLM-L6-v2' sentence-embedding model.
# Note: this rebinds `model`, the name earlier cells use for the object returned by load_model.
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Join the five text columns into one string per row, separated by single
# spaces; missing values contribute an empty string.
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
text_data = df[text_columns[0]].fillna('')
for column_name in text_columns[1:]:
    text_data = text_data + ' ' + df[column_name].fillna('')

# Encode every combined string with the sentence-embedding model.
embeddings = model.encode(text_data.tolist())

# Wrap the result in a DataFrame (one row per input text).
embedding_df = pd.DataFrame(embeddings)