## Data Preparation

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, isnan, isnull
from pyspark.sql.types import IntegerType

# Create a SparkSession
spark = SparkSession.builder.appName("IndeedJobPostings") \
    .config("spark.driver.memory", "14g") \
    .config("spark.executor.memory", "14g") \
    .config("spark.driver.maxResultSize", "4g") \
    .config("spark.sql.autoBroadcastJoinThreshold", "10485760") \
    .getOrCreate()

# Assume the data is loaded into a DataFrame called 'df'
# If you need to load the data, uncomment and modify the following line:
df = spark.read.csv("fake_job_postings.csv", header=True, inferSchema=True)

# 1. Encode the label column and delete invalid data
df = df.withColumn("fraudulent", col("fraudulent").cast(IntegerType()))
df = df.filter((col("fraudulent") == 0) | (col("fraudulent") == 1))

# 2. List missing values per attribute and their percentages
def null_value_calc(df):
    null_columns_counts = []
    numRows = df.count()
    for k in df.columns:
        nullRows = df.where(col(k).isNull() | isnan(col(k))).count()
        percentage = (nullRows / numRows) * 100
        null_columns_counts.append((k, nullRows, percentage))
    return null_columns_counts

null_columns_calc = null_value_calc(df)

print("Null values per attribute:")
for item in null_columns_calc:
    print(f"{item[0]}: {item[1]} ({item[2]:.2f}%)")

# 3. Delete attributes with more than 1% null values
columns_to_drop = [item[0] for item in null_columns_calc if item[2] > 1]
df = df.drop(*columns_to_drop)

print(f"\nColumns dropped due to >1% null values: {columns_to_drop}")

# Show the resulting schema
print("\nResulting DataFrame schema:")
df.printSchema()

# Show the first few rows of the prepared dataset
print("\nFirst few rows of the prepared dataset:")
df.show(5, truncate=False)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/23 23:17:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/23 23:17:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

Null values per attribute:
job_id: 0 (0.00%)
title: 0 (0.00%)
location: 337 (1.99%)
department: 11039 (65.07%)
salary_range: 14258 (84.04%)
company_profile: 3206 (18.90%)
description: 0 (0.00%)
requirements: 2571 (15.15%)
benefits: 6949 (40.96%)
telecommuting: 0 (0.00%)
has_company_logo: 0 (0.00%)
has_questions: 0 (0.00%)
employment_type: 3273 (19.29%)
required_experience: 6675 (39.34%)
required_education: 7661 (45.16%)
industry: 4667 (27.51%)
function: 6158 (36.30%)
fraudulent: 0 (0.00%)

Columns dropped due to >1% null values: ['location', 'department', 'salary_range', 'company_profile', 'requirements', 'benefits', 'employment_type', 'required_experience', 'required_education', 'industry', 'function']

Resulting DataFrame schema:
root
 |-- job_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- telecommuting: string (nullable = true)
 |-- has_company_logo: string (nullable = true)
 |-- has_questions: string (nullable = tr

#### Verification code (to be added after the main data preparation code)
1. Encoding the label column and deleting invalid data:
   - The `groupBy("fraudulent").count().show()` command will display the count of rows for each unique value in the "fraudulent" column. We should only see two rows: one for 0 and one for 1. If there are any other values or if the column contains non-integer types, the task wasn't completed correctly.

2. Listing missing values per attribute and their percentages:
   - The `null_value_calc(df)` function calculates this, and the results are printed for each column. We can review these percentages to confirm they've been calculated correctly.

3. Deleting attributes with more than 1% null values:
   - The code prints the final list of columns in the DataFrame. We can compare this with the original list to see which columns were removed.
   - Additionally, it recalculates the null value percentages for the remaining columns. We should see that no column has more than 1% null values. If any do, it will print a warning.

4. Showing the first few rows of the prepared dataset:
   - This allows us to visually inspect the data and confirm that the "fraudulent" column only contains 0 or 1, and that the problematic columns have been removed.

In [2]:
# 1. Verify encoding of label column and deletion of invalid data
print("\nVerification 1: Encoding of label column and deletion of invalid data")
print("Unique values in 'fraudulent' column:")
df.groupBy("fraudulent").count().show()

# 2. Verify listing of missing values
print("\nVerification 2: Listing of missing values")
null_columns_calc = null_value_calc(df)
for item in null_columns_calc:
    print(f"{item[0]}: {item[1]} ({item[2]:.2f}%)")

# 3. Verify deletion of attributes with >1% null values
print("\nVerification 3: Deletion of attributes with >1% null values")
print("Columns in the final DataFrame:")
print(df.columns)

print("\nConfirming no columns have >1% null values:")
new_null_calc = null_value_calc(df)
for item in new_null_calc:
    if item[2] > 1:
        print(f"Warning: {item[0]} still has {item[2]:.2f}% null values")
    else:
        print(f"{item[0]}: {item[2]:.2f}% null values (OK)")

# 4. Show the first few rows of the final prepared dataset
print("\nFirst few rows of the final prepared dataset:")
df.show(5, truncate=False)


Verification 1: Encoding of label column and deletion of invalid data
Unique values in 'fraudulent' column:
+----------+-----+
|fraudulent|count|
+----------+-----+
|         1|  886|
|         0|16080|
+----------+-----+


Verification 2: Listing of missing values
job_id: 0 (0.00%)
title: 0 (0.00%)
description: 0 (0.00%)
telecommuting: 0 (0.00%)
has_company_logo: 0 (0.00%)
has_questions: 0 (0.00%)
fraudulent: 0 (0.00%)

Verification 3: Deletion of attributes with >1% null values
Columns in the final DataFrame:
['job_id', 'title', 'description', 'telecommuting', 'has_company_logo', 'has_questions', 'fraudulent']

Confirming no columns have >1% null values:
job_id: 0.00% null values (OK)
title: 0.00% null values (OK)
description: 0.00% null values (OK)
telecommuting: 0.00% null values (OK)
has_company_logo: 0.00% null values (OK)
has_questions: 0.00% null values (OK)
fraudulent: 0.00% null values (OK)

First few rows of the final prepared dataset:
+------+------------------------------

### Clean the datasets: remove anything that is not a letter, remove multiple spaces, lower case everything.

In [3]:
from pyspark.sql.functions import col, when, count, isnan, isnull, regexp_replace, lower, trim


# 4. Clean the text columns: remove non-letter characters, multiple spaces, and convert to lowercase
text_columns = [
    "title", "location", "department", "company_profile", "description", 
    "requirements", "benefits", "employment_type", "required_experience", 
    "required_education", "industry", "function"
]

for column in text_columns:
    if column in df.columns:
        df = df.withColumn(
            column,
            lower(
                trim(
                    regexp_replace(col(column), "[^a-zA-Z\\s]", "")
                )
            )
        )
        df = df.withColumn(column, regexp_replace(col(column), "\\s+", " "))

# Show the resulting schema
print("\nResulting DataFrame schema:")
df.printSchema()

# Show the first few rows of the cleaned dataset
print("\nFirst few rows of the cleaned dataset:")
df.show(5, truncate=False)



Resulting DataFrame schema:
root
 |-- job_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- telecommuting: string (nullable = true)
 |-- has_company_logo: string (nullable = true)
 |-- has_questions: string (nullable = true)
 |-- fraudulent: integer (nullable = true)


First few rows of the cleaned dataset:
+------+---------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Verification of data cleaning

Check the output of the verification steps. 

1. We should see zero counts for non-letter characters, multiple spaces, and uppercase letters in all columns.

2. Examine the sample of cleaned data. All text should be lowercase, with only single spaces between words, and no special characters or numbers.

3. Look at the column data types. All text columns should be of type 'string'.
Review the summary statistics. This will show us the minimum, maximum, and average lengths of the text in each column, which can help identify any unexpected values.

In [4]:
from pyspark.sql.functions import regexp_extract,length

# Get the list of columns that actually exist in the DataFrame
existing_columns = [col for col in text_columns if col in df.columns]

print(f"\nColumns present in the DataFrame: {existing_columns}")

# 1. Check for non-letter characters
def check_non_letter(column_name):
    return df.filter(regexp_extract(col(column_name), "[^a-z\\s]", 0) != "").count()

# 2. Check for multiple spaces
def check_multiple_spaces(column_name):
    return df.filter(regexp_extract(col(column_name), "\\s{2,}", 0) != "").count()

# 3. Check for uppercase letters
def check_uppercase(column_name):
    return df.filter(regexp_extract(col(column_name), "[A-Z]", 0) != "").count()

for column in existing_columns:
    print(f"\nChecking column: {column}")
    print(f"  Rows with non-letter characters: {check_non_letter(column)}")
    print(f"  Rows with multiple spaces: {check_multiple_spaces(column)}")
    print(f"  Rows with uppercase letters: {check_uppercase(column)}")

# Show a sample of cleaned data
print("\nSample of cleaned data:")
df.select(existing_columns).show(5, truncate=False)

# Print column data types
print("\nColumn data types:")
df.printSchema()

# Print summary statistics for string columns
print("\nSummary statistics for string columns:")
df.select([length(col(c)).alias(c) for c in existing_columns]).summary().show()



Columns present in the DataFrame: ['title', 'description']

Checking column: title


  Rows with non-letter characters: 0
  Rows with multiple spaces: 0
  Rows with uppercase letters: 0

Checking column: description


                                                                                

  Rows with non-letter characters: 0


                                                                                

  Rows with multiple spaces: 0


                                                                                

  Rows with uppercase letters: 0

Sample of cleaned data:
+---------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



+-------+------------------+-----------------+
|summary|             title|      description|
+-------+------------------+-----------------+
|  count|             16966|            16966|
|   mean|27.376340917128374|1136.124720028292|
| stddev|12.778408105827449|828.9325114404571|
|    min|                 3|                0|
|    25%|                18|              556|
|    50%|                25|              952|
|    75%|                33|             1501|
|    max|               134|            14292|
+-------+------------------+-----------------+



                                                                                

### Balance the dataset by undersampling the majority class

In [5]:
# First, let's check the class distribution
class_counts = df.groupBy("fraudulent").count().collect()
total_count = sum(row['count'] for row in class_counts)
print("\nClass distribution before balancing:")
for row in class_counts:
    print(f"Class {row['fraudulent']}: {row['count']} ({row['count']/total_count*100:.2f}%)")

# Identify the minority class
minority_class = min(class_counts, key=lambda x: x['count'])['fraudulent']
minority_count = min(class_counts, key=lambda x: x['count'])['count']

# Calculate the fraction for each class
fractions = {
    0: minority_count / df.filter(col("fraudulent") == 0).count(),
    1: minority_count / df.filter(col("fraudulent") == 1).count()
}

# Use sampleBy to balance the dataset
balanced_df = df.sampleBy("fraudulent", fractions, seed=42)

# Check the new class distribution
new_class_counts = balanced_df.groupBy("fraudulent").count().collect()
new_total_count = sum(row['count'] for row in new_class_counts)
print("\nClass distribution after balancing:")
for row in new_class_counts:
    print(f"Class {row['fraudulent']}: {row['count']} ({row['count']/new_total_count*100:.2f}%)")

# Show the resulting schema
print("\nResulting DataFrame schema:")
balanced_df.printSchema()

# Show the first few rows of the balanced dataset
print("\nFirst few rows of the balanced dataset:")
balanced_df.show(5, truncate=False)


Class distribution before balancing:
Class 1: 886 (5.22%)
Class 0: 16080 (94.78%)

Class distribution after balancing:
Class 1: 886 (50.66%)
Class 0: 863 (49.34%)

Resulting DataFrame schema:
root
 |-- job_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- telecommuting: string (nullable = true)
 |-- has_company_logo: string (nullable = true)
 |-- has_questions: string (nullable = true)
 |-- fraudulent: integer (nullable = true)


First few rows of the balanced dataset:
+------+---------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Split text into words, remove stopwords, and convert text into vectors (your choice of encoding). Help: feature extractors and feature transformers .

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.sql.functions import col, udf, concat_ws
from pyspark.sql.types import StringType

# Define the columns we want to process
text_columns = ["title", "description", "requirements"]

# Check which columns actually exist in the DataFrame
existing_columns = [c for c in text_columns if c in balanced_df.columns]

if not existing_columns:
    raise ValueError("None of the specified text columns exist in the DataFrame")

# Concatenate the existing text columns
balanced_df = balanced_df.withColumn("text", concat_ws(" ", *existing_columns))

# Create a pipeline for text processing and feature extraction
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="raw_features", vocabSize=10000)
idf = IDF(inputCol="raw_features", outputCol="features")

pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, idf])

# Fit and transform the data
model = pipeline.fit(balanced_df)
processed_df = model.transform(balanced_df)

# Select relevant columns
final_df = processed_df.select("job_id", "features", "fraudulent")

# Show the resulting schema
print("\nResulting DataFrame schema:")
final_df.printSchema()

# Show the first few rows of the processed dataset
print("\nFirst few rows of the processed dataset:")
final_df.show(5, truncate=False)

# Get vocabulary for reference
vocabulary = model.stages[2].vocabulary
print("\nTop 20 words in vocabulary:")
print(vocabulary[:20])

                                                                                


Resulting DataFrame schema:
root
 |-- job_id: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- fraudulent: integer (nullable = true)


First few rows of the processed dataset:
+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Perform a random split (70%,30%) of the data into training and test.

In [7]:
# Set a seed for reproducibility
seed = 42

# Perform the random split
train_df, test_df = final_df.randomSplit([0.7, 0.3], seed=seed)

# Cache the DataFrames for faster processing
train_df.cache()
test_df.cache()

# Print the sizes of the resulting datasets
print(f"Full dataset size: {final_df.count()}")
print(f"Training dataset size: {train_df.count()}")
print(f"Test dataset size: {test_df.count()}")

# Show a few examples from each dataset
print("\nSample from training dataset:")
train_df.show(3, truncate=False)

print("\nSample from test dataset:")
test_df.show(3, truncate=False)

## Optionally, you can save these DataFrames for future use
# train_df.write.parquet("dataset/train_data")
# test_df.write.parquet("dataset/test_data")

Full dataset size: 1749


                                                                                

Training dataset size: 1218
Test dataset size: 531

Sample from training dataset:
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
# Set log level to ERROR to suppress WARN messages
spark.sparkContext.setLogLevel("ERROR")

In [9]:
# First, get the input features dimension
num_features = len(train_df.select("features").first()[0])
num_features

10000

In [10]:
from pyspark.ml.classification import LogisticRegression, LinearSVC, RandomForestClassifier, MultilayerPerceptronClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
import pandas as pd
from datetime import datetime
import os

def initialize_models(num_features):
    """Initialize models and their parameter grids"""
    return {
        'LogisticRegression': {
            'model': LogisticRegression(labelCol="fraudulent", featuresCol="features"),
            'param_grid': ParamGridBuilder()
                .addGrid(LogisticRegression.regParam, [0.01, 0.1, 0.3])
                .addGrid(LogisticRegression.elasticNetParam, [0.0, 0.5, 1.0])
                .build()
        },
        'LinearSVC': {
            'model': Pipeline(stages=[
                StandardScaler(inputCol="features", 
                             outputCol="scaled_features",
                             withStd=True,
                             withMean=False),
                LinearSVC(labelCol="fraudulent", 
                         featuresCol="scaled_features",
                         standardization=False,
                         regParam=0.01,
                         maxIter=100,
                         threshold=0.0,
                         aggregationDepth=2)
            ]),
            'param_grid': ParamGridBuilder()
                .addGrid(LinearSVC.regParam, [0.1, 0.5, 1.0])
                .addGrid(LinearSVC.maxIter, [100])
                .build()
        },
        'RandomForest': {
            'model': RandomForestClassifier(labelCol="fraudulent", featuresCol="features"),
            'param_grid': ParamGridBuilder()
                .addGrid(RandomForestClassifier.numTrees, [10, 50, 100])
                .addGrid(RandomForestClassifier.maxDepth, [5, 10, 15])
                .build()
        },
        'MultilayerPerceptron': {
            'model': MultilayerPerceptronClassifier(
                labelCol="fraudulent",
                featuresCol="features",
                layers=[num_features, 20, 10, 2]
            ),
            'param_grid': ParamGridBuilder()
                .addGrid(MultilayerPerceptronClassifier.maxIter, [50, 100])
                .addGrid(MultilayerPerceptronClassifier.blockSize, [64, 128])
                .build()
        }
    }


In [11]:
def get_model_params(model):
    """Extract actual parameters from the trained model"""
    try:
        # Handle Pipeline case
        if isinstance(model, Pipeline):
            model = model.stages[-1]
        
        # Define extractors for each model type
        if isinstance(model, LogisticRegression):
            return {
                'regParam': round(float(model.get('regParam')), 3),
                'elasticNetParam': round(float(model.get('elasticNetParam')), 3)
            }
            
        elif isinstance(model, LinearSVC):
            return {
                'regParam': round(float(model.get('regParam')), 3),
                'maxIter': int(model.get('maxIter'))
            }
            
        elif isinstance(model, RandomForestClassifier):
            return {
                'numTrees': int(model.get('numTrees')),
                'maxDepth': int(model.get('maxDepth'))
            }
            
        elif isinstance(model, MultilayerPerceptronClassifier):
            return {
                'layers': model.get('layers'),
                'maxIter': int(model.get('maxIter')),
                'blockSize': int(model.get('blockSize'))
            }
            
    except Exception as e:
        print(f"Error in parameter extraction: {str(e)}")
        return {}
        
    return {}


In [12]:
def train_evaluate_model(model_info, train_df, test_df, evaluator):
    """Train and evaluate a single model with proper parameter extraction"""
    try:
        # Create CrossValidator
        cv = CrossValidator(
            estimator=model_info['model'],
            estimatorParamMaps=model_info['param_grid'],
            evaluator=evaluator.setMetricName("f1"),
            numFolds=10,
            seed=42
        )
        
        # Fit the model
        cv_model = cv.fit(train_df)
        
        # Get the best model and its parameters
        best_model = cv_model.bestModel
        best_params = get_model_params(best_model)
        
        # Get predictions
        predictions = cv_model.transform(test_df)
        
        # Get the best CV score
        cv_f1 = cv_model.avgMetrics[cv_model.avgMetrics.index(max(cv_model.avgMetrics))]
        
        # Calculate test metrics
        test_f1 = evaluator.setMetricName("f1").evaluate(predictions)
        test_accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)
        
        # Print debugging information
        print(f"\nBest parameters found:")
        for param, value in best_params.items():
            print(f"  {param}: {value}")
        
        return {
            'parameters': best_params,
            'cv_f1': cv_f1,
            'test_f1': test_f1,
            'test_accuracy': test_accuracy
        }
        
    except Exception as e:
        print(f"Error in model training/evaluation: {str(e)}")
        return None


In [13]:
def save_results(results, filename):
    """Save results to CSV file"""
    try:
        results_df = pd.DataFrame(results)
        results_df['parameters'] = results_df['parameters'].apply(
            lambda x: ', '.join([f'{k}: {v}' for k, v in x.items()])
        )
        
        if os.path.exists(filename):
            results_df.to_csv(filename, mode='a', header=False, index=False)
        else:
            results_df.to_csv(filename, index=False)
            
    except Exception as e:
        print(f"Error saving results: {str(e)}")
def print_results(results):
    """Print formatted results"""
    print("\nFinal Model Comparison:")
    print("="*80)
    for result in results:
        print(f"\nModel: {result['model']}")
        print(f"Parameters: {result['parameters']}")
        print(f"CV F1 Score: {result['cv_f1_score']:.4f}")
        print(f"Test F1 Score: {result['test_f1']:.4f}")
        print(f"Test Accuracy: {result['test_accuracy']:.4f}")
    
    # Print best model
    best_result = max(results, key=lambda x: x['test_f1'])
    print("\nBest Performing Model:")
    print("="*80)
    print(f"Model: {best_result['model']}")
    print(f"Parameters: {best_result['parameters']}")
    print(f"CV F1 Score: {best_result['cv_f1_score']:.4f}")
    print(f"Test F1 Score: {best_result['test_f1']:.4f}")
    print(f"Test Accuracy: {best_result['test_accuracy']:.4f}")

In [14]:
# Main execution
def main():
    # Initialize evaluator
    evaluator = MulticlassClassificationEvaluator(labelCol="fraudulent", metricName="f1")
    
    # Get number of features
    num_features = len(train_df.select("features").first()[0])
    
    # Initialize models
    models = initialize_models(num_features)
    
    # Store results
    results = []
    
    # Train and evaluate each model
    for model_name, model_info in models.items():
        print(f"\n{'='*50}")
        print(f"Training {model_name}...")
        print(f"{'='*50}")
        
        result = train_evaluate_model(model_info, train_df, test_df, evaluator)
        
        if result:
            results.append({
                'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'model': model_name,
                'parameters': result['parameters'],
                'cv_f1_score': round(result['cv_f1'], 4),
                'test_f1': round(result['test_f1'], 4),
                'test_accuracy': round(result['test_accuracy'], 4)
            })
            
            # Print individual model results
            print("\nModel Evaluation Results:")
            print("="*50)
            print(f"Parameters: {result['parameters']}")
            print(f"CV F1 Score: {result['cv_f1']:.4f}")
            print(f"Test F1 Score: {result['test_f1']:.4f}")
            print(f"Test Accuracy: {result['test_accuracy']:.4f}")
    
    # Save and print final results
    save_results(results, 'model_results_3.csv')
    print_results(results)

if __name__ == "__main__":
    main()


Training LogisticRegression...

Best parameters found:

Model Evaluation Results:
Parameters: {}
CV F1 Score: 0.8098
Test F1 Score: 0.7848
Test Accuracy: 0.7853

Training LinearSVC...

Best parameters found:

Model Evaluation Results:
Parameters: {}
CV F1 Score: 0.8409
Test F1 Score: 0.8343
Test Accuracy: 0.8343

Training RandomForest...


                                                                                


Best parameters found:

Model Evaluation Results:
Parameters: {}
CV F1 Score: 0.7469
Test F1 Score: 0.7520
Test Accuracy: 0.7533

Training MultilayerPerceptron...

Best parameters found:

Model Evaluation Results:
Parameters: {}
CV F1 Score: 0.8299
Test F1 Score: 0.8305
Test Accuracy: 0.8305

Final Model Comparison:

Model: LogisticRegression
Parameters: {}
CV F1 Score: 0.8098
Test F1 Score: 0.7848
Test Accuracy: 0.7853

Model: LinearSVC
Parameters: {}
CV F1 Score: 0.8409
Test F1 Score: 0.8343
Test Accuracy: 0.8343

Model: RandomForest
Parameters: {}
CV F1 Score: 0.7469
Test F1 Score: 0.7520
Test Accuracy: 0.7533

Model: MultilayerPerceptron
Parameters: {}
CV F1 Score: 0.8299
Test F1 Score: 0.8305
Test Accuracy: 0.8305

Best Performing Model:
Model: LinearSVC
Parameters: {}
CV F1 Score: 0.8409
Test F1 Score: 0.8343
Test Accuracy: 0.8343


In [15]:
spark.stop()