In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Preprocessing using Python**

1.   Load the original image
2.   Convert the image to float32
3.   Convert the image to grayscale
4.   Apply Difference of Gaussians (DoG) filter for high-frequency details
5.   Normalize the DoG filter
6.   Apply brightness adjustment
7.   Apply unsharp masking
8.   Normalize the unsharp mask
9.   Apply adaptive histogram equalization
10.  Save the preprocessed images in a new folder






In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os

def preprocess_image(image_path, dog_sigma1=1.0, dog_sigma2=2.0, brightness_factor=1.5, unsharp_strength=1.5):
    original_image = cv2.imread(image_path)

    original_image = original_image.astype(np.float32) / 255.0

    gray_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)

    blurred1 = cv2.GaussianBlur(gray_image, (0, 0), sigmaX=dog_sigma1)
    blurred2 = cv2.GaussianBlur(gray_image, (0, 0), sigmaX=dog_sigma2)
    dog_filter = blurred1 - blurred2

    normalized_dog = (dog_filter - np.min(dog_filter)) / (np.max(dog_filter) - np.min(dog_filter))

    enhanced_image = np.clip(brightness_factor * normalized_dog, 0, 1)

    blurred = cv2.GaussianBlur(enhanced_image, (0, 0), sigmaX=1.5)
    unsharp_mask = enhanced_image - unsharp_strength * blurred

    enhanced_image = np.clip(enhanced_image + unsharp_mask, 0, 1)

    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced_image = clahe.apply((enhanced_image * 255).astype(np.uint8))

    return original_image, enhanced_image

def preprocess_and_save(input_dir, output_dir, dog_sigma1=1.0, dog_sigma2=2.0, brightness_factor=1.5, unsharp_strength=1.5):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    files = os.listdir(input_dir)

    for file in files:
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(input_dir, file)

            original_image, enhanced_image = preprocess_image(image_path, dog_sigma1, dog_sigma2, brightness_factor, unsharp_strength)

            output_path = os.path.join(output_dir, file)
            cv2.imwrite(output_path, enhanced_image.astype(np.uint8))


input_directory = '/content/drive/MyDrive/BigData/Filtered_189'
output_directory = '/content/drive/MyDrive/BigData/Filtered_PREPROCESSED'

preprocess_and_save(input_directory, output_directory, dog_sigma1=1.5, dog_sigma2=6.0, brightness_factor=1.0, unsharp_strength=0.1)


# **Feature Extraction from Images**

In [None]:
pip install tensorflow pandas scikit-learn



In [None]:
pip install tensorflow



In [None]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg19 import VGG19, preprocess_input
import os
import numpy as np
import pandas as pd

data = pd.read_csv('/content/unique_ids_output.csv')

data['image_path'] = '/content/drive/MyDrive/BigData/Filtered_PREPROCESSED/' + data['imageName']

def load_and_extract_features(image_path):
    img = image.load_img(image_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)

    model = VGG19(weights='imagenet', include_top=False, pooling='avg')
    features = model.predict(img_array)
    return features.flatten()

data['features'] = data['image_path'].apply(lambda x: load_and_extract_features(x))

feature_columns = [f'feature_{i}' for i in range(len(data['features'][0]))]
data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)

data = data.drop(columns=['image_path', 'features'])




  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)
  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)
  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)
  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)
  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)
  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)
  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)
  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)
  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)
  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)
  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data.index)
  data[feature_columns] = pd.DataFrame(data['features'].tolist(), index=data

In [None]:
# Save the DataFrame with features to a new Excel file
data.to_csv('/content/features_VGG.csv', index=False)

# **Using Pyspark MLlib for Prediction**

In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=3978ad999c0ca30a1c156b5545f8c371c373de229dacd3872e8e6f3accaf75c5
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [28]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline  # Import the Pipeline module

# Create a Spark session
spark = SparkSession.builder.appName("MLPExample").getOrCreate()

csv_file_path = '/content/features_VGG.csv'

spark_df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

columns_to_drop = ['id', 'imageName']
spark_df = spark_df.drop(*columns_to_drop)

gender_indexer = StringIndexer(inputCol="gender", outputCol="gender_index")
spark_df = gender_indexer.fit(spark_df).transform(spark_df)

categorical_columns = ['skinColor', 'aspectOfHand']  # Add more columns as needed
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_index").fit(spark_df) for col in categorical_columns]
pipeline = Pipeline(stages=indexers)
spark_df = pipeline.fit(spark_df).transform(spark_df)

spark_df = spark_df.drop(*categorical_columns)

feature_columns = [col for col in spark_df.columns if col != 'gender' and col != 'gender_index']

vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
spark_df = vector_assembler.transform(spark_df)

input_size = len(feature_columns)
output_size = 2
layers = [input_size, 64, 32, output_size]

(train_data, test_data) = spark_df.randomSplit([0.8, 0.2], seed=1234)

mlp = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="gender_index",
    layers=layers,
    blockSize=128,
    maxIter=20,
    seed=1234
)

mlp_model = mlp.fit(train_data)

train_predictions = mlp_model.transform(train_data)

# Evaluate the model on train set
train_accuracy = evaluator.evaluate(train_predictions)
print(f"Train Accuracy: {train_accuracy}")

predictions = mlp_model.transform(test_data)

# Evaluate the model on test set
test_accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {test_accuracy}")

Train Accuracy: 0.9432624113475178
Test Accuracy: 0.875


In [29]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator


evaluator_precision = MulticlassClassificationEvaluator(labelCol="gender_index", predictionCol="prediction", metricName="weightedPrecision")
precision = evaluator_precision.evaluate(predictions)
print(f"Precision: {precision}")

evaluator_recall = MulticlassClassificationEvaluator(labelCol="gender_index", predictionCol="prediction", metricName="weightedRecall")
recall = evaluator_recall.evaluate(predictions)
print(f"Recall: {recall}")

evaluator_f1 = MulticlassClassificationEvaluator(labelCol="gender_index", predictionCol="prediction", metricName="f1")
f1 = evaluator_f1.evaluate(predictions)
print(f"F1 Score: {f1}")

evaluator_auc = BinaryClassificationEvaluator(labelCol="gender_index", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator_auc.evaluate(predictions)
print(f"AUC: {auc}")

spark.stop()


Precision: 0.8719287469287469
Recall: 0.875
F1 Score: 0.8715277777777778
AUC: 0.9252747252747252
