In [None]:
import dask.dataframe as dd
import numpy as np
from tqdm import tqdm

# Define the paths to the CSV files
file_paths = [
    "G:/image_features.csv",
    "G:/image_features1.csv",
    "G:/image_features2.csv",
    "G:/image_features3.csv"
]

# Load the CSV files into Dask DataFrames
dfs = [dd.read_csv(file_path) for file_path in file_paths]

# Function to get 25,000 random indexes and retrieve the full rows
def get_random_rows(df, n=25000):
    # Convert the Dask DataFrame to a Pandas DataFrame
    pandas_df = df.compute()
    
    # Get the total number of rows in the DataFrame
    total_rows = len(pandas_df)
    
    # Generate 25,000 random indexes
    random_indexes = np.random.choice(total_rows, n, replace=False)
    
    # Retrieve the full rows for these random indexes
    random_rows = pandas_df.iloc[random_indexes].values.tolist()
    
    return random_rows

# Initialize an empty list to store all rows
all_random_rows = []

# Get the rows from each DataFrame
for df in tqdm(dfs, desc="Processing DataFrames"):
    rows = get_random_rows(df)
    all_random_rows.extend(rows)

# Print the total number of rows collected
print(len(all_random_rows))


Processing DataFrames:  75%|█████████████████████████████████████████████               | 3/4 [18:55<06:13, 373.03s/it]

In [None]:
# Remove the first two items from each element in all_random_rows
new_all_random_rows = [row[2:] for row in all_random_rows]
np.shape(new_all_random_rows)

In [None]:
import pickle
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans

# Assuming new_all_random_rows has been populated as described

# Convert new_all_random_rows to a NumPy array
data = np.array(new_all_random_rows)

batch_size = 10000
max_iter = 100

# Apply K-means clustering
kmeans = MiniBatchKMeans(n_clusters=1000, batch_size=batch_size, max_iter=max_iter, random_state=42)
kmeans.fit(data)

# Store the cluster centers
centers = kmeans.cluster_centers_

# Print the shape of the cluster centers
print(f"Shape of the cluster centers: {centers.shape}")

# Save the cluster centers to a pickle file
with open('cluster_centers.pkl', 'wb') as file:
    pickle.dump(centers, file)

print("Cluster centers have been saved to 'cluster_centers.pkl'.")


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Load the CIFAR-100 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Normalize the pixel values between 0 and 1
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# Convert the labels to one-hot encoding
y_train = keras.utils.to_categorical(y_train, 100)
y_test = keras.utils.to_categorical(y_test, 100)


In [None]:
from PIL import Image
import numpy as np
import os
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.resnet import ResNet50, preprocess_input
import glob

target_size = (32, 32)  # Change the values as per your requirement
# Load the pre-trained ResNet50 model with modified input shape
model = ResNet50(weights='imagenet', include_top=False, pooling='avg', input_shape=(target_size[0], target_size[1], 3))

ft = model.predict(np.array(x_train).astype("float32"))

from sklearn.cluster import MiniBatchKMeans

n_clusters = 50
batch_size = 100
max_iter = 100

kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, max_iter=max_iter)
kmeans.fit(ft)
# Retrieve the cluster centers
ct = kmeans.cluster_centers_.tolist()

In [None]:
np.shape(ct)

In [None]:
import numpy as np
from tqdm import tqdm

# Initialize a list to store the distances
tot_dist = []

# Calculate L2 distances
for i in tqdm(range(len(ct))):
    distances = []
    for j in range(len(centers)):
        distance = np.linalg.norm(ct[i] - centers[j])
        distances.append(distance)
    tot_dist.append(distances)

# Convert the list to a numpy array
tot_dist = np.array(tot_dist)

# Print the shape of the resulting array
print(f"Shape of the distance array: {tot_dist.shape}")

In [None]:
from ortools.linear_solver import pywraplp
dist_list =  np.transpose(tot_dist, (1, 0))
from tqdm import tqdm

costs = dist_list

num_workers = len(costs)
num_tasks = len(costs[0])
# Create the mip solver with the SCIP backend.
solver = pywraplp.Solver.CreateSolver("SCIP")

# x[i, j] is an array of 0-1 variables, which will be 1
# if worker i is assigned to task j.
x = {}
for i in range(num_workers):
    for j in range(num_tasks):
        x[i, j] = solver.IntVar(0, 1, "")
        
# Each worker is assigned to at most 1 task.
for i in range(num_workers):
    solver.Add(solver.Sum([x[i, j] for j in range(num_tasks)]) <= 1)

# Each task is assigned to exactly one worker.
for j in range(num_tasks):
    solver.Add(solver.Sum([x[i, j] for i in range(num_workers)]) == 1)
    
objective_terms = []
for i in tqdm(range(num_workers)):
    for j in range(num_tasks):
        objective_terms.append(costs[i][j] * x[i, j])
solver.Minimize(solver.Sum(objective_terms))

status = solver.Solve()

sol_indexes = []
if status == pywraplp.Solver.OPTIMAL or status == pywraplp.Solver.FEASIBLE:
    print(f"Total cost = {solver.Objective().Value()}\n")
    for i in range(num_workers):
        for j in range(num_tasks):
            # Test if x[i,j] is 1 (with tolerance for floating point arithmetic).
            if x[i, j].solution_value() > 0.1:
                print(f"Cluster {j} assigned to Class {i}." + f" Cost: {costs[i][j]}")
                sol_indexes.append(i)
else:
    print("No solution found.")

In [None]:
len(sol_indexes)

In [None]:
with open('cvfinal.pkl', 'wb') as file:
    pickle.dump(sol_indexes, file)

print("Indexes have been saved to 'cvfinal.pkl'.")

Time taken: 1.6200830936431885 seconds


In [1]:
import dask.dataframe as dd
import numpy as np
from scipy.spatial.distance import cdist
from dask.distributed import Client
import pickle
import time
from tqdm import tqdm
import dask

# Initialize Dask client
client = Client()

# Function to compute closest indexes
def compute_closest_indexes(chunk, centers, index_set):
    result_list = []
    for idx, row in chunk.iterrows():
        features = np.array(row[2:], dtype=float).reshape(1, -1)
        distances = cdist(features, centers, metric='euclidean')
        closest_center_idx = np.argmin(distances)
        if closest_center_idx in index_set:
            result_list.append(row[1])
    return result_list

# Function to process elements of all DataFrames until the result_list reaches 500,000
def process_elements(dfs, centers, index_set, max_len=500000):
    result_list = []
    delayed_results = []

    for df in tqdm(dfs, desc="Processing DataFrames"):
        print(f"Res {len(result_list)}")
        # Iterate over chunks of the DataFrame
        for chunk in tqdm(df.to_delayed(), desc="Processing chunks", leave=False):
            # Process each chunk in parallel
            delayed_result = dask.delayed(compute_closest_indexes)(chunk, centers, index_set)
            delayed_results.append(delayed_result)
            # Check if we have reached the max length
            if len(result_list) >= max_len:
                break

        # Compute and collect results
        if delayed_results:
            chunk_results = dask.compute(*delayed_results)
            for chunk_result in chunk_results:
                result_list.extend(chunk_result)
                if len(result_list) >= max_len:
                    return result_list[:max_len]

    return result_list

# Measure the execution time
start_time = time.time()

# Load the pickle files
with open("F:/ML_notebooks/cvfinal.pkl", 'rb') as file:
    index = pickle.load(file)
with open("F:/ML_notebooks/cluster_centers.pkl", 'rb') as file:
    centers = pickle.load(file)

# Convert index to a set for faster lookup
index_set = set(index)

# Define the paths to the CSV files
file_paths = [
    "G:/image_features.csv",
    "G:/image_features1.csv",
    "G:/image_features2.csv",
    "G:/image_features3.csv"
]

# Load the CSV files into Dask DataFrames
dfs = [dd.read_csv(file_path) for file_path in file_paths]

# Call the function to process elements of each DataFrame
result_list = process_elements(dfs, centers, index_set)

# Save the result_list in a pickle file
with open("cifar_images.pkl", "wb") as file:
    pickle.dump(result_list, file)

end_time = time.time()

# Print the execution time
print(f"Execution time: {end_time - start_time} seconds")


Processing DataFrames:   0%|                                                       | 0/4 [00:00<?, ?it/s]

Res 0



Processing chunks:   0%|                                                          | 0/97 [00:00<?, ?it/s][A
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
Processing DataFrames:  25%|███████████▌                                  | 1/4 [13:45<41:15, 825.25s/it]

Res 203124



Processing chunks:   0%|                                                         | 0/101 [00:00<?, ?it/s][A
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
Processing DataFrames:  25%|██████████▊                                | 1/4 [40:45<2:02:15, 2445.16s/it]


Execution time: 2454.3678781986237 seconds
