In [3]:
import dask.dataframe as dd
import numpy as np
from tqdm import tqdm

# Define the paths to the CSV files
file_paths = [
    "G:/image_features.csv",
    "G:/image_features1.csv",
    "G:/image_features2.csv",
    "G:/image_features3.csv"
]

# Load the CSV files into Dask DataFrames
dfs = [dd.read_csv(file_path) for file_path in file_paths]

# Function to get 25,000 random indexes and retrieve the full rows
def get_random_rows(df, n=25000):
    # Convert the Dask DataFrame to a Pandas DataFrame
    pandas_df = df.compute()
    
    # Get the total number of rows in the DataFrame
    total_rows = len(pandas_df)
    
    # Generate 25,000 random indexes
    random_indexes = np.random.choice(total_rows, n, replace=False)
    
    # Retrieve the full rows for these random indexes
    random_rows = pandas_df.iloc[random_indexes].values.tolist()
    
    return random_rows

# Initialize an empty list to store all rows
all_random_rows = []

# Get the rows from each DataFrame
for df in tqdm(dfs, desc="Processing DataFrames"):
    rows = get_random_rows(df)
    all_random_rows.extend(rows)

# Print the total number of rows collected
print(len(all_random_rows))


Processing DataFrames: 100%|████████████████████████████████████████████████████████████| 4/4 [18:31<00:00, 277.99s/it]

100000





In [4]:
# Remove the first two items from each element in all_random_rows
new_all_random_rows = [row[2:] for row in all_random_rows]
np.shape(new_all_random_rows)

(100000, 2048)

In [5]:
import pickle
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans

# Assuming new_all_random_rows has been populated as described

# Convert new_all_random_rows to a NumPy array
data = np.array(new_all_random_rows)

batch_size = 10000
max_iter = 100

# Apply K-means clustering
kmeans = MiniBatchKMeans(n_clusters=1000, batch_size=batch_size, max_iter=max_iter, random_state=42)
kmeans.fit(data)

# Store the cluster centers
centers = kmeans.cluster_centers_

# Print the shape of the cluster centers
print(f"Shape of the cluster centers: {centers.shape}")

# Save the cluster centers to a pickle file
with open('cluster_centers.pkl', 'wb') as file:
    pickle.dump(centers, file)

print("Cluster centers have been saved to 'cluster_centers.pkl'.")


  super()._check_params_vs_input(X, default_n_init=3)


Shape of the cluster centers: (1000, 2048)
Cluster centers have been saved to 'cluster_centers.pkl'.


In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Load the CIFAR-100 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Normalize the pixel values between 0 and 1
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# Convert the labels to one-hot encoding
y_train = keras.utils.to_categorical(y_train, 100)
y_test = keras.utils.to_categorical(y_test, 100)


In [7]:
from PIL import Image
import numpy as np
import os
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.resnet import ResNet50, preprocess_input
import glob

target_size = (32, 32)  # Change the values as per your requirement
# Load the pre-trained ResNet50 model with modified input shape
model = ResNet50(weights='imagenet', include_top=False, pooling='avg', input_shape=(target_size[0], target_size[1], 3))

ft = model.predict(np.array(x_train).astype("float32"))

from sklearn.cluster import MiniBatchKMeans

n_clusters = 400
batch_size = 100
max_iter = 100

kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, max_iter=max_iter)
kmeans.fit(ft)
# Retrieve the cluster centers
ct = kmeans.cluster_centers_.tolist()

  super()._check_params_vs_input(X, default_n_init=3)


In [8]:
np.shape(ct)

(400, 2048)

In [9]:
import numpy as np
from tqdm import tqdm

# Initialize a list to store the distances
tot_dist = []

# Calculate L2 distances
for i in tqdm(range(len(ct))):
    distances = []
    for j in range(len(centers)):
        distance = np.linalg.norm(ct[i] - centers[j])
        distances.append(distance)
    tot_dist.append(distances)

# Convert the list to a numpy array
tot_dist = np.array(tot_dist)

# Print the shape of the resulting array
print(f"Shape of the distance array: {tot_dist.shape}")

100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [00:47<00:00,  8.47it/s]

Shape of the distance array: (400, 1000)





In [10]:
from ortools.linear_solver import pywraplp
dist_list =  np.transpose(tot_dist, (1, 0))
from tqdm import tqdm

costs = dist_list

num_workers = len(costs)
num_tasks = len(costs[0])
# Create the mip solver with the SCIP backend.
solver = pywraplp.Solver.CreateSolver("SCIP")

# x[i, j] is an array of 0-1 variables, which will be 1
# if worker i is assigned to task j.
x = {}
for i in range(num_workers):
    for j in range(num_tasks):
        x[i, j] = solver.IntVar(0, 1, "")
        
# Each worker is assigned to at most 1 task.
for i in range(num_workers):
    solver.Add(solver.Sum([x[i, j] for j in range(num_tasks)]) <= 1)

# Each task is assigned to exactly one worker.
for j in range(num_tasks):
    solver.Add(solver.Sum([x[i, j] for i in range(num_workers)]) == 1)
    
objective_terms = []
for i in tqdm(range(num_workers)):
    for j in range(num_tasks):
        objective_terms.append(costs[i][j] * x[i, j])
solver.Minimize(solver.Sum(objective_terms))

status = solver.Solve()

sol_indexes = []
if status == pywraplp.Solver.OPTIMAL or status == pywraplp.Solver.FEASIBLE:
    print(f"Total cost = {solver.Objective().Value()}\n")
    for i in range(num_workers):
        for j in range(num_tasks):
            # Test if x[i,j] is 1 (with tolerance for floating point arithmetic).
            if x[i, j].solution_value() > 0.1:
                print(f"Cluster {j} assigned to Class {i}." + f" Cost: {costs[i][j]}")
                sol_indexes.append(i)
else:
    print("No solution found.")

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:35<00:00, 27.81it/s]


Total cost = 34849.1477692988

Cluster 258 assigned to Class 1. Cost: 72.4741963365812
Cluster 156 assigned to Class 3. Cost: 94.81770189599621
Cluster 38 assigned to Class 4. Cost: 67.62513828161217
Cluster 18 assigned to Class 5. Cost: 99.71419070540072
Cluster 250 assigned to Class 7. Cost: 77.56734502728682
Cluster 379 assigned to Class 9. Cost: 85.95481599334808
Cluster 131 assigned to Class 10. Cost: 96.82082589561405
Cluster 81 assigned to Class 11. Cost: 83.23474367071579
Cluster 330 assigned to Class 12. Cost: 89.85992665338985
Cluster 342 assigned to Class 13. Cost: 97.68242741567232
Cluster 273 assigned to Class 15. Cost: 92.18520593956254
Cluster 187 assigned to Class 17. Cost: 70.95434333418568
Cluster 11 assigned to Class 18. Cost: 73.40117176791223
Cluster 200 assigned to Class 20. Cost: 88.25364549060023
Cluster 88 assigned to Class 21. Cost: 92.76846237132023
Cluster 225 assigned to Class 27. Cost: 79.01357612185525
Cluster 284 assigned to Class 28. Cost: 97.6161757290

Cluster 335 assigned to Class 821. Cost: 85.58309993214087
Cluster 172 assigned to Class 823. Cost: 78.67928109897555
Cluster 120 assigned to Class 825. Cost: 86.50832306712468
Cluster 253 assigned to Class 832. Cost: 84.39559613267117
Cluster 371 assigned to Class 834. Cost: 77.494028049695
Cluster 204 assigned to Class 836. Cost: 92.86625600382851
Cluster 13 assigned to Class 838. Cost: 86.84863031017761
Cluster 97 assigned to Class 839. Cost: 92.93403245868637
Cluster 122 assigned to Class 841. Cost: 90.46365333311196
Cluster 110 assigned to Class 843. Cost: 90.08984721083553
Cluster 157 assigned to Class 849. Cost: 85.73438771866394
Cluster 43 assigned to Class 851. Cost: 91.87545499713255
Cluster 99 assigned to Class 856. Cost: 84.57514705151775
Cluster 221 assigned to Class 857. Cost: 86.84573281388926
Cluster 215 assigned to Class 859. Cost: 84.29843899959208
Cluster 165 assigned to Class 861. Cost: 92.67550147615391
Cluster 308 assigned to Class 865. Cost: 99.21319958935577
Clu

In [11]:
len(sol_indexes)

400

In [12]:
with open('cvfinal.pkl', 'wb') as file:
    pickle.dump(sol_indexes, file)

print("Indexes have been saved to 'cvfinal.pkl'.")

Indexes have been saved to 'cvfinal.pkl'.


Time taken: 1.6200830936431885 seconds


In [1]:
import dask.dataframe as dd
import numpy as np
from scipy.spatial.distance import cdist
from dask.distributed import Client
import pickle
import time
from tqdm import tqdm
import dask

# Initialize Dask client
client = Client()

# Function to compute closest indexes
def compute_closest_indexes(chunk, centers, index_set):
    result_list = []
    for idx, row in chunk.iterrows():
        features = np.array(row[2:], dtype=float).reshape(1, -1)
        distances = cdist(features, centers, metric='euclidean')
        closest_center_idx = np.argmin(distances)
        if closest_center_idx in index_set:
            result_list.append((closest_center_idx, row[1]))
    return result_list

# Function to process elements of all DataFrames
def process_elements(dfs, centers, index_set, max_len_per_index=500):
    result_list = []
    index_count = {idx: 0 for idx in index_set}
    delayed_results = []

    for df in tqdm(dfs, desc="Processing DataFrames"):
        print(len(result_list))
        # Iterate over chunks of the DataFrame
        for chunk in tqdm(df.to_delayed(), desc="Processing chunks", leave=False):
            # Process each chunk in parallel
            delayed_result = dask.delayed(compute_closest_indexes)(chunk, centers, index_set)
            delayed_results.append(delayed_result)

        # Compute and collect results
        if delayed_results:
            chunk_results = dask.compute(*delayed_results)
            for chunk_result in chunk_results:
                for closest_center_idx, row_1 in chunk_result:
                    if index_count[closest_center_idx] < max_len_per_index:
                        result_list.append(row_1)
                        index_count[closest_center_idx] += 1
                        if index_count[closest_center_idx] >= max_len_per_index:
                            index_set.remove(closest_center_idx)
                        if all(count >= max_len_per_index for count in index_count.values()):
                            return result_list

    return result_list

# Measure the execution time
start_time = time.time()

# Load the pickle files
with open("F:/ML_notebooks/cvfinal.pkl", 'rb') as file:
    index = pickle.load(file)
with open("F:/ML_notebooks/cluster_centers.pkl", 'rb') as file:
    centers = pickle.load(file)

# Convert index to a set for faster lookup
index_set = set(index)

# Define the paths to the CSV files
file_paths = [
    "G:/image_features.csv",
    "G:/image_features1.csv",
    "G:/image_features2.csv",
    "G:/image_features3.csv"
]

# Load the CSV files into Dask DataFrames
dfs = [dd.read_csv(file_path) for file_path in file_paths]

# Call the function to process elements of each DataFrame
result_list = process_elements(dfs, centers, index_set)

# Save the result_list in a pickle file
with open("cifar_images.pkl", "wb") as file:
    pickle.dump(result_list, file)

end_time = time.time()

# Print the execution time
print(f"Execution time: {end_time - start_time} seconds")


Processing DataFrames:   0%|                                                                     | 0/4 [00:00<?, ?it/s]

0



Processing chunks:   0%|                                                                        | 0/97 [00:00<?, ?it/s][A
Processing chunks:  49%|██████████████████████████████▋                               | 48/97 [00:00<00:00, 462.78it/s][A
Processing chunks:  98%|████████████████████████████████████████████████████████████▋ | 95/97 [00:00<00:00, 451.70it/s][A
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
Processing DataFrames:  25%|███████████████                                             | 1/4 [14:43<44:10, 883.49s/it]

145498



Processing chunks:   0%|                                                                       | 0/101 [00:00<?, ?it/s][A
Processing chunks:  96%|██████████████████████████████████████████████████████████▌  | 97/101 [00:00<00:00, 954.96it/s][A
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
Processing DataFrames:  50%|█████████████████████████████▌                             | 2/4 [43:57<46:31, 1395.57s/it]

196087



Processing chunks:   0%|                                                                        | 0/57 [00:00<?, ?it/s][A
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
Processing DataFrames:  75%|██████████████████████████████████████████▊              | 3/4 [1:19:11<28:43, 1723.77s/it]

198356



Processing chunks:   0%|                                                                        | 0/43 [00:00<?, ?it/s][A
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
Processing DataFrames: 100%|█████████████████████████████████████████████████████████| 4/4 [2:00:42<00:00, 1810.68s/it]

Execution time: 7249.91442322731 seconds





In [None]:
print(len(result_list))