## **DATA 6250**
# **Machine Learning for Data Science**
## **Final Project**
## **Pre-Processing of Data**
## **Filling of Missing Values in Data**
### ***REFERENCE: EPOCH AI***
### ***Links to Dataset:***
- *Notable AI Models* : https://epoch.ai/data/notable_ai_models.csv
- *Large-Scale AI Models* : https://epoch.ai/data/large_scale_ai_models.csv
- *ML Hardware* : https://epoch.ai/data/ml_hardware.csv

#### Done By: Rohan Pratap Reddy Ravula
#### School of Computing and Data Science
#### Wentworth Institute of Technology

## Install the google-drive package

In [None]:
# Install the `google-drive` package from PyPI via a shell command.
# NOTE(review): the mount cell below imports `google.colab.drive`, not this
# package — confirm this install is actually needed.
!pip install google-drive



## Mount the Google Drive folder in the Colab notebook

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used in later
# cells (all under /content/drive/MyDrive/...) can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import required libraries

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
tqdm.pandas()

## Load the input dataset

In [None]:
# Read the input table from the mounted Drive folder into `df`, the main
# DataFrame that the rest of the notebook fills in and extends.
input_path = "/content/drive/MyDrive/DATA 6250/Datasets/Updated/Final/ai_models_final.csv"
df = pd.read_csv(input_path)

In [None]:
# Columns that contain at least one missing value; these are the candidates
# for imputation further down.
empty_cols = df.columns[df.isna().any()]
print(empty_cols)

Index(['Hardware unit Release Price (USD)', 'Hardware unit TDP (W)',
       'Hardware unit Compute FLOPS', 'Hardware unit Compute OPS',
       'Hardware unit Memory Size (bytes)', 'Notability', 'Epochs',
       'Batch size', 'Power draw (W)', 'Training compute cost'],
      dtype='object')


In [None]:
# Split the incomplete columns by dtype: numeric columns and object (string)
# columns are imputed differently in the automation cell.
num_cols = df[empty_cols].select_dtypes(include=np.number).columns
print(num_cols)
obj_cols = df[empty_cols].select_dtypes(include="object").columns
print(obj_cols)

Index(['Hardware unit Release Price (USD)', 'Hardware unit TDP (W)',
       'Hardware unit Compute FLOPS', 'Hardware unit Compute OPS',
       'Hardware unit Memory Size (bytes)', 'Epochs', 'Batch size',
       'Power draw (W)', 'Training compute cost'],
      dtype='object')
Index(['Notability'], dtype='object')


## Install sentence-transformers (used to load the 'all-mpnet-base-v2' model)

In [None]:
# Install/upgrade sentence-transformers and torch via a shell command.
# NOTE(review): `-U` with no version pins means the installed versions can
# differ between runs — consider pinning for reproducibility.
!pip install -U sentence-transformers torch



## Load the libraries

In [None]:
from sentence_transformers import SentenceTransformer,util
import torch

## Set the device to CUDA (if available) and load the sentence transformer

In [None]:
# Use the GPU when torch reports one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load the 'all-mpnet-base-v2' sentence-transformer on the selected device;
# `sen_model` is reused for every embedding computed in this notebook.
sen_model = SentenceTransformer('all-mpnet-base-v2',device=device)

## Create embeddings for the model names

In [None]:
# Embed every value of the 'Model' column in one batch call, then store one
# CPU tensor per row in an 'Embeddings' column so each row carries its own
# vector into the row-wise similarity helpers below.
embeddings = sen_model.encode(df['Model'].tolist(),convert_to_tensor=True)
df['Embeddings'] = [row for row in embeddings.cpu()]

## Create a function to find unmasked embeddings

In [None]:
def get_unmasked_embeddings(unmasked_df):
    """Stack the per-row embeddings of a DataFrame into a single tensor.

    Args:
        unmasked_df: DataFrame with an 'Embeddings' column whose cells are
            either torch tensors or values accepted by ``torch.tensor``.

    Returns:
        A ``(stacked, frame)`` tuple. ``stacked`` is the embeddings stacked
        along dim 0 and ``frame`` is ``unmasked_df`` with its index reset, so
        row ``k`` of ``stacked`` belongs to ``frame.iloc[k]``. For an empty
        input the result is ``(None, unmasked_df)``.
    """
    if unmasked_df.empty:
        return None, unmasked_df

    reindexed = unmasked_df.reset_index(drop=True)
    # Convert non-tensor cells so that torch.stack receives only tensors.
    as_tensors = [
        value if isinstance(value, torch.Tensor) else torch.tensor(value)
        for value in reindexed['Embeddings']
    ]
    return torch.stack(as_tensors, dim=0), reindexed

## Create a function to fill the values

In [None]:
def assign_values_to_features(row_vals, unmask_df, feature, confidence_val=0.5):
    """Borrow ``feature`` from the most similar row that already has it.

    Args:
        row_vals: Row (mapping/Series) with an 'Embeddings' entry.
        unmask_df: DataFrame of candidate rows, each with 'Embeddings' and
            ``feature``.
        feature: Name of the column whose value is copied.
        confidence_val: The best cosine similarity must be strictly greater
            than this for a value to be returned.

    Returns:
        The ``feature`` value of the best-matching candidate row, or ``None``
        when there are no candidates or the best similarity does not exceed
        ``confidence_val``.
    """
    query = row_vals['Embeddings']
    query = query if isinstance(query, torch.Tensor) else torch.tensor(query)
    query = query.unsqueeze(0)  # shape (1, dim) so cos_sim yields one row

    candidate_embs, candidates = get_unmasked_embeddings(unmask_df)
    if candidate_embs is None:
        return None

    scores = util.cos_sim(query, candidate_embs)[0]
    best_score, best_pos = scores.max(dim=0)
    if best_score.item() > confidence_val:
        return candidates.iloc[best_pos.item()][feature]
    return None

## Function to fill based on cosine similarity

In [None]:
def iter_based_sim_filling_num(df_new,feature,confidence_val=0.5,iter=3):
  """Iteratively fill missing values of a numeric column by similarity.

  On each pass, every row whose ``feature`` is missing receives the value of
  its most similar row (see ``assign_values_to_features``) that already has
  one. Rows filled in one pass become candidates in the next pass.

  Args:
    df_new: DataFrame with an 'Embeddings' column and ``feature``; modified
      in place and also returned.
    feature: Name of the numeric column to fill.
    confidence_val: Similarity threshold forwarded to
      ``assign_values_to_features``.
    iter: Maximum number of passes.

  Returns:
    ``df_new`` with ``feature`` coerced to numeric; values that could not be
    matched remain NaN.
  """
  for _ in tqdm(range(iter)):
    missing = df_new[feature].isna()
    # Nothing left to fill, or nothing to copy from: further passes are no-ops.
    if not missing.any() or missing.all():
      break
    mask = df_new[missing]
    unmask = df_new[~missing]
    # axis=1 is required so the lambda receives rows; without it pandas passes
    # columns and the 'Embeddings' lookup inside the helper fails.
    filled = mask.progress_apply(
        lambda row: assign_values_to_features(row, unmask, feature, confidence_val),
        axis=1,
    )
    df_new.loc[mask.index, feature] = filled
    # Unmatched rows come back as None; coerce so the column stays numeric.
    df_new[feature] = pd.to_numeric(df_new[feature], errors='coerce')
  return df_new

In [None]:
# Display the column names of `df`, now including the added 'Embeddings' column.
df.columns

Index(['Model', 'Domain', 'Country', 'Organization', 'Date', 'Category',
       'Task', 'Confidence', 'Hardware quantity', 'accessibility',
       'Training dataset', 'Training code accessibility', 'Parameters',
       'data size', 'Training time (hours)', 'Training compute (FLOP)',
       'Finetune compute (FLOP)', 'Hardware unit Release Price (USD)',
       'Hardware unit TDP (W)', 'Hardware unit Compute FLOPS',
       'Hardware unit Compute OPS', 'Hardware unit Memory Size (bytes)',
       'Training hardware', 'Notability', 'Epochs', 'Batch size',
       'Power draw (W)', 'Training compute cost', 'Embeddings'],
      dtype='object')

In [None]:
# Count the rows with no 'Training compute cost'. The printed shape equals the
# shape of the full frame printed in the next cell, i.e. the column is empty in
# every row, so it is excluded from similarity filling below.
mask =  df[df['Training compute cost'].isna()]
print(mask.shape)

(1705, 29)


In [None]:
# Shape of the full frame, for comparison with the missing-row count above.
print(df.shape)

(1705, 29)


## Code for automation for filling missing values

In [None]:
# Fill every incomplete column except 'Training compute cost'.
#   * numeric columns: up to 3 similarity passes (threshold 0.5); each pass
#     copies the value of the most similar row that already has one, and rows
#     filled in one pass become candidates in the next.
#   * object columns: a single similarity pass; anything still missing becomes
#     the literal 'Not-defined'.
#   * numeric fallback: remaining gaps get the root-mean-square of the known
#     values, or 0 when the column has no known values at all.
empty_cols = [col for col in empty_cols if col not in ['Training compute cost']]
for feature in tqdm(empty_cols):
    if feature in num_cols:
        for i in tqdm(range(3)):
            mask = df[df[feature].isna()]
            if mask.empty:
                break
            unmask = df[~df[feature].isna()]
            if unmask.empty:  # Skip if no unmasked data
                break
            # Keep the result in its own Series instead of assigning into the
            # filtered slice `mask`, which triggers SettingWithCopyWarning.
            filled = mask.progress_apply(lambda row: assign_values_to_features(row, unmask, feature, 0.5), axis=1)
            df.loc[mask.index, feature] = filled
            df[feature] = pd.to_numeric(df[feature], errors='coerce')
    if feature in obj_cols:
        mask = df[df[feature].isna()]
        if mask.empty:
            continue
        unmask = df[~df[feature].isna()]
        if unmask.empty:
            df[feature] = df[feature].fillna('Not-defined')
            continue
        filled = mask.progress_apply(lambda row: assign_values_to_features(row, unmask, feature, 0.5), axis=1)
        df.loc[mask.index, feature] = filled
        # Fill BEFORE casting: astype(str) would turn the remaining None/NaN
        # into the literal strings 'None'/'nan', which isna() no longer sees,
        # so the 'Not-defined' fallback would never apply.
        df[feature] = df[feature].fillna('Not-defined').astype(str)

    # Fallback for remaining NaNs
    if df[feature].isna().any():
        if feature in num_cols:
            mask = df[df[feature].isna()]
            unmask = df[~df[feature].isna()]
            if unmask.empty:
                df.loc[mask.index, feature] = 0  # Default value if no non-NaN data
            else:
                rms_value = np.sqrt((unmask[feature] ** 2).mean())
                df.loc[mask.index, feature] = rms_value
            df[feature] = pd.to_numeric(df[feature], errors='coerce')
        if feature in obj_cols:
            df[feature] = df[feature].fillna('Not-defined')


  0%|          | 0/9 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A

  0%|          | 0/419 [00:00<?, ?it/s][A[A

 11%|█         | 47/419 [00:00<00:00, 469.04it/s][A[A

 22%|██▏       | 94/419 [00:00<00:00, 461.60it/s][A[A

 34%|███▎      | 141/419 [00:00<00:00, 462.31it/s][A[A

 45%|████▍     | 188/419 [00:00<00:00, 456.13it/s][A[A

 56%|█████▌    | 234/419 [00:00<00:00, 448.59it/s][A[A

 67%|██████▋   | 279/419 [00:00<00:00, 445.45it/s][A[A

 77%|███████▋  | 324/419 [00:00<00:00, 445.68it/s][A[A

 89%|████████▉ | 372/419 [00:00<00:00, 454.71it/s][A[A

100%|██████████| 419/419 [00:00<00:00, 449.17it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mask[feature] = mask.progress_apply(lambda row: assign_values_to_features(row, unmask, feature,

## Drop the embeddings

In [None]:
# Remove the helper 'Embeddings' column in place so the tensors are not
# written to the CSV below.
df.drop(columns=['Embeddings'],inplace=True)

In [None]:
# Sanity check: list the columns that still contain missing values.
df.columns[df.isna().any()]

Index(['Training compute cost'], dtype='object')

## Store the data

In [None]:
# Write the filled table to Drive, creating the target folder if needed.
output_path_large_models = "/content/drive/MyDrive/DATA 6250/Datasets/Updated/Final/large_scale_ai_models_filled_new.csv"
path = os.path.dirname(output_path_large_models)
# exist_ok=True replaces the separate os.path.exists check.
os.makedirs(path, exist_ok=True)
# to_csv opens the target in write mode and overwrites an existing file, so no
# prior os.remove is needed (removing first would lose the old file if the
# write then failed).
df.to_csv(output_path_large_models, index=False)

## Compute embeddings for matching against the notable-models table

In [None]:
# Load a second table into `df_new`; its 'Hardware utilization' and
# 'Base model' columns are used in the cells below.
# NOTE(review): this is a relative path, unlike the Drive paths used elsewhere
# in the notebook — confirm the file exists in the working directory.
df_new = pd.read_csv('notable_ai_models_normalized.csv')

In [None]:
# Embed three lists of names for cross-table matching:
#   df_emb       - 'Model' names of the main frame `df`
#   df_new_model - 'Model' names of `df_new`
#   df_new_base  - 'Base model' names of `df_new`
# NOTE(review): astype(str) turns a missing 'Base model' into the string 'nan',
# which is then embedded like a real name — confirm this is acceptable.
df_emb = sen_model.encode(df['Model'].tolist(),convert_to_tensor=True,show_progress_bar=True)
df_new_model = sen_model.encode(df_new['Model'].tolist(),convert_to_tensor=True,show_progress_bar=True)
df_new_base = sen_model.encode(df_new['Base model'].astype(str).tolist(),convert_to_tensor=True,show_progress_bar=True)

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

Batches:   0%|          | 0/179 [00:00<?, ?it/s]

Batches:   0%|          | 0/179 [00:00<?, ?it/s]

## Compute the cosine-similarity matrices

In [None]:
# Pairwise cosine similarity between each `df` model name (rows) and each
# `df_new` 'Model' name / 'Base model' name (columns).
sim_model = util.cos_sim(df_emb,df_new_model)
sim_base = util.cos_sim(df_emb,df_new_base)

## Find the best matches

In [None]:
# For every row of `df`, the column position in `df_new` with the highest
# similarity (dim=1 takes the argmax across the `df_new` axis).
best_matches_model = torch.argmax(sim_model,dim=1)
best_matches_base = torch.argmax(sim_base,dim=1)

In [None]:
# Display the columns available in `df_new`.
df_new.columns

Index(['Model', 'Domain', 'Organization', 'Country', 'Date', 'Notability',
       'Training compute (FLOP)', 'Finetune compute (FLOP)', 'data size',
       'Epochs', 'Batch size', 'Training time (hours)', 'Power draw (W)',
       'Training compute cost', 'Confidence', 'Training hardware',
       'Hardware quantity', 'Hardware utilization', 'Category', 'Authors',
       'Notability criteria notes', 'Parameters', 'Parameters notes',
       'Training compute notes', 'Training dataset', 'Training dataset notes',
       'Dataset size notes', 'Training time notes',
       'Training compute cost (2023 USD)', 'Compute cost notes', 'Abstract',
       'Base model', 'Finetune compute notes', 'Batch size notes',
       'Model accessibility', 'Training code accessibility',
       'Inference code accessibility', 'Accessibility notes', 'Frontier model',
       'Training compute estimation method'],
      dtype='object')

In [None]:
# Copy 'Hardware utilization' from `df_new` for every `df` row whose best
# model-name match has cosine similarity above 0.7; other rows stay missing.
# The column starts as NaN rather than None: None yields an object column
# whose entries cannot be compared numerically later (None >= 0.1 raises
# TypeError), while NaN is still reported as missing by isna().
# Assumes `df` still has its default 0..n-1 index, so the enumerate position
# `i` is also the row label used by df.loc — TODO confirm.
df['Hardware utilization'] = np.nan
for i, best_match_index in enumerate(best_matches_model):
    similarity_score = sim_model[i][best_match_index].item()
    if similarity_score > 0.7:
      best_match_index = best_match_index.cpu().item()
      df.loc[i, 'Hardware utilization'] = df_new.loc[best_match_index, 'Hardware utilization']


In [None]:
# Split `df` into rows that still lack 'Hardware utilization' (`mask`) and
# rows that have it (`unmask`), then display how many are missing.
mask = df[df['Hardware utilization'].isna()]
unmask = df[~df['Hardware utilization'].isna()]
len(mask)

1476

In [None]:
# Total row count, for comparison with the number of missing rows above.
len(df)

1705

## Fill the values based on cosine similarity

In [None]:
# Second attempt for the rows still missing 'Hardware utilization': match each
# row's 'Model' name against the 'Base model' names of `df_new`.
# reset_index gives `mask` a 0..n-1 index so the enumerate position `i` can be
# used as a row label in mask.loc below.
mask = mask.reset_index(drop=True)
mask_emb = sen_model.encode(mask['Model'].tolist(),convert_to_tensor=True,show_progress_bar=True)
# Note: this overwrites the `sim_base` / `best_matches_base` computed earlier
# for the whole of `df` with versions restricted to the `mask` rows.
sim_base= util.cos_sim(mask_emb,df_new_base)
best_matches_base = torch.argmax(sim_base,dim=1)
for i, best_match_index in enumerate(best_matches_base):
    similarity_score = sim_base[i][best_match_index].item()
    # Same 0.7 threshold as the model-name matching pass.
    if similarity_score > 0.7:
      best_match_index = best_match_index.cpu().item()
      mask.loc[i,'Hardware utilization'] = df_new.loc[best_match_index,'Hardware utilization']


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

In [None]:
# Bring the base-model matches back next to the original rows: left-join on
# 'Model', use the '_mask' value only where `df` has none, then drop the
# helper column.
# NOTE(review): the result lives in `df_merged`, but the later visible cells
# keep operating on `df`, so these fills appear to be unused — confirm intent.
# NOTE(review): a join on 'Model' multiplies rows if a model name occurs more
# than once on either side — verify names are unique.
df_merged = df.merge(
    mask[['Model', 'Hardware utilization']],
    on='Model',
    how='left',
    suffixes=('', '_mask')
)
df_merged['Hardware utilization'] = df_merged['Hardware utilization'].fillna(df_merged['Hardware utilization_mask'])
df_merged.drop(columns=['Hardware utilization_mask'],inplace=True)

  df_merged['Hardware utilization'] = df_merged['Hardware utilization'].fillna(df_merged['Hardware utilization_mask'])


In [None]:
# Display the columns of `df`; 'Hardware utilization' is now the last one.
df.columns

Index(['Model', 'Domain', 'Country', 'Organization', 'Date', 'Category',
       'Task', 'Confidence', 'Hardware quantity', 'accessibility',
       'Training dataset', 'Training code accessibility', 'Parameters',
       'data size', 'Training time (hours)', 'Training compute (FLOP)',
       'Finetune compute (FLOP)', 'Hardware unit Release Price (USD)',
       'Hardware unit TDP (W)', 'Hardware unit Compute FLOPS',
       'Hardware unit Compute OPS', 'Hardware unit Memory Size (bytes)',
       'Training hardware', 'Notability', 'Epochs', 'Batch size',
       'Power draw (W)', 'Training compute cost', 'Hardware utilization'],
      dtype='object')

In [None]:
def fill_util(row):
  """Return the row's 'Hardware utilization', estimating it when missing.

  A present (non-NA) value is returned unchanged. Otherwise the estimate is
  ``(Hardware quantity * Hardware unit Compute FLOPS + 1e-8)`` divided by
  ``(Training compute (FLOP) + Finetune compute (FLOP))``.

  Args:
    row: Mapping/Series providing the columns named above.

  Returns:
    The existing or estimated utilization value.
  """
  current = row['Hardware utilization']
  if pd.isna(current):
    denominator = row['Training compute (FLOP)'] + row['Finetune compute (FLOP)']
    # The 1e-8 offset is added to the numerator, as in the original formula.
    numerator = row['Hardware quantity'] * (row['Hardware unit Compute FLOPS']) + 1e-8
    return numerator / denominator
  return current


In [None]:
def fill_new(row):
  """Keep a utilization of at least 0.1, otherwise replace it with an estimate.

  The estimate is ``(Hardware unit Compute FLOPS + 1e-8)`` divided by
  ``(Training compute (FLOP) + Finetune compute (FLOP))``. Unlike
  ``fill_util`` it does not multiply by 'Hardware quantity'.

  Args:
    row: Mapping/Series providing 'Hardware utilization',
      'Training compute (FLOP)', 'Finetune compute (FLOP)' and
      'Hardware unit Compute FLOPS'.

  Returns:
    The existing value when it is present and >= 0.1, else the estimate.
  """
  current = row['Hardware utilization']
  # Check for missing values first: NaN already fell through to the estimate,
  # but None (from an object-dtype column) raised TypeError on `>=`.
  if pd.notna(current) and current >= 0.1:
    return current
  compute_total = row['Training compute (FLOP)'] + row['Finetune compute (FLOP)']
  estimated_compute = (row['Hardware unit Compute FLOPS']) + 1e-8
  return estimated_compute / compute_total

In [None]:
# Recompute 'Hardware utilization' row by row with fill_new: values >= 0.1 are
# kept, everything else is replaced by fill_new's estimate.
# NOTE(review): this runs on `df`, not on the `df_merged` frame built above,
# and fill_util is defined but not called in the visible cells — confirm intent.
df['Hardware utilization'] = df.progress_apply(fill_new, axis=1)

100%|██████████| 1705/1705 [00:00<00:00, 90977.52it/s]


In [None]:
# Apply `calc_cos` to every row and replace `df` with the result.
# NOTE(review): `calc_cos` is not defined in any visible cell of this notebook;
# on a fresh kernel this line would raise NameError unless it is defined
# elsewhere — restore its definition above this cell.
df = df.progress_apply(calc_cos,axis=1)

100%|██████████| 1705/1705 [00:02<00:00, 783.20it/s]


In [None]:
# Store 'Epochs' and 'Batch size' as integers. astype(int) drops any fractional
# part (e.g. from imputed values) and raises if a value is still missing.
df['Epochs'] = df['Epochs'].astype(int)
df['Batch size'] = df['Batch size'].astype(int)

In [None]:
# Load the original large-scale models table from Drive.
# Note: this rebinds `df_new`, replacing the table loaded earlier under the
# same name.
in_path_new = "/content/drive/MyDrive/DATA 6250/Datasets/Original/large_scale_ai_models.csv"
df_new = pd.read_csv(in_path_new)

In [None]:
# Keep only the join key ('Model') and the three descriptive columns that will
# be attached to `df`; display the resulting shape.
df_new = df_new[['Model','Abstract','Link','Reference']].copy()
df_new.shape

(290, 4)

In [None]:
# Left-join the descriptive columns onto `df` by 'Model'; rows of `df` without
# a match get missing values in 'Abstract', 'Link' and 'Reference'.
# NOTE(review): duplicated 'Model' names in `df_new` would duplicate rows of
# `df` — verify the row count is unchanged after this cell.
df = pd.merge(df,df_new,on='Model',how='left')

In [None]:
# Re-derive the column groups on the merged frame (this rebinds the earlier
# `obj_cols` / `num_cols`):
#   obj_cols - all object (string) columns
#   out_cols - the five columns listed as output features in generate_text
#   num_cols - numeric columns that are not in out_cols
obj_cols = df.select_dtypes(include="object").columns
num_cols = df.select_dtypes(include=np.number).columns
out_cols = ['Parameters','Training compute (FLOP)', 'Finetune compute (FLOP)','Power draw (W)','Training compute cost']
num_cols = [col for col in num_cols if col not in out_cols]
print(obj_cols)
print(num_cols)
print(out_cols)

Index(['Model', 'Domain', 'Country', 'Organization', 'Date', 'Category',
       'Task', 'Confidence', 'accessibility', 'Training dataset',
       'Training code accessibility', 'Training hardware', 'Notability',
       'text', 'Abstract', 'Link', 'Reference'],
      dtype='object')
['Hardware quantity', 'data size', 'Training time (hours)', 'Hardware unit Release Price (USD)', 'Hardware unit TDP (W)', 'Hardware unit Compute FLOPS', 'Hardware unit Compute OPS', 'Hardware unit Memory Size (bytes)', 'Epochs', 'Batch size', 'Hardware utilization', 'Expected power (W)', 'Training Energy', 'Expected Energy']
['Parameters', 'Training compute (FLOP)', 'Finetune compute (FLOP)', 'Power draw (W)', 'Training compute cost']


## Create a function to merge it into a text column

In [None]:
def generate_text(row):
  """Render one row as a multi-line text description.

  The text starts with the model name, then lists every column in the
  module-level ``obj_cols`` (except 'Model') as ``name : value``, followed by a
  header line and every column in the module-level ``out_cols`` in the same
  format.

  Args:
    row: Mapping/Series providing 'Model' and all columns in ``obj_cols`` and
      ``out_cols``.

  Returns:
    The assembled string; each entry ends with ",\\n".
  """
  parts = [f"For the given model: {row['Model']},\n"]
  parts.extend(
      f"{feature} : {row[feature]},\n"
      for feature in obj_cols
      if feature != 'Model'
  )
  parts.append("The given output features is:\n")
  parts.extend(f"{feature} : {row[feature]},\n" for feature in out_cols)
  return "".join(parts)


In [None]:
# Build the 'text' column: one generated description per row.
df['text'] = df.progress_apply(generate_text,axis=1)

100%|██████████| 1705/1705 [00:00<00:00, 18594.00it/s]


In [None]:
# Drop the three merged descriptive columns in place now that the 'text'
# column (which includes their values via obj_cols) has been generated.
df.drop(columns=['Abstract','Link','Reference'],inplace=True)

In [None]:
# Recompute the column groups after the drop, this time as plain lists so they
# can be concatenated to define the final column order.
obj_cols = [col for col in df.select_dtypes(include="object").columns]
num_cols = [col for col in df.select_dtypes(include=np.number).columns if col not in out_cols]
print(obj_cols)
print(num_cols)
print(out_cols)

['Model', 'Domain', 'Country', 'Organization', 'Date', 'Category', 'Task', 'Confidence', 'accessibility', 'Training dataset', 'Training code accessibility', 'Training hardware', 'Notability', 'text']
['Hardware quantity', 'data size', 'Training time (hours)', 'Hardware unit Release Price (USD)', 'Hardware unit TDP (W)', 'Hardware unit Compute FLOPS', 'Hardware unit Compute OPS', 'Hardware unit Memory Size (bytes)', 'Epochs', 'Batch size', 'Hardware utilization', 'Expected power (W)', 'Training Energy', 'Expected Energy']
['Parameters', 'Training compute (FLOP)', 'Finetune compute (FLOP)', 'Power draw (W)', 'Training compute cost']


In [None]:
# Reorder the columns: object columns first, then numeric features, then the
# output columns last. Columns of any other dtype are not selected.
df = df[obj_cols + num_cols + out_cols]

In [None]:
# Check for duplicated (Model, text) pairs: the displayed shape shows how many
# distinct pairs remain after drop_duplicates.
# Note: this rebinds `df_new` once more; the result is only displayed here.
df_new = df[['Model','text']].copy()
df_new.drop_duplicates(inplace=True)
df_new.shape

(1705, 2)

## Store the data

In [None]:
# Write the final table (features, outputs and the 'text' column) to Drive,
# creating the target folder if needed.
output_path_large_models = "/content/drive/MyDrive/DATA 6250/Datasets/Updated/Final/AI_models.csv"
path = os.path.dirname(output_path_large_models)
# exist_ok=True replaces the separate os.path.exists check.
os.makedirs(path, exist_ok=True)
# to_csv opens the target in write mode and overwrites an existing file, so no
# prior os.remove is needed (removing first would lose the old file if the
# write then failed).
df.to_csv(output_path_large_models, index=False)