In [None]:
# Import necessary libraries
import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/66e31d6ee96cd_student_resource_3/student_resource 3/dataset/train.csv'
data = pd.read_csv(file_path)

# Extract numerical value and unit from the 'entity_value' column
data[['numeric_value', 'unit']] = data['entity_value'].str.extract(r'([0-9.]+)\s*(\w+)')

# Convert the 'numeric_value' to float for proper numerical operations
data['numeric_value'] = pd.to_numeric(data['numeric_value'], errors='coerce')

# Function to convert all units to a base unit
def convert_to_base_unit(row):
    value = row['numeric_value']
    unit = row['unit'].lower()

    # Standardize weight to grams
    if 'gram' in unit:
        return value  # Already in grams
    elif 'milligram' in unit:
        return value / 1000  # Convert mg to grams
    elif 'kilogram' in unit:
        return value * 1000  # Convert kg to grams
    # Standardize volume to liters
    elif 'milliliter' in unit or 'ml' in unit:
        return value / 1000  # Convert ml to liters
    elif 'liter' in unit:
        return value  # Already in liters
    elif 'cup' in unit:
        return value * 0.24  # Convert cups to liters (approx)
    elif 'ounce' in unit or 'oz' in unit:
        return value * 0.0295735  # Convert ounces to liters (fluid oz)
    # Handle unknown units by returning the original value
    else:
        return value

# Apply the function to convert units
data['standardized_value'] = data.apply(convert_to_base_unit, axis=1)

# Now that the units are standardized, we can drop the original numeric_value and unit columns
data_cleaned = data.drop(columns=['numeric_value', 'unit'])

# Display the cleaned dataset
data_cleaned.head()


Unnamed: 0,image_link,group_id,entity_name,entity_value,standardized_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,500.0
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,0.24
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,0.709
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,0.709
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,1400.0


In [None]:
# prompt: print various types  of groud_id and entity_name are there

print(data_cleaned['group_id'].unique())
print(data_cleaned['entity_name'].unique())
print(f"Number of unique ground_id values: {len(data_cleaned['group_id'].unique())}")
print(f"Number of unique entity_name values: {len(data_cleaned['entity_name'].unique())}")


[748919 916768 459516 731432 149159 308856 281678 252585 299791 884560
 179080 866516 524635 730429 881883 601746 487566 794161 639090 752266
 237000 844474 709627 523149 630390 810266 993359 529606 681445 365637
 767202 369753 630869 558374 254449 416664 241608 308671 442321 991868
 208023 593600 908443 893692 120569 564709 507988 599772 483370 983323
 271537 589105 641642 311997 318770 955292 150913 675317 801829 275506
 297918 507619 953031 387046 433914 922709 648011 957185 654649 375816
 178958 746096 749917 932012 396159 477578 917343 359859 273748 563130
 479564 625310 334327 120219 965518 449021 373285 211213 750220 489118
 978900 132401 226504 446789 225091 192132 249638 486636 252782 386873
 329793 969033 140266 145452 276700 558832 181357 658003 898898 204245
 347404 929999 628971 130591 557758 411423 107694 426261 397856 751532
 928606 934747 701880 152057 611510 296366 186035 918474 507848 501250
 488883 549052 267482 245959 507467 412008 926285 939587 519155 550840
 84722

In [None]:
# prompt: print the number of each type of group_id

print(data_cleaned['group_id'].value_counts())


group_id
459516    9458
752266    9063
281678    6137
308856    5437
731432    4741
          ... 
997333       2
656506       2
314298       2
178031       1
226428       1
Name: count, Length: 750, dtype: int64


In [None]:
data_cleaned['group_id'].value_counts().head(100)

Unnamed: 0_level_0,count
group_id,Unnamed: 1_level_1
459516,9458
752266,9063
281678,6137
308856,5437
731432,4741
...,...
929999,624
494658,622
861555,622
846116,622


In [None]:
# prompt: print the group  id's whose counts are lless than 10

group_counts = data_cleaned['group_id'].value_counts()
print(len(group_counts[group_counts < 10].index.tolist()))
# print(group_ids_less_than_10)


28


In [None]:
# prompt: print the group  id's whose counts are lless than 10

group_counts = data_cleaned['group_id'].value_counts()
print(len(group_counts[group_counts < 100].index.tolist()))
# print(group_ids_less_than_10)


366


In [None]:
print(data_cleaned['entity_name'].value_counts())


entity_name
item_weight                      102786
depth                             45127
width                             44183
height                            43597
voltage                            9466
wattage                            7755
item_volume                        7682
maximum_weight_recommendation      3263
Name: count, dtype: int64


In [None]:
# prompt: download all the data of image links from the column 'image_link in a different folder, and keep the name of the image as the link name

import pandas as pd
import requests
import os

# Create a new directory to store the images
!mkdir downloaded_images

# Function to download images
def download_image(image_url, save_path):
    try:
        response = requests.get(image_url, stream=True)
        if response.status_code == 200:
            with open(save_path, 'wb') as f:
                f.write(response.content)
            print(f"Image downloaded successfully: {save_path}")
        else:
            print(f"Failed to download image: {image_url}")
    except Exception as e:
        print(f"Error downloading image: {e}")

# Iterate over the DataFrame and download images
for index, row in data_cleaned.iterrows():
    image_url = row['image_link']
    if pd.notna(image_url):
        # Extract the image name from the URL
        image_name = image_url.split('/')[-1]
        save_path = os.path.join('/content/downloaded_images', image_name)
        download_image(image_url, save_path)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Image downloaded successfully: /content/downloaded_images/51LeFeNIATL.jpg
Image downloaded successfully: /content/downloaded_images/618xvKXPosL.jpg
Image downloaded successfully: /content/downloaded_images/616EC7RHG9L.jpg
Image downloaded successfully: /content/downloaded_images/619DqA7AiiL.jpg
Image downloaded successfully: /content/downloaded_images/51dtmp5tNZL.jpg
Image downloaded successfully: /content/downloaded_images/51-hLICC6uL.jpg
Image downloaded successfully: /content/downloaded_images/51DV8d1M45L.jpg
Image downloaded successfully: /content/downloaded_images/51srYEkTmmL.jpg
Image downloaded successfully: /content/downloaded_images/51Q0TGVkSyL.jpg
Image downloaded successfully: /content/downloaded_images/51HNYfRI9hL.jpg
Image downloaded successfully: /content/downloaded_images/51u5hPBsB0L.jpg
Image downloaded successfully: /content/downloaded_images/51nm+4uioiL.jpg
Image downloaded successfully: /content/downloa

In [None]:
# prompt: upload all the files from /content/downloaded_images  directory to /content/drive/MyDrive

!cp -r /content/downloaded_images/* /content/drive/MyDrive/downloaded_images/


/bin/bash: line 1: /usr/bin/cp: Argument list too long


In [1]:
# prompt: load /content/drive/MyDrive/66e31d6ee96cd_student_resource_3/student_resource 3/dataset/test.csv in dataframe

import pandas as pd
# Load the test dataset
test_file_path = '/content/drive/MyDrive/66e31d6ee96cd_student_resource_3/student_resource 3/dataset/test.csv'
test_data = pd.read_csv(test_file_path)


In [2]:
test_data.head()

Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth


In [4]:
# prompt: print all the categories count in column entity_name

print(test_data['entity_name'].value_counts())


entity_name
height                           32282
depth                            28146
width                            26931
item_weight                      22032
maximum_weight_recommendation     7028
voltage                           5488
wattage                           5447
item_volume                       3833
Name: count, dtype: int64


In [7]:

# Create a dictionary to store dataframes for each entity_name
entity_dataframes = {}

# Iterate over unique entity_names
for entity_name in test_data['entity_name'].unique():
  # Create a new dataframe for the current entity_name
  entity_dataframes[entity_name] = test_data[test_data['entity_name'] == entity_name]

# Now you have a dictionary where keys are entity_names and values are corresponding dataframes


{'height':          index                                         image_link  group_id  \
 0            0  https://m.media-amazon.com/images/I/110EibNycl...    156839   
 2            2  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
 5            5  https://m.media-amazon.com/images/I/11gHj8dhhr...    792578   
 7            7  https://m.media-amazon.com/images/I/11lshEUmCr...    156839   
 9            9  https://m.media-amazon.com/images/I/21-LmSmehZ...    478357   
 ...        ...                                                ...       ...   
 108311  108389  https://m.media-amazon.com/images/I/61GKsf-LQq...    931247   
 108313  108391  https://m.media-amazon.com/images/I/61GKwgK375...    970563   
 108314  108392  https://m.media-amazon.com/images/I/61GL80-3yF...    521308   
 108317  108395  https://m.media-amazon.com/images/I/61GLIRKeeY...    658003   
 108323  108401  https://m.media-amazon.com/images/I/61GLM10yY8...    483370   
 
        entity_name  
 0    

In [8]:
entity_dataframes


{'height':          index                                         image_link  group_id  \
 0            0  https://m.media-amazon.com/images/I/110EibNycl...    156839   
 2            2  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
 5            5  https://m.media-amazon.com/images/I/11gHj8dhhr...    792578   
 7            7  https://m.media-amazon.com/images/I/11lshEUmCr...    156839   
 9            9  https://m.media-amazon.com/images/I/21-LmSmehZ...    478357   
 ...        ...                                                ...       ...   
 108311  108389  https://m.media-amazon.com/images/I/61GKsf-LQq...    931247   
 108313  108391  https://m.media-amazon.com/images/I/61GKwgK375...    970563   
 108314  108392  https://m.media-amazon.com/images/I/61GL80-3yF...    521308   
 108317  108395  https://m.media-amazon.com/images/I/61GLIRKeeY...    658003   
 108323  108401  https://m.media-amazon.com/images/I/61GLM10yY8...    483370   
 
        entity_name  
 0    

In [9]:

# Find duplicate image links
duplicate_links = test_data[test_data['image_link'].duplicated(keep=False)]

# Group by image link and get the entity names
grouped_duplicates = duplicate_links.groupby('image_link')['entity_name'].apply(list).reset_index()

# Print the results
for index, row in grouped_duplicates.iterrows():
  print(f"{row['image_link']} : {row['entity_name']}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
https://m.media-amazon.com/images/I/61EOZ3MsAQL.jpg : ['depth', 'width']
https://m.media-amazon.com/images/I/61EOqo6TGXL.jpg : ['depth', 'width']
https://m.media-amazon.com/images/I/61EP0x9RMDL.jpg : ['height', 'width']
https://m.media-amazon.com/images/I/61EP1zjvCbL.jpg : ['height', 'depth']
https://m.media-amazon.com/images/I/61EP41UP1tL.jpg : ['width', 'depth']
https://m.media-amazon.com/images/I/61EP6Ath-rL.jpg : ['height', 'depth', 'width']
https://m.media-amazon.com/images/I/61EQEETAz3L.jpg : ['depth', 'width', 'height']
https://m.media-amazon.com/images/I/61EQQgB-fbL.jpg : ['depth', 'height']
https://m.media-amazon.com/images/I/61EQnMCupwL.jpg : ['height', 'width', 'depth']
https://m.media-amazon.com/images/I/61ERPAMWNSL.jpg : ['width', 'depth', 'height']
https://m.media-amazon.com/images/I/61ERYeGW-6L.jpg : ['height', 'depth', 'width']
https://m.media-amazon.com/images/I/61ERfT1T8iL.jpg : ['depth', 'height']
https

In [11]:
# prompt: make a csv for the group duplicates dataframe to download

grouped_duplicates.to_csv('duplicate_image_links.csv', index=False)


In [12]:
# prompt: download the duplicate_image_links.csv

from google.colab import files
files.download('duplicate_image_links.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
# prompt: load duplicate_image_links in a dataframe

import pandas as pd

# Load the CSV file into a DataFrame
duplicate_image_links_df = pd.read_csv('duplicate_image_links.csv')

# Display the DataFrame
duplicate_image_links_df

Unnamed: 0,image_link,entity_name
0,https://m.media-amazon.com/images/I/11TU2clswz...,"['width', 'height', 'depth']"
1,https://m.media-amazon.com/images/I/11gHj8dhhr...,"['depth', 'height', 'width']"
2,https://m.media-amazon.com/images/I/214CLs1ozn...,"['depth', 'height', 'width']"
3,https://m.media-amazon.com/images/I/218BCzgKxu...,"['item_weight', 'wattage', 'voltage']"
4,https://m.media-amazon.com/images/I/21GLFXwC1m...,"['width', 'depth']"
...,...,...
33994,https://m.media-amazon.com/images/I/91zGFUNJCb...,"['wattage', 'item_weight']"
33995,https://m.media-amazon.com/images/I/91zWoKcfiA...,"['maximum_weight_recommendation', 'item_weight']"
33996,https://m.media-amazon.com/images/I/A1KuAtMexv...,"['item_weight', 'maximum_weight_recommendation']"
33997,https://m.media-amazon.com/images/I/A1rdvZ5zDd...,"['item_weight', 'maximum_weight_recommendation']"


In [15]:
test_data.head()


Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth


In [19]:
len(test_data['image_link'].unique())

90666

In [20]:
duplicate_image_links_df.shape

(33999, 2)

In [21]:
# prompt: in test_df can you sort the dataframe, and keep the rows sorted in according to the image_link column

test_data_sorted = test_data.sort_values(by=['image_link'])


In [22]:
test_data_sorted.head()

Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth


In [23]:
# prompt: make a csv for the group duplicates dataframe to download

test_data_sorted.to_csv('test_data_sorted.csv', index=False)
# prompt: download the duplicate_image_links.csv

from google.colab import files
files.download('test_data_sorted.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>