# Import packages

In [1]:
import requests
# import json
import pandas as pd

In [13]:
# Load the listings saved by earlier runs. Further down, the new scrape is appended
# to this frame and the result is written back to the same diamonds.csv file.
df = pd.read_csv('diamonds.csv')

In [111]:
url = 'https://www.brilliantearth.com/loose-diamonds/list/?shapes=Round&min_carat=0.71&max_carat=0.75&page=1'

In [112]:
# Fetch one page as a probe. A timeout keeps a stalled server from hanging the
# kernel, and raise_for_status() surfaces an HTTP error as such instead of as a
# confusing JSON decoding failure on an error page.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()
response = http_response.json()

In [113]:
# Inspect the probe response before writing the scraper: the overall number of
# matches, the page that was returned, how many records that page carries, and the
# query path the server echoes back.
print(f'total_count: {response["total_count"]}')
print(f'page: {response["page"]}')
print(f'num_records: {len(response["diamonds"])}')
print(f'path: {response["path"]}')

total_count: 1070
page: 1
num_records: 20
path: ?shapes=Round&min_carat=0.71&max_carat=0.75&page=1


In [114]:
class Diamonds:
    """Collect round loose-diamond listings from the Brilliant Earth list endpoint.

    Listings are requested one carat weight at a time: every request sends the
    current weight as both ``min_carat`` and ``max_carat``. ``get_diamonds`` walks
    from ``min_carat`` up to ``max_carat`` in 0.01 steps and pages through all
    result pages of each weight, accumulating the records on the instance.

    Args:
        page_num: Page to start from for the first carat weight. Every later
            weight starts again at page 1.
        min_carat: First carat weight to request. ``get_diamonds`` advances this
            attribute in place as it moves through the weights.
        max_carat: ``get_diamonds`` stops once ``min_carat`` is no longer below
            this value.

    Attributes:
        url: Base address of the list endpoint.
        num_records: Number of records collected so far.
        diamond_id, carat, color, clarity, cut, price: Parallel lists holding
            one entry per collected record.
    """

    # Records on a full result page; a shorter page means it was the last one.
    PAGE_SIZE = 20
    # Distance between two consecutive carat weights.
    CARAT_STEP = 0.01
    # Seconds to wait for the server before a request is abandoned.
    TIMEOUT = 30

    def __init__(self, page_num=1, min_carat=0.71, max_carat=0.75):

        self.page_num = page_num
        self.min_carat = min_carat
        self.max_carat = max_carat

        self.url = 'https://www.brilliantearth.com/loose-diamonds/list/'
        self.num_records = 0

        self.diamond_id = []
        self.carat = []
        self.color = []
        self.clarity = []
        self.cut = []
        self.price = []

    def _fetch_page(self):
        """Request page ``self.page_num`` of the current carat weight.

        Returns:
            The decoded JSON body of the response.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        params = {
            'shapes': 'Round',
            # Same value for both bounds: exactly one carat weight per query.
            'min_carat': self.min_carat,
            'max_carat': self.min_carat,
            'page': self.page_num,
        }
        reply = requests.get(self.url, params=params, timeout=self.TIMEOUT)
        reply.raise_for_status()
        return reply.json()

    def _collect_current_carat(self):
        """Page through the current carat weight and store every record."""
        while True:
            response = self._fetch_page()
            records = response['diamonds']

            for diamond in records:
                self.diamond_id.append(diamond['id'])
                self.carat.append(diamond['carat'])
                self.color.append(diamond['color'])
                self.clarity.append(diamond['clarity'])
                self.cut.append(diamond['cut'])
                self.price.append(diamond['price'])
                self.num_records += 1

            # Ceiling division: number of pages needed for total_count records.
            last_page = -(-response['total_count'] // self.PAGE_SIZE)

            # A short (or empty) page is always the final one; the page-count
            # check covers totals that are an exact multiple of the page size.
            if len(records) < self.PAGE_SIZE or self.page_num >= last_page:
                return

            print(f'Page {self.page_num} done.')
            self.page_num += 1

    def get_diamonds(self):
        """Collect all listings for every carat weight from min_carat to max_carat.

        The current weight is always fetched, even when ``min_carat`` already
        exceeds ``max_carat``. Progress is printed per page and one summary line
        is printed per carat weight. The work is done with plain loops rather
        than recursion, so the number of pages or weights cannot exhaust the
        call stack and each summary is printed exactly once.

        Raises:
            requests.HTTPError: If any request is answered with an error status.
        """
        while True:
            self._collect_current_carat()
            print(f'***** Retrived {self.num_records:,} diamonds of {self.min_carat:.2f} carat weight *****')

            if self.min_carat >= self.max_carat:
                break

            # Round after each step so binary float error cannot pile up and
            # either leak into the query string or upset the comparison above.
            self.min_carat = round(self.min_carat + self.CARAT_STEP, 2)
            self.page_num = 1

    def create_frame(self):
        """Return the collected records as a DataFrame, one row per diamond."""

        data = {
            'diamond_id': self.diamond_id,
            'carat': self.carat,
            'color': self.color,
            'clarity': self.clarity,
            'cut': self.cut,
            'price': self.price
        }

        return pd.DataFrame(data)
    

In [115]:
bling = Diamonds()

In [116]:
%%time
bling.get_diamonds()

Page 1 done.
Page 2 done.
Page 3 done.
Page 4 done.
Page 5 done.
Page 6 done.
Page 7 done.
Page 8 done.
Page 9 done.
Page 10 done.
Page 11 done.
Page 12 done.
Page 13 done.
Page 14 done.
Page 15 done.
Page 16 done.
Page 17 done.
***** Retrived 356 diamonds of 0.71 carat weight *****
Page 1 done.
Page 2 done.
Page 3 done.
Page 4 done.
Page 5 done.
Page 6 done.
Page 7 done.
Page 8 done.
Page 9 done.
Page 10 done.
***** Retrived 570 diamonds of 0.72 carat weight *****
Page 1 done.
Page 2 done.
Page 3 done.
Page 4 done.
Page 5 done.
***** Retrived 685 diamonds of 0.73 carat weight *****
Page 1 done.
***** Retrived 725 diamonds of 0.74 carat weight *****
Page 1 done.
Page 2 done.
Page 3 done.
Page 4 done.
Page 5 done.
Page 6 done.
Page 7 done.
Page 8 done.
Page 9 done.
Page 10 done.
Page 11 done.
Page 12 done.
Page 13 done.
Page 14 done.
Page 15 done.
Page 16 done.
Page 17 done.
***** Retrived 1,070 diamonds of 0.75 carat weight *****
***** Retrived 1,070 diamonds of 0.75 carat weight *****

In [117]:
len(bling.diamond_id)

1070

In [118]:
diamonds = bling.create_frame()

In [119]:
diamonds.head()

Unnamed: 0,diamond_id,carat,color,clarity,cut,price
0,10215744,0.71,I,SI2,Very Good,1530
1,10182297,0.71,G,SI2,Very Good,1730
2,10237740,0.71,I,SI2,Super Ideal,1740
3,10163703,0.71,I,SI2,Super Ideal,1800
4,10240138,0.71,E,SI2,Good,1830


In [120]:
diamonds.tail()

Unnamed: 0,diamond_id,carat,color,clarity,cut,price
1065,10205723,0.75,D,IF,Super Ideal,6370
1066,9919150,0.75,D,IF,Super Ideal,6600
1067,9974979,0.75,D,IF,Super Ideal,6630
1068,10010913,0.75,D,IF,Super Ideal,6980
1069,9873268,0.75,D,FL,Super Ideal,7610


In [121]:
len(diamonds['diamond_id'].unique())

1070

In [123]:
len(df)

40751

In [124]:
# Append the fresh scrape to the stored listings. The notebook treats diamond_id as
# the listing key, so repeated ids are dropped (the most recent row wins). This
# keeps the cell safe to re-run: without it, every re-run or overlapping scrape
# would add the same diamonds to diamonds.csv again.
df = (
    pd.concat([df, diamonds], ignore_index=True)
    .drop_duplicates(subset='diamond_id', keep='last')
    .reset_index(drop=True)
)

In [125]:
len(df)

41821

In [126]:
# Write the combined frame back over the same diamonds.csv that was loaded into df
# at the top of the notebook; index=False keeps the row index out of the file.
df.to_csv('diamonds.csv', index=False)

In [215]:
# List every row whose diamond_id occurs more than once. keep=False flags all
# copies of a repeated id rather than only the second and later ones.
diamonds[diamonds.duplicated(['diamond_id'], keep=False)]

Unnamed: 0,diamond_id,carat,color,clarity,cut,price
259,9563097,0.25,F,VS1,Super Ideal,620
261,9563097,0.25,F,VS1,Super Ideal,620
4270,10160929,0.30,D,VS1,Very Good,780
4271,10160929,0.30,D,VS1,Very Good,780
17914,10205003,0.37,D,VS1,Ideal,1110
...,...,...,...,...,...,...
58354,10238504,0.43,J,VVS1,Super Ideal,990
58355,9588287,0.43,E,SI1,Very Good,1000
58356,9966456,0.43,D,SI2,Super Ideal,1010
58357,10186043,0.43,J,VVS1,Super Ideal,1010


In [316]:
# NOTE(review): df2 is not assigned in any cell visible here, and the execution
# counts around this cell are out of order. Unless df2 is built elsewhere, a fresh
# Restart & Run All would stop here with a NameError; confirm where it comes from.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10458 entries, 0 to 10457
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   diamond_id  10458 non-null  int64  
 1   carat       10458 non-null  float64
 2   color       10458 non-null  object 
 3   clarity     10458 non-null  object 
 4   cut         10458 non-null  object 
 5   price       10458 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 490.3+ KB


In [199]:
df.tail()

Unnamed: 0,diamond_id,carat,color,clarity,cut,price
25984,10191979,0.49,F,VS1,Very Good,1940
25985,10223274,0.49,E,VVS1,Super Ideal,2040
25986,10099446,0.49,D,VS2,Super Ideal,2170
25987,9598921,0.49,D,VVS1,Super Ideal,2230
25988,9951008,0.49,D,IF,Super Ideal,2780


In [321]:
# Stack df and df2 into one frame with a fresh 0..n-1 index. Rows are appended
# as-is; nothing here removes listings that appear in both frames.
# NOTE(review): df2 is not assigned in any cell visible here; confirm its origin.
df3 = pd.concat(
    [df, df2],
    ignore_index=True
)

In [322]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36447 entries, 0 to 36446
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   diamond_id  36447 non-null  int64  
 1   carat       36447 non-null  float64
 2   color       36447 non-null  object 
 3   clarity     36447 non-null  object 
 4   cut         36447 non-null  object 
 5   price       36447 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 1.7+ MB


In [324]:
df3.tail()

Unnamed: 0,diamond_id,carat,color,clarity,cut,price
36442,10236892,0.58,E,VVS1,Super Ideal,3850
36443,9922838,0.58,D,IF,Super Ideal,3980
36444,9965270,0.58,D,IF,Super Ideal,4310
36445,10209495,0.58,D,FL,Super Ideal,4410
36446,10020378,0.58,D,FL,Super Ideal,5030
