In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
diamonds = pd.read_csv('./data/diamonds.csv')

In [3]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   city     40455 non-null  object 
 3   depth    40455 non-null  float64
 4   table    40455 non-null  float64
 5   x        40455 non-null  float64
 6   y        40455 non-null  float64
 7   z        40455 non-null  float64
 8   cut      40455 non-null  object 
 9   color    40455 non-null  object 
 10  clarity  40455 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 3.4+ MB


# Opción 1
- Eliminamos las filas de 0
- Eliminamos columna depth
- Reducimos dimensionalidad de city por continent
- Mantenemos el resto de columnas
- One hot enconding sobre todas

- Resultado
diamonds_01

In [4]:
diamonds_1 = diamonds.copy()

In [5]:
diamonds_1.drop(diamonds_1[(diamonds_1['x'] == 0) | (diamonds_1['y'] == 0) | (diamonds_1['z'] == 0)].index, inplace=True)

In [6]:
diamonds_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40439 non-null  int64  
 1   carat    40439 non-null  float64
 2   city     40439 non-null  object 
 3   depth    40439 non-null  float64
 4   table    40439 non-null  float64
 5   x        40439 non-null  float64
 6   y        40439 non-null  float64
 7   z        40439 non-null  float64
 8   cut      40439 non-null  object 
 9   color    40439 non-null  object 
 10  clarity  40439 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 3.7+ MB


In [7]:
diamonds_1.drop('depth', axis=1, inplace=True)

In [59]:
city_map = {
    'Dubai': 'Asia',
    'Kimberly': 'Africa',
    'Las Vegas': 'America',
    'Tel Aviv': 'Asia',
    'Amsterdam': 'Europe',
    'Zurich': 'Europe',
    'Antwerp': 'Europe',
    'Madrid': 'Europe',
    'Paris': 'Europe',
    'Surat': 'Asia',
    'Luxembourg': 'Europe',
    'London': 'Europe',
    'New York City': 'America'
}

In [9]:
diamonds_1['continent'] = diamonds_1['city'].map(city_map)

In [10]:
diamonds_1.drop('city', axis=1, inplace=True)

In [11]:
diamonds_1 = pd.get_dummies(diamonds_1, drop_first=True, dtype=float)

In [12]:
diamonds_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   price              40439 non-null  int64  
 1   carat              40439 non-null  float64
 2   table              40439 non-null  float64
 3   x                  40439 non-null  float64
 4   y                  40439 non-null  float64
 5   z                  40439 non-null  float64
 6   cut_Good           40439 non-null  float64
 7   cut_Ideal          40439 non-null  float64
 8   cut_Premium        40439 non-null  float64
 9   cut_Very Good      40439 non-null  float64
 10  color_E            40439 non-null  float64
 11  color_F            40439 non-null  float64
 12  color_G            40439 non-null  float64
 13  color_H            40439 non-null  float64
 14  color_I            40439 non-null  float64
 15  color_J            40439 non-null  float64
 16  clarity_IF         40439 no

In [13]:
diamonds_1.to_csv('./data/diamonds_1.csv', index=False)

# Opción 2
- Eliminamos las filas de 0
- Eliminamos columna depth
- Eliminamos city
- Aplicamos label enconding sobre continent, clarity, cut y color

- Resultado
diamonds_02

In [14]:
diamonds_2 = diamonds.copy()

In [15]:
diamonds_2.drop(diamonds_2[(diamonds_2['x'] == 0) | (diamonds_2['y'] == 0) | (diamonds_2['z'] == 0)].index, inplace=True)

In [16]:
diamonds_2.drop('depth', axis=1, inplace=True)

In [17]:
diamonds_2.drop('city', axis=1, inplace=True)

In [56]:
cut_map = {'Fair': 1,
           'Good': 2,
           'Very Good': 3,
           'Premium': 4,
           'Ideal': 5}

In [19]:
diamonds_2['cut'] = diamonds_2['cut'].map(cut_map)

In [57]:
color_map = {'J': 1,
             'I': 2,
             'H': 3,
             'G': 4,
             'F': 5,
             'E': 6,
             'D': 7}

In [21]:
diamonds_2['color'] = diamonds_2['color'].map(color_map)

In [58]:
clarity_map = {'I1': 1,
               'SI2': 2,
               'SI1': 3,
               'VS2': 4,
               'VS1': 5,
               'VVS2': 6,
               'VVS1': 7,
               'IF': 8}

In [23]:
diamonds_2['clarity'] = diamonds_2['clarity'].map(clarity_map)

In [24]:
diamonds_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40439 non-null  int64  
 1   carat    40439 non-null  float64
 2   table    40439 non-null  float64
 3   x        40439 non-null  float64
 4   y        40439 non-null  float64
 5   z        40439 non-null  float64
 6   cut      40439 non-null  int64  
 7   color    40439 non-null  int64  
 8   clarity  40439 non-null  int64  
dtypes: float64(5), int64(4)
memory usage: 3.1 MB


In [25]:
diamonds_2.to_csv('./data/diamonds_2.csv', index=False)

# Opción 3
- Reemplazamos las filas de 0 por mean
- Eliminamos columna depth
- Eliminamos city
- Aplicamos label enconding sobre continent, clarity, cut y color con otros valores

- Resultado
diamonds_03

In [87]:
diamonds_3 = diamonds.copy()

In [88]:
media_x = diamonds_3['x'].mean()
media_y = diamonds_3['y'].mean()
media_z = diamonds_3['z'].mean()

In [89]:
diamonds_3['x'] = diamonds_3['x'].replace(0, media_x)
diamonds_3['y'] = diamonds_3['y'].replace(0, media_y)
diamonds_3['z'] = diamonds_3['z'].replace(0, media_z)

In [90]:
diamonds_3.drop('depth', axis=1, inplace=True)

In [91]:
diamonds_3.drop('city', axis=1, inplace=True)

In [92]:
cut_map_2 = {'Fair': 10,
           'Good': 50,
           'Very Good': 60,
           'Premium': 80,
           'Ideal': 100}

In [93]:
diamonds_3['cut'] = diamonds_3['cut'].map(cut_map_2)

In [94]:
color_map_2 = {'J': 100,
             'I': 200,
             'H': 300,
             'G': 400,
             'F': 500,
             'E': 600,
             'D': 700}

In [95]:
diamonds_3['color'] = diamonds_3['color'].map(color_map)

In [96]:
clarity_map = {'I1': 1,
               'SI2': 2,
               'SI1': 3,
               'VS2': 4,
               'VS1': 5,
               'VVS2': 6,
               'VVS1': 7,
               'IF': 8}

In [97]:
diamonds_3['clarity'] = diamonds_3['clarity'].map(clarity_map)

In [98]:
diamonds_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   table    40455 non-null  float64
 3   x        40455 non-null  float64
 4   y        40455 non-null  float64
 5   z        40455 non-null  float64
 6   cut      40455 non-null  int64  
 7   color    40455 non-null  int64  
 8   clarity  40455 non-null  int64  
dtypes: float64(5), int64(4)
memory usage: 2.8 MB


In [99]:
diamonds_3.to_csv('./data/diamonds_3.csv', index=False)

# Opción 4
- Eliminamos las filas de 0
- Eliminamos columna depth
- Nueva columna de zie
- Eliminamos columnas x y z 
- Eliminamos city
- Label enconding de color, clarity, cut 

- Resultado
diamonds_04

In [60]:
diamonds_4 = diamonds.copy()

In [61]:
diamonds_4.drop(diamonds_4[(diamonds_4['x'] == 0) | (diamonds_4['y'] == 0) | (diamonds_4['z'] == 0)].index, inplace=True)

In [62]:
diamonds_4.drop('depth', axis=1, inplace=True)

In [63]:
diamonds_4["size"] = diamonds_4["x"] * diamonds_4["y"] * diamonds_4["z"]

In [64]:
diamonds_4 = diamonds_4.drop('x', axis=1).drop('y', axis=1).drop('z', axis=1)

In [65]:
diamonds_4.drop('city', axis=1, inplace=True)

In [66]:
diamonds_4['cut'] = diamonds_4['cut'].map(cut_map)

In [67]:
diamonds_4['clarity'] = diamonds_4['clarity'].map(clarity_map)

In [68]:
diamonds_4['color'] = diamonds_4['color'].map(color_map)

In [69]:
diamonds_4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40439 non-null  int64  
 1   carat    40439 non-null  float64
 2   table    40439 non-null  float64
 3   cut      40439 non-null  int64  
 4   color    40439 non-null  int64  
 5   clarity  40439 non-null  int64  
 6   size     40439 non-null  float64
dtypes: float64(3), int64(4)
memory usage: 2.5 MB


In [70]:
diamonds_4.to_csv('./data/diamonds_4.csv', index=False)

# Opción 5
- Eliminamos las filas de 0
- Eliminamos columna depth
- Nueva columna de zie
- Eliminamos columnas x y z 
- Eliminamos columna de city
- One hot enconding

- Resultado
diamonds_05

In [71]:
diamonds_5 = diamonds.copy()

In [72]:
diamonds_5.drop(diamonds_5[(diamonds_5['x'] == 0) | (diamonds_5['y'] == 0) | (diamonds_5['z'] == 0)].index, inplace=True)

In [73]:
diamonds_5.drop('depth', axis=1, inplace=True)

In [74]:
diamonds_5["size"] = diamonds_5["x"] * diamonds_5["y"] * diamonds_5["z"]

In [75]:
diamonds_5 = diamonds_5.drop('x', axis=1).drop('y', axis=1).drop('z', axis=1)

In [76]:
diamonds_5.drop('city', axis=1, inplace=True)

In [77]:
diamonds_5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40439 non-null  int64  
 1   carat    40439 non-null  float64
 2   table    40439 non-null  float64
 3   cut      40439 non-null  object 
 4   color    40439 non-null  object 
 5   clarity  40439 non-null  object 
 6   size     40439 non-null  float64
dtypes: float64(3), int64(1), object(3)
memory usage: 2.5+ MB


In [78]:
diamonds_5 = pd.get_dummies(diamonds_5, drop_first=True, dtype=float)

In [80]:
diamonds_5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          40439 non-null  int64  
 1   carat          40439 non-null  float64
 2   table          40439 non-null  float64
 3   size           40439 non-null  float64
 4   cut_Good       40439 non-null  float64
 5   cut_Ideal      40439 non-null  float64
 6   cut_Premium    40439 non-null  float64
 7   cut_Very Good  40439 non-null  float64
 8   color_E        40439 non-null  float64
 9   color_F        40439 non-null  float64
 10  color_G        40439 non-null  float64
 11  color_H        40439 non-null  float64
 12  color_I        40439 non-null  float64
 13  color_J        40439 non-null  float64
 14  clarity_IF     40439 non-null  float64
 15  clarity_SI1    40439 non-null  float64
 16  clarity_SI2    40439 non-null  float64
 17  clarity_VS1    40439 non-null  float64
 18  clarity_VS2

In [81]:
diamonds_5.to_csv('./data/diamonds_5.csv', index=False)

# Opción 6
- Eliminamos las filas de 0
- Eliminamos columna depth
- Nueva columna de zie
- Eliminamos columnas x y z 
- Label enconding sobre color, clarity y cut
- One hot enconding por city

- Resultado
diamonds_06

In [125]:
diamonds_6 = diamonds.copy()

In [126]:
diamonds_6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   city     40455 non-null  object 
 3   depth    40455 non-null  float64
 4   table    40455 non-null  float64
 5   x        40455 non-null  float64
 6   y        40455 non-null  float64
 7   z        40455 non-null  float64
 8   cut      40455 non-null  object 
 9   color    40455 non-null  object 
 10  clarity  40455 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 3.4+ MB


In [127]:
diamonds_6.drop(diamonds_6[(diamonds_6['x'] == 0) | (diamonds_6['y'] == 0) | (diamonds_6['z'] == 0)].index, inplace=True)

In [128]:
diamonds_6.drop('depth', axis=1, inplace=True)

In [129]:
diamonds_6["size"] = diamonds_6["x"] * diamonds_6["y"] * diamonds_6["z"]

In [130]:
diamonds_6 = diamonds_6.drop('x', axis=1).drop('y', axis=1).drop('z', axis=1)

In [131]:
diamonds_6['cut'] = diamonds_6['cut'].map(cut_map)

In [132]:
diamonds_6['clarity'] = diamonds_6['clarity'].map(clarity_map)

In [133]:
diamonds_6['color'] = diamonds_6['color'].map(color_map)

In [134]:
diamonds_6.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40439 non-null  int64  
 1   carat    40439 non-null  float64
 2   city     40439 non-null  object 
 3   table    40439 non-null  float64
 4   cut      40439 non-null  int64  
 5   color    40439 non-null  int64  
 6   clarity  40439 non-null  int64  
 7   size     40439 non-null  float64
dtypes: float64(3), int64(4), object(1)
memory usage: 2.8+ MB


In [135]:
diamonds_6 = pd.get_dummies(diamonds_6, drop_first=True, dtype=float)

In [136]:
diamonds_6.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   price               40439 non-null  int64  
 1   carat               40439 non-null  float64
 2   table               40439 non-null  float64
 3   cut                 40439 non-null  int64  
 4   color               40439 non-null  int64  
 5   clarity             40439 non-null  int64  
 6   size                40439 non-null  float64
 7   city_Antwerp        40439 non-null  float64
 8   city_Dubai          40439 non-null  float64
 9   city_Kimberly       40439 non-null  float64
 10  city_Las Vegas      40439 non-null  float64
 11  city_London         40439 non-null  float64
 12  city_Luxembourg     40439 non-null  float64
 13  city_Madrid         40439 non-null  float64
 14  city_New York City  40439 non-null  float64
 15  city_Paris          40439 non-null  float64
 16  city_Sura

In [137]:
diamonds_6.to_csv('./data/diamonds_6.csv', index=False)

# Opción 7
- Eliminamos las filas de 0
- Eliminamos columna de city
- Label enconding sobre cut, color y clarity

- Resultado
diamonds_07

In [100]:
diamonds_7 = diamonds.copy()

In [101]:
diamonds_7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   city     40455 non-null  object 
 3   depth    40455 non-null  float64
 4   table    40455 non-null  float64
 5   x        40455 non-null  float64
 6   y        40455 non-null  float64
 7   z        40455 non-null  float64
 8   cut      40455 non-null  object 
 9   color    40455 non-null  object 
 10  clarity  40455 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 3.4+ MB


In [140]:
diamonds_7.drop(diamonds_7[(diamonds_7['x'] == 0) | (diamonds_7['y'] == 0) | (diamonds_7['z'] == 0)].index, inplace=True)

In [141]:
diamonds_7.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40439 non-null  int64  
 1   carat    40439 non-null  float64
 2   city     40439 non-null  object 
 3   depth    40439 non-null  float64
 4   table    40439 non-null  float64
 5   x        40439 non-null  float64
 6   y        40439 non-null  float64
 7   z        40439 non-null  float64
 8   cut      40439 non-null  object 
 9   color    40439 non-null  object 
 10  clarity  40439 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 3.7+ MB


In [142]:
diamonds_7.drop('city', axis=1, inplace=True)

In [143]:
diamonds_7['cut'] = diamonds_7['cut'].map(cut_map)

In [144]:
diamonds_7['clarity'] = diamonds_7['clarity'].map(clarity_map)

In [145]:
diamonds_7['color'] = diamonds_7['color'].map(color_map)

In [146]:
diamonds_7.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40439 non-null  int64  
 1   carat    40439 non-null  float64
 2   depth    40439 non-null  float64
 3   table    40439 non-null  float64
 4   x        40439 non-null  float64
 5   y        40439 non-null  float64
 6   z        40439 non-null  float64
 7   cut      40439 non-null  int64  
 8   color    40439 non-null  int64  
 9   clarity  40439 non-null  int64  
dtypes: float64(6), int64(4)
memory usage: 3.4 MB


In [147]:
diamonds_7.to_csv('./data/diamonds_7.csv', index=False)

# Opción 8
- Eliminamos las filas de 0
- Eliminamos columna depth
- Nueva columna de size
- Eliminamos columnas x y z 
- Reducimos la dimensionalidad de city por continent
- One hot enconding

- Resultado
diamonds_03

In [68]:
diamonds_8 = diamonds.copy()

In [71]:
diamonds_8["size"] = diamonds_8["x"] * diamonds_8["y"] * diamonds_8["z"]

In [69]:
diamonds_8.drop(diamonds_8[(diamonds_8['x'] == 0) | (diamonds_8['y'] == 0) | (diamonds_8['z'] == 0)].index, inplace=True)

In [72]:
diamonds_8 = diamonds_8.drop('x', axis=1).drop('y', axis=1).drop('z', axis=1)

In [70]:
diamonds_8.drop('depth', axis=1, inplace=True)

In [73]:
diamonds_8['continent'] = diamonds_8['city'].map(city_map)

In [76]:
diamonds_8 = pd.get_dummies(diamonds_8, drop_first=True, dtype=float)

In [77]:
diamonds_8.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40439 entries, 0 to 40454
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   price               40439 non-null  int64  
 1   carat               40439 non-null  float64
 2   table               40439 non-null  float64
 3   size                40439 non-null  float64
 4   city_Antwerp        40439 non-null  float64
 5   city_Dubai          40439 non-null  float64
 6   city_Kimberly       40439 non-null  float64
 7   city_Las Vegas      40439 non-null  float64
 8   city_London         40439 non-null  float64
 9   city_Luxembourg     40439 non-null  float64
 10  city_Madrid         40439 non-null  float64
 11  city_New York City  40439 non-null  float64
 12  city_Paris          40439 non-null  float64
 13  city_Surat          40439 non-null  float64
 14  city_Tel Aviv       40439 non-null  float64
 15  city_Zurich         40439 non-null  float64
 16  cut_Good 

In [78]:
diamonds_8.to_csv('./data/diamonds_8.csv', index=False)

# Opción 9
- Reemplazamos los valores por la media
- Eliminamos columna de city
- Label enconding sobre cut, color y clarity con los segundos valores

- Resultado
diamonds_09

In [102]:
diamonds_9 = diamonds.copy()

In [103]:
diamonds_9['x'] = diamonds_9['x'].replace(0, media_x)
diamonds_9['y'] = diamonds_9['y'].replace(0, media_y)
diamonds_9['z'] = diamonds_9['z'].replace(0, media_z)

In [104]:
diamonds_9.drop('city', axis=1, inplace=True)

In [105]:
diamonds_9['cut'] = diamonds_9['cut'].map(cut_map_2)

In [106]:
diamonds_9['clarity'] = diamonds_9['clarity'].map(clarity_map)

In [107]:
diamonds_9['color'] = diamonds_9['color'].map(color_map_2)

In [109]:
diamonds_9.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   depth    40455 non-null  float64
 3   table    40455 non-null  float64
 4   x        40455 non-null  float64
 5   y        40455 non-null  float64
 6   z        40455 non-null  float64
 7   cut      40455 non-null  int64  
 8   color    40455 non-null  int64  
 9   clarity  40455 non-null  int64  
dtypes: float64(6), int64(4)
memory usage: 3.1 MB


In [110]:
diamonds_9.to_csv('./data/diamonds_9.csv', index=False)