# Diamonds TEST analysis & cleaning

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
# Global plotting defaults: a 16x9 figure size, then the 'ggplot' style sheet.
plt.rcParams.update({'figure.figsize': (16, 9)})
plt.style.use('ggplot')
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the test split into a DataFrame; the file is decoded as Latin-1.
test_csv_path = 'input/diamonds_test.csv'
diamonds_test = pd.read_csv(test_csv_path, encoding='latin-1')

## Diamonds_test analysis and cleaning

In [3]:
# Preview the first five rows of the freshly loaded frame.
diamonds_test.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.33,Very Good,I,IF,62.0,58.0,4.44,4.46,2.76
1,1,1.21,Very Good,D,SI2,62.4,58.0,6.77,6.83,4.24
2,2,1.06,Very Good,D,SI1,59.3,60.0,6.64,6.71,3.96
3,3,0.36,Ideal,E,VVS1,61.4,57.0,4.64,4.61,2.54
4,4,0.7,Ideal,E,VS1,62.3,54.0,5.67,5.72,3.55


- id: only for test & sample submission files, id for prediction sample identification
- price: price in USD (not present in this test file)
- carat: weight of the diamond
- cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- color: diamond colour, from J (worst) to D (best)
- clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
- x: length in mm
- y: width in mm
- z: depth in mm
- depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
- table: width of top of diamond relative to widest point (43--95)

In [None]:
# Dimensions of the test frame as a (rows, columns) tuple.
diamonds_test.shape

In [5]:
# Count missing values per column (the output shows there are none).

null_mask = diamonds_test.isnull()
null_mask.sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
dtype: int64

In [6]:
# Per-column dtypes; separates the numeric columns from the text ("object") ones.
diamonds_test.dtypes

id           int64
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [7]:
# Inspect the columns holding "object" values, one at a time.
# cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)

cut_column = diamonds_test["cut"]
cut_column.value_counts()

Ideal        5301
Premium      3449
Very Good    3056
Good         1267
Fair          412
Name: cut, dtype: int64

In [8]:
# color: diamond colour, graded from J (worst) to D (best)

color_column = diamonds_test["color"]
color_column.value_counts()

G    2727
E    2453
F    2380
H    2157
D    1712
I    1370
J     686
Name: color, dtype: int64

In [9]:
# clarity: how clear the diamond is, graded
# I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)

clarity_column = diamonds_test["clarity"]
clarity_column.value_counts()

SI1     3259
VS2     3054
SI2     2329
VS1     2072
VVS2    1288
VVS1     879
IF       425
I1       179
Name: clarity, dtype: int64

## Numerical data

In [10]:
# Summary statistics for the numeric columns only.
diamonds_test.select_dtypes(include='number').describe()

Unnamed: 0,id,carat,depth,table,x,y,z
count,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0
mean,6742.0,0.797397,61.762225,57.473059,5.730977,5.73236,3.538879
std,3892.928525,0.471413,1.427672,2.260095,1.11698,1.10938,0.690877
min,0.0,0.2,51.0,44.0,0.0,0.0,0.0
25%,3371.0,0.4,61.1,56.0,4.72,4.72,2.91
50%,6742.0,0.7,61.9,57.0,5.69,5.7,3.53
75%,10113.0,1.04,62.5,59.0,6.53,6.53,4.03
max,13484.0,4.01,73.6,95.0,10.14,10.1,6.17


In [11]:
# Drop the id column: it carries no information for this analysis.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because 'id' has already been removed. The redundant axis=1 is dropped too,
# since columns= already names the axis.

diamonds_test = diamonds_test.drop(columns='id', errors='ignore')
diamonds_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.33,Very Good,I,IF,62.0,58.0,4.44,4.46,2.76
1,1.21,Very Good,D,SI2,62.4,58.0,6.77,6.83,4.24
2,1.06,Very Good,D,SI1,59.3,60.0,6.64,6.71,3.96
3,0.36,Ideal,E,VVS1,61.4,57.0,4.64,4.61,2.54
4,0.7,Ideal,E,VS1,62.3,54.0,5.67,5.72,3.55


In [12]:
# Convert the categorical data to numeric

In [13]:
# One-hot encode the text columns (cut, color, clarity); numeric columns pass
# through unchanged.
# The indicator dtype is pinned to uint8 so the columns hold 0/1 on every
# pandas version: pandas >= 2.0 defaults to bool, which would show up as
# True/False here and in the exported CSV.
# NOTE(review): the dummy columns are derived from the categories present in
# this file only - confirm they match the columns produced for the train set.
diamonds_test_dummies = pd.get_dummies(diamonds_test, dtype=np.uint8)
diamonds_test_dummies.head()

Unnamed: 0,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.33,62.0,58.0,4.44,4.46,2.76,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
1,1.21,62.4,58.0,6.77,6.83,4.24,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1.06,59.3,60.0,6.64,6.71,3.96,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0.36,61.4,57.0,4.64,4.61,2.54,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0.7,62.3,54.0,5.67,5.72,3.55,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [14]:
# Persist the encoded frame as CSV, leaving the row index out of the file.
dummies_csv_path = "output/diamonds_test_dummies.csv"
diamonds_test_dummies.to_csv(dummies_csv_path, index=False)