In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import preprocessing
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv('train.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
display(df.head(3))

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.01,Very Good,E,SI2,60.0,60.0,4540,6.57,6.49,3.92
1,1.1,Premium,H,VS2,62.5,58.0,5729,6.59,6.54,4.1
2,1.5,Good,E,SI2,61.5,65.0,6300,7.21,7.17,4.42


In [3]:
df.isna().sum()
df.describe()


Unnamed: 0,carat,depth,table,price,x,y,z
count,43154.0,43154.0,43154.0,43154.0,43154.0,43154.0,43154.0
mean,0.799047,61.742925,57.45901,3946.777054,5.733798,5.737574,3.539338
std,0.475214,1.42841,2.227191,3998.657385,1.123004,1.150325,0.696203
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,953.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2406.5,5.7,5.71,3.53
75%,1.04,62.5,59.0,5367.0,6.54,6.54,4.04
max,4.5,79.0,79.0,18823.0,10.23,58.9,8.06


In [4]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
print(Q1 - 1.5 * IQR)
print(Q3 + 1.5 * IQR)

carat      -0.560
depth      58.750
table      51.500
price   -5668.000
x           1.965
y           1.990
z           1.215
dtype: float64
carat        2.000
depth       64.750
table       63.500
price    11988.000
x            9.285
y            9.270
z            5.735
dtype: float64


In [5]:
df.drop(df.query("z>  5.735 or y>9.270 or x>9.285").index, inplace=True)
df.drop(df.query("z<  1.215 or y<1.99 or x<1.965").index, inplace=True)

In [6]:
df['vol'] = df.x * df.y * df.z
df.drop(['x','y','z'], axis=1, inplace=True)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,vol
0,1.01,Very Good,E,SI2,60.0,60.0,4540,167.146056
1,1.1,Premium,H,VS2,62.5,58.0,5729,176.70426
2,1.5,Good,E,SI2,61.5,65.0,6300,228.494994
3,1.53,Premium,E,SI1,61.3,59.0,12968,245.8428
4,0.84,Fair,D,SI2,64.5,60.0,2167,131.030912


In [7]:
df.cut.unique()

array(['Very Good', 'Premium', 'Good', 'Fair', 'Ideal'], dtype=object)

In [8]:
df.cut.replace({'Ideal':5, 'Premium':4, 'Good':2, 'Very Good':3, 'Fair':1}, inplace=True)

In [9]:
df.color.unique()

array(['E', 'H', 'D', 'F', 'G', 'I', 'J'], dtype=object)

In [10]:
df.color.replace({'E':2, 'I':6, 'J':7, 'H':5, 'F':3, 'G':4, 'D':1}, inplace=True)

In [11]:
df.clarity.unique()

array(['SI2', 'VS2', 'SI1', 'VVS1', 'VS1', 'VVS2', 'IF', 'I1'],
      dtype=object)

In [12]:
df.clarity.replace({'SI2':1, 'SI1':2, 'VS1':3, 'VS2':4, 'VVS2':5, 'VVS1':6, 'I1':7, 'IF':8}, inplace=True)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,vol
0,1.01,3,2,1,60.0,60.0,4540,167.146056
1,1.1,4,5,4,62.5,58.0,5729,176.70426
2,1.5,2,2,1,61.5,65.0,6300,228.494994
3,1.53,4,2,2,61.3,59.0,12968,245.8428
4,0.84,1,1,1,64.5,60.0,2167,131.030912


In [13]:
X = df.drop(['price'], axis=1)
y = df['price']

In [14]:
X_train = X
y_train = y

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [16]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 100,random_state=42)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)


from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_log_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 291.1893876940047
MSE: 0.012866445853463023
RMSE: 570.8569069709835


In [16]:
model = rf
test_df = pd.read_csv('test.csv')

In [17]:
test_df.cut.unique()

array(['Ideal', 'Very Good', 'Fair', 'Premium', 'Good'], dtype=object)

In [18]:
test_df.cut.replace({'Ideal':5, 'Premium':4, 'Good':2, 'Very Good':3, 'Fair':1}, inplace=True)

In [19]:
test_df.color.unique()

array(['G', 'F', 'E', 'D', 'H', 'J', 'I'], dtype=object)

In [20]:
test_df.color.replace({'E':2, 'I':6, 'J':7, 'H':5, 'F':3, 'G':4, 'D':1}, inplace=True)

In [21]:
test_df.clarity.unique()

array(['VVS1', 'VS2', 'SI1', 'SI2', 'IF', 'VS1', 'VVS2', 'I1'],
      dtype=object)

In [22]:
test_df.clarity.replace({'SI2':1, 'SI1':2, 'VS1':3, 'VS2':4, 'VVS2':5, 'VVS1':6, 'I1':7, 'IF':8}, inplace=True)

In [23]:
test_df['vol'] = test_df.x * test_df.y * test_df.z
test_df.drop(['id','x','y','z'], axis=1, inplace=True)
test_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,vol
0,0.24,5,4,6,62.1,56.0,39.2236
1,1.21,3,3,4,62.9,54.0,197.905488
2,0.5,1,2,2,61.7,68.0,79.880424
3,0.5,5,1,1,62.8,56.0,80.682206
4,1.55,5,2,1,62.3,55.0,252.779208


In [24]:
prediction = model.predict(test_df)

array([ 553.05714286, 8492.71      , 1217.65      , ..., 1535.88      ,
       3570.11      ,  990.90733333])

In [25]:
results_df=pd.DataFrame(prediction,columns=['price'])
results_df.insert(0, "id", range(len(prediction)), True)


In [28]:
results_df.head()
results_df.tail()

Unnamed: 0,id,price
10784,10784,1612.485333
10785,10785,897.252667
10786,10786,1535.88
10787,10787,3570.11
10788,10788,990.907333


In [29]:
results_df.to_csv("results.csv",index=False) 
