In [18]:
import pandas as pd
import re
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn import datasets, svm, metrics

In [19]:
diamonds=pd.read_csv("../input/data.csv")
diamonds.shape


(40455, 10)

In [20]:
def clean_transformed(df):
    ''' applicable for test and data sets: transforms categorical variables cut, color and clarity, creates new variable volumen, drops x,y,z and 
        depth and table.
    '''
    df['cut']=df['cut'].map({'Fair':0,'Good':1,'Very Good': 2,'Premium':3,'Ideal':4})
    df['color']=df['color'].map({'J':0,'I':1,'H':2,'G':3,'F':4,'E':5,'D':6})
    df['clarity']=df['clarity'].map({'I1':0,'SI2':1,'SI1':2,'VS2':3,'VS1':4,'VVS2':5,'VVS1':6,'IF':7})
    df['volumen']=df['x']*df['y']*df['z']
    df =df.drop(['x','y','z','depth','table'], axis=1)
    return df

In [21]:
diamonds1= clean_transformed(diamonds)
diamonds1

Unnamed: 0,carat,cut,color,clarity,price,volumen
0,2.26,4,3,1,12831,366.903680
1,2.43,2,2,1,16170,392.176400
2,0.80,3,4,1,2797,133.001901
3,0.40,4,4,0,630,64.059840
4,0.31,4,3,3,698,51.797610
...,...,...,...,...,...,...
40450,1.11,3,2,2,5315,180.060192
40451,0.73,4,4,3,2762,119.231280
40452,1.26,2,1,4,6855,208.046124
40453,0.72,4,3,1,2297,119.547648


In [22]:
def remove_outliers(df):
    '''only applicable to the data set NOT the test set'''
    z = np.abs(stats.zscore(df))
    df =df[(z<3).all(axis=1)]
    return df


In [23]:
diamonds2= remove_outliers(diamonds1)
diamonds2

Unnamed: 0,carat,cut,color,clarity,price,volumen
2,0.80,3,4,1,2797,133.001901
3,0.40,4,4,0,630,64.059840
4,0.31,4,3,3,698,51.797610
5,0.53,2,4,1,1132,85.502655
6,0.70,3,6,1,1987,110.716020
...,...,...,...,...,...,...
40450,1.11,3,2,2,5315,180.060192
40451,0.73,4,4,3,2762,119.231280
40452,1.26,2,1,4,6855,208.046124
40453,0.72,4,3,1,2297,119.547648


In [24]:
diamonds_test=pd.read_csv("../input/test.csv",index_col='id')
diamonds_test

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.56,Ideal,I,VS2,62.1,54.0,5.30,5.33,3.30
1,1.05,Ideal,G,VS2,61.9,56.0,6.56,6.52,4.05
2,0.50,Premium,E,VS2,61.5,56.0,5.11,5.07,3.13
3,0.80,Ideal,F,VS1,62.8,56.0,5.91,5.96,3.73
4,0.54,Ideal,G,VS1,61.4,57.0,5.23,5.20,3.20
...,...,...,...,...,...,...,...,...,...
13480,0.70,Premium,H,SI1,59.0,60.0,5.86,5.81,3.44
13481,0.55,Ideal,G,VVS2,62.3,53.0,5.27,5.29,3.29
13482,1.54,Ideal,G,SI1,61.5,55.0,7.40,7.45,4.57
13483,0.30,Premium,G,SI2,62.7,55.0,4.29,4.26,2.68


In [25]:
diamonds_test1= clean_transformed(diamonds_test)
diamonds_test1

Unnamed: 0_level_0,carat,cut,color,clarity,volumen
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.56,4,1,3,93.221700
1,1.05,4,3,3,173.223360
2,0.50,3,5,3,81.091101
3,0.80,4,4,4,131.384028
4,0.54,4,3,4,87.027200
...,...,...,...,...,...
13480,0.70,3,2,2,117.120304
13481,0.55,4,3,5,91.719607
13482,1.54,4,3,2,251.944100
13483,0.30,3,3,1,48.978072


In [26]:
#only for data set original

def modelling (df):
    ''' Only applicable to original data set '''
    X=df.drop(['price'],axis=1)
    y=df['price']
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
    
    models = {
        "decision_tree": DecisionTreeRegressor(random_state = 0), #ok
        "forest": RandomForestRegressor(n_estimators=100), #ok
        "neighbors": KNeighborsRegressor(n_neighbors=2),#ok
        "gradient boosting": GradientBoostingRegressor() #ok
    
        }

    for modelName, model in models.items():
        print(f"Training model: {modelName}")
        model.fit(X_train, y_train)
    
    d = {modelName:model.predict(X_test) for modelName, model in models.items()}

    df = pd.DataFrame(d)
    df["gt"] = y_test.reset_index(drop=True)
    return df

In [27]:
#model predicted 

diamonds3 = modelling (diamonds2)
diamonds3

Training model: decision_tree
Training model: forest
Training model: neighbors
Training model: gradient boosting


Unnamed: 0,decision_tree,forest,neighbors,gradient boosting,gt
0,6459.0,6509.219000,6039.0,6195.022002,5761
1,12013.0,11596.060000,11880.5,11333.338958,11904
2,438.0,445.240000,458.0,540.412030,438
3,5748.5,5549.905000,5241.5,5131.729939,5802
4,7465.0,7592.590000,7293.5,7216.455570,8133
...,...,...,...,...,...
7877,2479.0,2517.589000,2259.5,2464.988805,2657
7878,814.0,872.422000,814.0,956.655210,814
7879,5969.0,5912.856667,6288.0,6038.964713,5382
7880,1860.0,1800.431667,1793.0,1967.937250,2020


In [28]:
results2=metrics.mean_squared_error(diamonds3['gt'], diamonds3['decision_tree'], squared=True)
results2
print(f' mean square error of decision tree is {results2}')

 mean square error of decision tree is 384291.1449963348


In [29]:
results4=metrics.mean_squared_error(diamonds3['gt'], diamonds3['forest'], squared=True)
results4
print(f' mean square error of forest is {results4}')

 mean square error of forest is 222862.85822066554


In [30]:
results4**(1/2)

472.0835288597406

In [31]:
results6=metrics.mean_squared_error(diamonds3['gt'], diamonds3['neighbors'], squared=True)
results6
print(f' mean square error of neighbors is {results6}')

 mean square error of neighbors is 391153.7362661761


In [32]:
results8=metrics.mean_squared_error(diamonds3['gt'], diamonds3['gradient boosting'], squared=True)
results8
print(f' mean square error of gradient is {results8}')

 mean square error of gradient is 268638.36446988443


In [33]:

# to calculate prediction for test ONLY 
models = {
        "decision_tree": DecisionTreeRegressor(random_state = 0), #ok
        "forest": RandomForestRegressor(n_estimators=100), #ok
        "neighbors": KNeighborsRegressor(n_neighbors=2),#ok
        "gradient boosting": GradientBoostingRegressor() #ok
        }

def modeling(df, model_type):
    ''' only applies to original data set'''
    X=df.drop(['price'],axis=1)
    y=df['price']
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
    model = model_type
    model.fit(X_train, y_train)
    return model

def predict(df, model):
    ''' works for test only'''
    d = model.predict(df)
    df = pd.DataFrame(d)
    return df

# to obtain predicions for test set in predict use test and in modeling use data set
for model_name, model in models.items(): # 1) iterar por cada tipo de modelo
    trained_model = modeling(diamonds2, model) # 2) data, model entrenar el modelo con data original, añadir métricas
    prediction = predict(diamonds_test1, trained_model) # 3) prediccion en test con el modelo entrenado
    #print (model_name)
    prediction= prediction.rename(columns={0: "price"})
    prediction.index.name= 'id'
    prediction.to_csv(f'../output/{model_name}.csv')
    print(prediction)

    

         price
id            
0       1326.0
1       6486.0
2       1629.0
3       3953.0
4       1567.0
...        ...
13480   2048.0
13481   2016.0
13482  11105.0
13483    394.0
13484   1308.0

[13485 rows x 1 columns]
              price
id                 
0       1330.540000
1       6683.410000
2       1683.380000
3       3912.327452
4       1625.451333
...             ...
13480   2120.290000
13481   2015.532500
13482  11730.741667
13483    458.947500
13484   1273.550000

[13485 rows x 1 columns]
         price
id            
0       1368.0
1       6720.5
2       1685.0
3       3943.0
4       1644.0
...        ...
13480   2198.0
13481   2124.0
13482  12219.0
13483    506.0
13484   1215.0

[13485 rows x 1 columns]
              price
id                 
0       1455.874398
1       6385.305390
2       1545.842163
3       3804.254804
4       1722.197285
...             ...
13480   2340.441575
13481   1969.639977
13482  11411.298060
13483    352.772448
13484   1241.097838

[13485 rows