In [1]:
import pandas as pd
import re
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn import datasets, svm, metrics

In [2]:
diamonds=pd.read_csv("../input/data.csv")
diamonds.shape


(40455, 10)

In [3]:
def clean_transformed(df):
    ''' applicable for test and data sets: transforms categorical variables cut, color and clarity, creates new variable volumen, drops x,y,z and 
        depth and table.
    '''
    df['cut']=df['cut'].map({'Fair':1,'Good':2,'Ideal':3,'Very Good':4,'Premium':5})
    df['color']=df['color'].map({'E':1,'J':2,'D':3,'I':4,'H':5,'F':6,'G':7})
    df['clarity']=df['clarity'].map({'I1':1,'SI1':2,'VVS1':3,'SI2':4,'VS2':5,'VS1':6,'VVS2':7,'IF':8})
    df['volumen']=df['x']*df['y']*df['z']
    df =df.drop(['x','y','z','depth','table'], axis=1)
    return df

In [4]:
diamonds1= clean_transformed(diamonds)
diamonds1

Unnamed: 0,carat,cut,color,clarity,price,volumen
0,2.26,3,7,4,12831,366.903680
1,2.43,4,5,4,16170,392.176400
2,0.80,5,6,4,2797,133.001901
3,0.40,3,6,1,630,64.059840
4,0.31,3,7,5,698,51.797610
...,...,...,...,...,...,...
40450,1.11,5,5,2,5315,180.060192
40451,0.73,3,6,5,2762,119.231280
40452,1.26,4,4,6,6855,208.046124
40453,0.72,3,7,4,2297,119.547648


In [5]:
def remove_outliers(df):
    '''only applicable to the data set NOT the test set'''
    z = np.abs(stats.zscore(df))
    df =df[(z<3).all(axis=1)]
    return df


In [6]:
diamonds2= remove_outliers(diamonds1)
diamonds2

Unnamed: 0,carat,cut,color,clarity,price,volumen
2,0.80,5,6,4,2797,133.001901
3,0.40,3,6,1,630,64.059840
4,0.31,3,7,5,698,51.797610
5,0.53,4,6,4,1132,85.502655
6,0.70,5,3,4,1987,110.716020
...,...,...,...,...,...,...
40450,1.11,5,5,2,5315,180.060192
40451,0.73,3,6,5,2762,119.231280
40452,1.26,4,4,6,6855,208.046124
40453,0.72,3,7,4,2297,119.547648


In [7]:
    
# to use a function to do the scalling to both data and test set, we must get the parameters from fitting 
# the data set AFTER treating it using the cleaned_transformed function and the remove_outliers
# TRAIN corresponds to the data set after these two functions have transformed original data set

mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(diamonds2)

def applyscaleMinMax(df):
    '''applicable to transform data and test set, prior fit has extracted parameters from data set '''
    column_names= list(df)
    df = mm_scaler.transform(df)
    df = pd.DataFrame(df, columns=column_names)
    return df

diamonds3= applyscaleMinMax(diamonds2)

In [8]:
diamonds_test=pd.read_csv("../input/test.csv",index_col='id')
diamonds_test

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.56,Ideal,I,VS2,62.1,54.0,5.30,5.33,3.30
1,1.05,Ideal,G,VS2,61.9,56.0,6.56,6.52,4.05
2,0.50,Premium,E,VS2,61.5,56.0,5.11,5.07,3.13
3,0.80,Ideal,F,VS1,62.8,56.0,5.91,5.96,3.73
4,0.54,Ideal,G,VS1,61.4,57.0,5.23,5.20,3.20
...,...,...,...,...,...,...,...,...,...
13480,0.70,Premium,H,SI1,59.0,60.0,5.86,5.81,3.44
13481,0.55,Ideal,G,VVS2,62.3,53.0,5.27,5.29,3.29
13482,1.54,Ideal,G,SI1,61.5,55.0,7.40,7.45,4.57
13483,0.30,Premium,G,SI2,62.7,55.0,4.29,4.26,2.68


In [9]:
diamonds_test1= clean_transformed(diamonds_test)
diamonds_test1

Unnamed: 0_level_0,carat,cut,color,clarity,volumen
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.56,3,4,5,93.221700
1,1.05,3,7,5,173.223360
2,0.50,5,1,5,81.091101
3,0.80,3,6,6,131.384028
4,0.54,3,7,6,87.027200
...,...,...,...,...,...
13480,0.70,5,5,2,117.120304
13481,0.55,3,7,7,91.719607
13482,1.54,3,7,2,251.944100
13483,0.30,5,7,4,48.978072


In [10]:
#remove price from original data set before applying minmax to test set

diamonds4=diamonds2.drop(['price'], axis=1)
diamonds4


Unnamed: 0,carat,cut,color,clarity,volumen
2,0.80,5,6,4,133.001901
3,0.40,3,6,1,64.059840
4,0.31,3,7,5,51.797610
5,0.53,4,6,4,85.502655
6,0.70,5,3,4,110.716020
...,...,...,...,...,...
40450,1.11,5,5,2,180.060192
40451,0.73,3,6,5,119.231280
40452,1.26,4,4,6,208.046124
40453,0.72,3,7,4,119.547648


In [11]:
mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(diamonds4)

diamonds_test2 =applyscaleMinMax(diamonds_test1)

diamonds_test2 

Unnamed: 0,carat,cut,color,clarity,volumen
0,0.178218,0.5,0.500000,0.571429,0.254307
1,0.420792,0.5,1.000000,0.571429,0.472550
2,0.148515,1.0,0.000000,0.571429,0.221215
3,0.297030,0.5,0.833333,0.714286,0.358413
4,0.168317,0.5,1.000000,0.714286,0.237409
...,...,...,...,...,...
13480,0.247525,1.0,0.666667,0.142857,0.319502
13481,0.173267,0.5,1.000000,0.857143,0.250209
13482,0.663366,0.5,1.000000,0.142857,0.687299
13483,0.049505,1.0,1.000000,0.428571,0.133611


In [12]:
diamonds3.columns

Index(['carat', 'cut', 'color', 'clarity', 'price', 'volumen'], dtype='object')

In [13]:
#only for data set original

def modelling (df):
    ''' Only applicable to original data set '''
    X=df.drop(['price'],axis=1)
    y=df['price']
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
    
    models = {
        "decision_tree": DecisionTreeRegressor(random_state = 0), #ok
        "forest": RandomForestRegressor(n_estimators=100), #ok
        "neighbors": KNeighborsRegressor(n_neighbors=2),#ok
        "gradient boosting": GradientBoostingRegressor() #ok
    
        }

    for modelName, model in models.items():
        print(f"Training model: {modelName}")
        model.fit(X_train, y_train)
    
    d = {modelName:model.predict(X_test) for modelName, model in models.items()}

    df = pd.DataFrame(d)
    df["gt"] = y_test.reset_index(drop=True)
    return df





In [14]:
diamonds5 = modelling (diamonds3)
diamonds5

Training model: decision_tree
Training model: forest
Training model: neighbors
Training model: gradient boosting


Unnamed: 0,decision_tree,forest,neighbors,gradient boosting,gt
0,0.525331,0.481680,0.447672,0.440820,0.465766
1,0.175367,0.175476,0.171588,0.179768,0.175175
2,0.458272,0.449831,0.443028,0.424639,0.432396
3,0.191123,0.217875,0.206078,0.186660,0.187408
4,0.290655,0.328321,0.320726,0.334866,0.337731
...,...,...,...,...,...
7877,0.797412,0.763768,0.750528,0.487334,0.729520
7878,0.163069,0.151446,0.140396,0.145891,0.159867
7879,0.247934,0.248772,0.227631,0.264332,0.251777
7880,0.026292,0.029492,0.030968,0.016049,0.026580


In [15]:

# to calculate prediction for test ONLY 
models = {
        "decision_tree": DecisionTreeRegressor(random_state = 0), #ok
        "forest": RandomForestRegressor(n_estimators=100), #ok
        "neighbors": KNeighborsRegressor(n_neighbors=2),#ok
        "gradient boosting": GradientBoostingRegressor() #ok
        }

def modeling(df, model_type):
    ''' only applies to original data set'''
    X=df.drop(['price'],axis=1)
    y=df['price']
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
    model = model_type
    model.fit(X_train, y_train)
    return model

def predict(df, model):
    ''' works for test only'''
    d = model.predict(df)
    df = pd.DataFrame(d)
    return df

# to obtain predicions for test set in predict use test and in modeling use data set
for model_name, model in models.items(): # 1) iterar por cada tipo de modelo
    trained_model = modeling(diamonds3, model) # 2) data, model entrenar el modelo con data original, añadir métricas
    prediction = predict(diamonds_test2, trained_model) # 3) prediccion en test con el modelo entrenado
    #print (model_name)
    prediction= prediction.rename(columns={0: "price"})
    prediction.index.name= 'id'
    prediction.to_csv(f'../output/{model_name}.csv')
    print(prediction)


    
#df.to_csv(r'Path where you want to store the exported CSV file\File Name.csv')
    
    

          price
id             
0      0.064049
1      0.405688
2      0.090950
3      0.232306
4      0.079485
...         ...
13480  0.113367
13481  0.117915
13482  0.698392
13483  0.004355
13484  0.066419

[13485 rows x 1 columns]
          price
id             
0      0.065922
1      0.400425
2      0.087336
3      0.230958
4      0.084257
...         ...
13480  0.116071
13481  0.119782
13482  0.736632
13483  0.009284
13484  0.059380

[13485 rows x 1 columns]
          price
id             
0      0.066739
1      0.411900
2      0.087043
3      0.227182
4      0.081727
...         ...
13480  0.121437
13481  0.107122
13482  0.694389
13483  0.007942
13484  0.061775

[13485 rows x 1 columns]
          price
id             
0      0.072443
1      0.391036
2      0.079532
3      0.210275
4      0.093822
...         ...
13480  0.135697
13481  0.107448
13482  0.678811
13483 -0.000239
13484  0.058932

[13485 rows x 1 columns]


In [16]:
prediction.columns

Index(['price'], dtype='object')

In [17]:
diamonds2

Unnamed: 0,carat,cut,color,clarity,price,volumen
2,0.80,5,6,4,2797,133.001901
3,0.40,3,6,1,630,64.059840
4,0.31,3,7,5,698,51.797610
5,0.53,4,6,4,1132,85.502655
6,0.70,5,3,4,1987,110.716020
...,...,...,...,...,...,...
40450,1.11,5,5,2,5315,180.060192
40451,0.73,3,6,5,2762,119.231280
40452,1.26,4,4,6,6855,208.046124
40453,0.72,3,7,4,2297,119.547648


In [18]:
diamonds2.rename(columns={"cut": "price"})


Unnamed: 0,carat,price,color,clarity,price.1,volumen
2,0.80,5,6,4,2797,133.001901
3,0.40,3,6,1,630,64.059840
4,0.31,3,7,5,698,51.797610
5,0.53,4,6,4,1132,85.502655
6,0.70,5,3,4,1987,110.716020
...,...,...,...,...,...,...
40450,1.11,5,5,2,5315,180.060192
40451,0.73,3,6,5,2762,119.231280
40452,1.26,4,4,6,6855,208.046124
40453,0.72,3,7,4,2297,119.547648


In [19]:
diamonds2.Index.delete()

AttributeError: 'DataFrame' object has no attribute 'Index'