In [5]:
# First, let's import the libraries and modules
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from matplotlib import cm
%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

import seaborn as sns
sns.set_context('poster')
sns.set(rc={'figure.figsize': (16., 9.)})
sns.set_style('whitegrid')

# Modeling libraries
from sklearn.datasets import load_boston
import statsmodels.formula.api as smf 
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import set_config



import warnings

import pandas as pd

# Seed NumPy's global random generator so stochastic steps repeat across runs.
np.random.seed(seed=123)

# Install a global filter that ignores all warnings.
warnings.simplefilter(action='ignore')

In [6]:
# Load the training data from disk.
df = pd.read_csv(filepath_or_buffer="data/train.csv")


In [7]:
# One-hot encode the categorical columns, then preview the first five rows.
encoded = pd.get_dummies(data=df)
encoded.head()

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_Fair,cut_Good,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0.41,63.0,56.0,4.8,4.75,3.01,6.824,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2,0.32,61.6,56.0,4.37,4.39,2.7,6.107,0,0,...,1,0,0,0,0,0,0,1,0,0
3,3,0.31,61.2,56.0,4.34,4.37,2.66,6.39,0,0,...,0,0,0,0,0,0,0,0,0,1
4,4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,0,0,...,0,1,0,0,0,0,0,1,0,0


In [8]:
# Remove the `id` and `depth` columns from the encoded frame, in place.
encoded.drop(columns=['id', 'depth'], inplace=True)

In [9]:
# `price` is the target; every other remaining column is a feature.
Y = encoded['price']
X = encoded.drop(columns='price')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=123
)


### Random Forest

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Pin the forest's seed so that re-running this cell rebuilds the same model.
# With `random_state` unset, each fit draws from NumPy's global random state,
# so the trees (and therefore the predictions) differ on every re-run.
rf = RandomForestRegressor(random_state=123)
rf.fit(X_train, y_train)

# Predictions on the held-out split, used for evaluation.
y_pred = rf.predict(X_test)

In [11]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the held-out targets and the predictions.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.06745135327759827

In [12]:
# Apply the fitted model to the submission data (data/test.csv)

In [13]:
# Load the rows to be predicted for the submission.
data_submit = pd.read_csv(filepath_or_buffer="data/test.csv")


In [14]:
# Remove `id` and `depth` in place (the same columns dropped from the
# training frame), then display the result.
data_submit.drop(columns=["id", "depth"], inplace=True)
data_submit

Unnamed: 0,carat,cut,color,clarity,table,x,y,z
0,0.33,Ideal,H,IF,55.0,4.44,4.42,2.74
1,0.41,Ideal,E,VS2,54.0,4.79,4.76,2.95
2,0.91,Very Good,E,SI2,59.0,6.16,6.23,3.87
3,0.42,Very Good,G,VS2,57.0,4.76,4.80,2.99
4,0.54,Ideal,G,IF,56.0,5.28,5.25,3.24
...,...,...,...,...,...,...,...,...
13480,0.55,Ideal,F,SI1,56.4,5.26,5.30,3.25
13481,1.12,Premium,H,VS2,59.0,6.77,6.70,4.08
13482,0.37,Ideal,D,SI1,57.0,4.63,4.60,2.84
13483,0.54,Good,E,SI1,63.0,5.25,5.30,3.16


In [15]:
# One-hot encode the submission frame, then align it to the training feature
# matrix. Encoding the two files separately only yields matching columns when
# both contain exactly the same category levels; reindexing adds any missing
# dummy column as all zeros, drops levels never seen in training, and puts the
# columns in the order the models were fitted on.
encoded_submit = pd.get_dummies(data_submit).reindex(
    columns=X_train.columns, fill_value=0
)
encoded_submit.head(5)

Unnamed: 0,carat,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.33,55.0,4.44,4.42,2.74,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0.41,54.0,4.79,4.76,2.95,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.91,59.0,6.16,6.23,3.87,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,0.42,57.0,4.76,4.8,2.99,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.54,56.0,5.28,5.25,3.24,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [16]:
# Random-forest predictions for the submission rows.
results_submit = rf.predict(X=encoded_submit)

In [17]:
# Reload the submission file: the in-place drop above removed `id`, which the
# submission frames need.
data_submit = pd.read_csv(filepath_or_buffer="data/test.csv")


In [18]:
# Pair each submission id with its random-forest prediction.
rf_submission = pd.DataFrame(
    data={'id': data_submit['id'], 'price': results_submit}
)

In [19]:
# Sanity check: (rows, columns) of the random-forest submission frame.
rf_submission.shape

(13485, 2)

In [20]:
# Write the random-forest submission alongside the other models' outputs
# (data/entrega2, data/entrega3, data/entrega4) instead of a bare file named
# "r" in the working directory.
# NOTE(review): "entrega1" is inferred from that naming pattern — confirm.
rf_submission.to_csv("data/entrega1", index=False)

### Linear Model

In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
# Linear regression estimator, constructed with scikit-learn's default settings.
lr = LinearRegression()

In [23]:
# Fit on the training split; the fitted estimator is the cell's displayed value.
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [24]:
# Linear-regression predictions for the submission rows.
results_submit = lr.predict(X=encoded_submit)

In [25]:
# Pair each submission id with its linear-regression prediction.
lr_submission = pd.DataFrame(
    data={'id': data_submit['id'], 'price': results_submit}
)

In [26]:
# Sanity check: (rows, columns) of the linear-regression submission frame.
lr_submission.shape

(13485, 2)

In [27]:
# Persist the linear-regression submission without the index column.
lr_submission.to_csv(path_or_buf="data/entrega2", index=False)

### Elastic Net

In [28]:
# Elastic-net regressor (seeded with random_state=0), fit on the training split.
# The fitted estimator is the cell's displayed value.
regr = ElasticNet(random_state=0)
regr.fit(X=X_train, y=y_train)

ElasticNet(random_state=0)

In [29]:
# Elastic-net predictions for the submission rows.
results_submit = regr.predict(X=encoded_submit)

In [30]:
# Pair each submission id with its elastic-net prediction.
regr_submission = pd.DataFrame(
    data={'id': data_submit['id'], 'price': results_submit}
)

In [31]:
# Sanity check: (rows, columns) of the elastic-net submission frame.
regr_submission.shape

(13485, 2)

In [32]:
# Persist the elastic-net submission without the index column.
regr_submission.to_csv(path_or_buf="data/entrega3", index=False)

### Lasso

In [34]:
from sklearn import linear_model

In [35]:
# Lasso regressor with alpha=0.1, fit on the training split.
# The fitted estimator is the cell's displayed value.
clf = linear_model.Lasso(alpha=0.1)
clf.fit(X=X_train, y=y_train)

Lasso(alpha=0.1)

In [36]:
# Lasso predictions for the submission rows.
results_submit = clf.predict(X=encoded_submit)

In [37]:
# Pair each submission id with its Lasso prediction.
clf_submission = pd.DataFrame(
    data={'id': data_submit['id'], 'price': results_submit}
)

In [38]:
# Sanity check: (rows, columns) of the Lasso submission frame.
clf_submission.shape

(13485, 2)

In [39]:
# Persist the Lasso submission without the index column.
clf_submission.to_csv(path_or_buf="data/entrega4", index=False)