In [47]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
columns = ['carat', 'cut', 'price']
train = pd.DataFrame(columns=columns, 
          data=[[0.3, 'Ideal', 422],
          [0.31, 'Ideal', 489],
          [0.42, 'Premium', 737],
          [0.5, 'Ideal', 1415],
          [0.51, 'Premium', 1177],
          [0.7, 'Fair', 1865],
          [0.73, 'Fair', 2351],
          [1.01, 'Good', 3768],
          [1.18, 'Very Good', 3965],
          [1.18, 'Ideal', 4838]])
test  = pd.DataFrame(columns=columns, 
          data=[[0.3, 'Ideal', 432],
          [0.34, 'Ideal', 687],
          [0.37, 'Premium', 1124],
          [0.4, 'Good', 720],
          [0.51, 'Ideal', 1397],
          [0.51, 'Very Good', 1284],
          [0.59, 'Ideal', 1437],
          [0.7, 'Ideal', 3419],
          [0.9, 'Premium', 3484],
          [0.9, 'Fair', 2964]])

train

Unnamed: 0,carat,cut,price
0,0.3,Ideal,422
1,0.31,Ideal,489
2,0.42,Premium,737
3,0.5,Ideal,1415
4,0.51,Premium,1177
5,0.7,Fair,1865
6,0.73,Fair,2351
7,1.01,Good,3768
8,1.18,Very Good,3965
9,1.18,Ideal,4838


In [18]:
cut_ranks = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
train.cut = train.cut.map(cut_ranks)
test.cut = test.cut.map(cut_ranks)
features = ['carat', 'cut']
target = 'price'
model = KNeighborsRegressor(n_neighbors=1)
model.fit(train[features], train[target])
y_true = train[target]
y_pred = model.predict(train[features])

trainingError = mean_absolute_error(y_true, y_pred)
trainingError

0.0

How does the train error and test error compare to the previous `KNeighborsRegressor` model from the lesson? (The previous model used `n_neighbors=2` and only the `carat` feature.)

Is this new model overfitting or underfitting? Why do you think this is happening here? 



It's definitely overfitting considering we're getting 0 error as in it's hitting every point.

### 2. More data, two features, linear regression

Use the following code to load data for diamonds under $5,000, and split the data into train and test sets. The training data has almost 30,000 rows, and the test data has almost 10,000 rows.

In [3]:
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor


df = sns.load_dataset('diamonds')
df = df[df.price < 5000]
train, test = train_test_split(df.copy(), random_state=0)
cut_ranks = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
train.cut = train.cut.map(cut_ranks)
test.cut = test.cut.map(cut_ranks)
clarity_rank = {"IF":0,"VVS1":1, "VVS2":2,"VS1":3, "VS2":4,"SI1":5, "SI2":6, "I1":7}
train.clarity = train.clarity.map(clarity_rank)  
test.clarity = test.clarity.map(clarity_rank)  

color_rank = {"J":7, "I":6, "H":5, "G":4, "F":3, "E":2, "D":1 }
train.color = train.color.map(color_rank)
test.color = test.color.map(color_rank)
train.shape, test.shape
train


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
43601,0.31,3,2,5,61.2,58.0,507,4.34,4.38,2.67
52706,0.74,1,5,4,66.1,61.0,2553,5.60,5.57,3.69
1986,0.81,3,4,5,62.3,59.0,3095,5.93,5.98,3.71
48617,0.70,1,4,6,61.5,66.0,1999,5.55,5.60,3.43
10947,0.87,5,4,4,61.8,56.0,4899,6.11,6.13,3.78
...,...,...,...,...,...,...,...,...,...,...
35484,0.43,2,4,5,63.6,55.0,903,4.83,4.79,3.06
46830,0.53,5,1,4,62.2,56.0,1809,5.17,5.18,3.22
45130,0.52,3,2,4,62.5,58.0,1653,5.13,5.18,3.22
35970,0.41,5,1,5,63.0,57.0,920,4.70,4.64,2.94


Then, train a Linear Regression model with the `carat` and `cut` features. Calculate the mean absolute error on the training data and on the test data.

In [5]:
features = ['carat', 'cut']
target = 'price'
model = LinearRegression()
model.fit(train[features], train[target])

y_true = train[target]
y_pred = model.predict(train[features])
trainingError = mean_absolute_error(y_true, y_pred)

y_true = test[target]
y_pred = model.predict(test[features])
testingError = mean_absolute_error(y_true, y_pred)

print(trainingError,testingError)


309.4658665386129 309.5202765379709


Use this model to predict the price of a half carat diamond with "very good" cut

In [37]:
model.predict([[0.5,3]])

array([1489.45526366])

### 3. More data, more features, any model

You choose what features and model type to use! Try to get a better mean absolute error on the test set than your model from the last question.

Refer to [this documentation](https://ggplot2.tidyverse.org/reference/diamonds.html) for more explanation of the features.

Besides `cut`, there are two more ordinal features, which you'd need to encode as numbers if you want to use in your model:

In [78]:
train.describe()
train

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
43601,0.31,3,2,5,61.2,58.0,507,4.34,4.38,2.67
52706,0.74,1,5,4,66.1,61.0,2553,5.60,5.57,3.69
1986,0.81,3,4,5,62.3,59.0,3095,5.93,5.98,3.71
48617,0.70,1,4,6,61.5,66.0,1999,5.55,5.60,3.43
10947,0.87,5,4,4,61.8,56.0,4899,6.11,6.13,3.78
...,...,...,...,...,...,...,...,...,...,...
35484,0.43,2,4,5,63.6,55.0,903,4.83,4.79,3.06
46830,0.53,5,1,4,62.2,56.0,1809,5.17,5.18,3.22
45130,0.52,3,2,4,62.5,58.0,1653,5.13,5.18,3.22
35970,0.41,5,1,5,63.0,57.0,920,4.70,4.64,2.94


In [31]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor

#features = ['color','clarity']
target = 'price'
feature_list = [['color'], ['color','clarity'], ['color','clarity','depth'], ['color','clarity', 'depth', 'cut']]
models = [('Nearest Neighbors Model', KNeighborsRegressor(n_neighbors=2)),
          ('Nearest Neighbors Model', KNeighborsRegressor(n_neighbors=3)),
          ('Linear Regression', LinearRegression()),
          ('Decision Tree', DecisionTreeRegressor(criterion='mae')),
          ('3rd Degree Polynomial', make_pipeline(PolynomialFeatures(degree=3), LinearRegression()))]

#to segregate the m_a_errors into sets of features, we hash them into a dictionary indexed by set of features
mean_abs_error_list = {}


def listToString(sumList):
    tempstring = ''
    for i in sumList:
        tempstring = tempstring + ' ' + str(i)
    return tempstring

#since the true data is static, define it first out of the loop
y_true = train[target]

for features in feature_list:
    tempkey = listToString(features)
    mean_abs_error_list[tempkey]= []
    for name,model in models:
        #Train Model
        model.fit(train[features],train[target])

        #compute mean absolute error per model and return the error paired with the model name
        y_pred = model.predict(train[features])
        mean_abs_error_list[tempkey].append((model, mean_absolute_error(y_true, y_pred)))
        
        
print(mean_abs_error_list)
#mean_abs_error()
#model.coef_,model.intercept_

{' color': [(KNeighborsRegressor(n_neighbors=2), 1284.3395559182563), (KNeighborsRegressor(n_neighbors=3), 1169.8495358563705), (LinearRegression(), 1121.7802873313499), (DecisionTreeRegressor(criterion='mae'), 1074.0048624570709), (Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3)),
                ('linearregression', LinearRegression())]), 1120.2054404432063)], ' color clarity': [(KNeighborsRegressor(n_neighbors=2), 1124.8907477302866), (KNeighborsRegressor(n_neighbors=3), 1056.3925895700863), (LinearRegression(), 1030.8859147075877), (DecisionTreeRegressor(criterion='mae'), 958.595735999184), (Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3)),
                ('linearregression', LinearRegression())]), 1017.2411389506781)], ' color clarity depth': [(KNeighborsRegressor(n_neighbors=2), 1019.7454180692985), (KNeighborsRegressor(n_neighbors=3), 976.6441112131207), (LinearRegression(), 1030.8608041578975), (DecisionTreeRegressor(criterion='mae'), 81

The lowest MAE was the Decision Tree with all four criteria.