In [1]:
import pandas as pd

df = pd.read_csv("datasets/diamonds.csv", index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [3]:
df['cut'].astype('category').cat.codes

1        2
2        3
3        1
4        3
5        1
6        4
7        4
8        4
9        0
10       4
11       1
12       2
13       3
14       2
15       3
16       3
17       2
18       1
19       1
20       4
21       1
22       4
23       4
24       4
25       4
26       4
27       3
28       4
29       4
30       4
        ..
53911    3
53912    3
53913    3
53914    1
53915    1
53916    2
53917    1
53918    4
53919    3
53920    2
53921    4
53922    4
53923    4
53924    2
53925    2
53926    2
53927    2
53928    1
53929    3
53930    2
53931    3
53932    3
53933    4
53934    4
53935    3
53936    2
53937    1
53938    4
53939    3
53940    2
Length: 53940, dtype: int8

In [4]:
cut_class_dict = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
clarity_dict = {'I3': 1, 'I2': 2, 'I1': 3, 'SI2': 4, 'SI1': 5, 'VS2': 6, 'VS1': 7, 'VVS2': 8, 'VVS1': 9, 'IF': 10, 'FL': 11}
color_dict = {'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7}

df['cut'] = df['cut'].map(cut_class_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [8]:
import sklearn
from sklearn import svm, preprocessing

In [12]:
df = sklearn.utils.shuffle(df)

X = df.drop('price', axis=1).values
X = preprocessing.scale(X)
y = df['price'].values

test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]


clf = svm.SVR(kernel='linear')
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [13]:
X

array([[-1.05049061, -0.80969515,  0.34930404, ..., -1.27582507,
        -1.23850462, -1.18852618],
       [ 0.44737803, -1.70527938,  0.93716275, ...,  0.56950532,
         0.60017476,  0.44107923],
       [-1.00829713,  0.98147332, -0.23855468, ..., -1.23125187,
        -1.23850462, -1.14601473],
       ...,
       [-0.10113725, -1.70527938,  0.34930404, ...,  0.04354159,
        -0.10027453,  0.17184008],
       [-0.20662095,  0.08588908, -0.23855468, ..., -0.09017801,
        -0.14405261, -0.02654667],
       [ 0.48957152,  0.98147332, -0.23855468, ...,  0.68539563,
         0.70524215,  0.71031839]])

In [14]:
clf.score(X_test, y_test)

0.8236357942301753

In [15]:
for X, y in zip(X_test, y_test):
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")

Model: 4834.3071367784305, Actual: 5152
Model: 5542.168793774873, Actual: 6947
Model: -329.84356696443365, Actual: 451
Model: 1952.5735052912967, Actual: 1786
Model: 4447.084761248118, Actual: 3569
Model: 4319.555270948839, Actual: 3172
Model: 405.701751052502, Actual: 710
Model: -104.34293988359559, Actual: 544
Model: 12472.00950669137, Actual: 11119
Model: 3159.1530491327444, Actual: 2656
Model: 5473.9092728143705, Actual: 5139
Model: 690.1329997993776, Actual: 1061
Model: 8229.99408733231, Actual: 9236
Model: 940.5245145182557, Actual: 1063
Model: 2034.2624564226562, Actual: 1679
Model: 1859.5761220091222, Actual: 1716
Model: 5163.118144424397, Actual: 6204
Model: 808.5507128060158, Actual: 764
Model: 1589.5886801780523, Actual: 1452
Model: 4362.062825907748, Actual: 3676
Model: 1105.4787641710127, Actual: 1035
Model: 4926.0347404933655, Actual: 4989
Model: 172.72821198018028, Actual: 720
Model: -24.922377731571487, Actual: 552
Model: 2836.0208556227594, Actual: 2030
Model: 653.7645

In [16]:
clf = svm.SVR(kernel='rbf')
clf.fit(X_train, y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
print(clf.score(X_test, y_test))

for X,y in zip(X_test, y_test):
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")

0.6811070868062524
Model: 5007.212615495802, Actual: 5152
Model: 6111.855815628338, Actual: 6947
Model: 591.2705805140595, Actual: 451
Model: 1627.971802905501, Actual: 1786
Model: 4260.743690689518, Actual: 3569
Model: 4005.2478439845013, Actual: 3172
Model: 726.4913624486185, Actual: 710
Model: 1154.7524107643412, Actual: 544
Model: 6390.827875778325, Actual: 11119
Model: 2488.0184582113798, Actual: 2656
Model: 5390.932539630674, Actual: 5139
Model: 715.0803800847339, Actual: 1061
Model: 5157.282362212145, Actual: 9236
Model: 706.1749247107141, Actual: 1063
Model: 1795.6100336118561, Actual: 1679
Model: 1684.407134479034, Actual: 1716
Model: 5238.109962981834, Actual: 6204
Model: 837.2326260172208, Actual: 764
Model: 1788.1690187513802, Actual: 1452
Model: 3689.858631467089, Actual: 3676
Model: 2786.638668679694, Actual: 1035
Model: 4577.270572114912, Actual: 4989
Model: 547.4435387117514, Actual: 720
Model: 1235.9058087642939, Actual: 552
Model: 2791.714896985607, Actual: 2030
Model