In [2]:
import pandas as pd

df = pd.read_csv("Desktop/diamonds.csv", index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [4]:
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}

In [5]:
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [6]:
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}

In [7]:
df['cut'] = df['cut'].map(cut_class_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [8]:
import sklearn
from sklearn.linear_model import SGDRegressor

df = sklearn.utils.shuffle(df) # always shuffle your data to avoid any biases that may emerge b/c of some order.

X = df.drop("price", axis=1).values
y = df["price"].values

In [9]:
test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

In [10]:
clf = SGDRegressor(max_iter=1000)
clf.fit(X_train, y_train)

print(clf.score(X_test, y_test))

-268705301.9693671


In [11]:
for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

61700322.60719919 6968
42514900.22675085 4332
57473726.298755646 9900
51013067.063538074 2513
58359751.16225386 10428
19982333.36271286 628
56514108.35135555 11901
61120097.38359499 1247
31987661.98994541 1890
51493959.66968632 1446


In [12]:
from sklearn import svm

clf = svm.SVR()

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

-0.1296401662127522


In [13]:
for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

2408.9245571348947 6968
2494.1615603668592 4332
2487.092831156441 9900
2385.8355881604166 2513
2512.6590924478173 10428
2296.015751253337 628
2515.1885111015235 11901
2406.287636038493 1247
2346.9956442753964 1890
2441.01288616581 1446


In [14]:
clf = SGDRegressor(max_iter=10000)

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

-31235395.01886589
-14755475.154144287 6968
-3286975.41766572 4332
-13385371.702237368 9900
-7549317.102905512 2513
-14309170.527423382 10428
9729363.043242693 628
-11935428.429179907 11901
-16032270.142039537 1247
2996565.7110373974 1890
-11152559.600166798 1446


In [15]:
import sklearn
from sklearn import svm, preprocessing

df = sklearn.utils.shuffle(df) # always shuffle your data to avoid any biases that may emerge b/c of some order.

X = df.drop("price", axis=1).values
X = preprocessing.scale(X)
y = df["price"].values

test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

clf = svm.SVR()

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

for X,y in list(zip(X_test, y_test))[:10]:
    print(f"model predicts {clf.predict([X])[0]}, real value: {y}")

0.5670510809009672
model predicts 2948.2868136698257, real value: 2698
model predicts 3898.122625320537, real value: 3394
model predicts 1450.8514805274556, real value: 489
model predicts 3349.4997318232668, real value: 3499
model predicts 1736.6119289478836, real value: 1619
model predicts 3425.4700713917323, real value: 3763
model predicts 6114.17623415269, real value: 17712
model predicts 4698.33973754814, real value: 4370
model predicts 6802.727527258594, real value: 8887
model predicts 4090.8108527786762, real value: 3408
