In [10]:
# On va essayer de prédire les prix des diamands en machine learning à partir d'une dataset

import pandas as pd

df = pd.read_csv("diamonds.csv", index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [11]:
df["cut"].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [12]:
# df["cut"].astype("category").cat.codes # Une façon simple et désordonnée de transformer les chaines de caractères en nombres (parce qu'en ML, on traite des nombres)

# Ici, on veut garder l'ordre (Premium est mieux que Fair, etc ...)
cut_class_dict = {"Fair":1, "Good":2, "Very Good":3, "Premium":4, "Ideal":5}

# Pareil pour les autres colonnes
clarity_dict = {"I3":1, "I2":2, "I1":3, "SI2":4, "SI1":5, "VS2":6, "VS1":7, "VVS2":8, "VVS1":9, "IF":10, "FL":11} # Ordre trouvable sur Kaggle
color_dict = {"J":0+1, "I":1+1, "H":2+1, "G":3+1, "F":4+1, "E":5+1, "D":6+1} # Comme un con j'ai commencé à 0 au lieu de 1 ==> +1

df['cut'] = df["cut"].map(cut_class_dict)
df['clarity'] = df["clarity"].map(clarity_dict)
df['color'] = df["color"].map(color_dict)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [18]:
import sklearn
from sklearn import svm, preprocessing

# C'est toujours une bonne idée de tout shuffle avant d'entrainer un modèle

df = sklearn.utils.shuffle(df)
X = df.drop('price', axis=1).values # Features
x = preprocessing.scale(X) # Optionnel, mais c'est souvent bien de scaler pour simplifier les données pour un modèle
y = df['price'].values # Labels

test_size = 200

# Pour la phase d'entrainement
X_train = X[:-test_size]
y_train = y[:-test_size]

# Pour la phase de test
X_test = X[-test_size:]
y_test = y[-test_size:]

clf = svm.SVR(kernel="linear") # clf = classifieur. On va faire un kernel linéaire
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [19]:
clf.score(X_test, y_test)

0.8458217526384816

In [21]:
for X, y in zip(X_test, y_test):
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")

Model: 213.01345143204344, Actual: 756
Model: 4389.5601271223295, Actual: 3972
Model: 2785.4669646359052, Actual: 1881
Model: 5413.01751664217, Actual: 6181
Model: 8132.858932017682, Actual: 14068
Model: 3594.470750773571, Actual: 3264
Model: 3331.7731057042165, Actual: 2818
Model: 6089.293107555624, Actual: 4168
Model: 1081.0265481071438, Actual: 1000
Model: 45.25628674665859, Actual: 795
Model: 5629.674144409006, Actual: 6019
Model: 416.30177930154423, Actual: 851
Model: 4927.608989414619, Actual: 4721
Model: 235.74682619270789, Actual: 700
Model: 7304.001590877311, Actual: 8930
Model: 1568.904659094911, Actual: 1223
Model: -113.89229709451865, Actual: 658
Model: 4040.7736851946374, Actual: 3574
Model: 3109.2179496588415, Actual: 3089
Model: 1928.3229616853478, Actual: 1587
Model: 2261.0116337387535, Actual: 1810
Model: 300.2463652271417, Actual: 625
Model: 100.8177729030449, Actual: 520
Model: 4836.755680723416, Actual: 4571
Model: 6256.787259022514, Actual: 6619
Model: 7485.1185118

In [22]:
clf = svm.SVR(kernel="rbf") # clf = classifieur. On va faire un kernel linéaire
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [23]:
clf.score(X_test, y_test)

0.39303633505771207

In [24]:
for X, y in zip(X_test, y_test):
    print(f"Model: {clf.predict([X])[0]}, Actual: {y}")

Model: 2577.665586405208, Actual: 756
Model: 3127.4073593415424, Actual: 3972
Model: 2560.620721232532, Actual: 1881
Model: 5237.060598294904, Actual: 6181
Model: 3817.9133351615455, Actual: 14068
Model: 3056.174805321093, Actual: 3264
Model: 2780.9983549915028, Actual: 2818
Model: 3375.1921655091232, Actual: 4168
Model: 1520.6897090390096, Actual: 1000
Model: 482.65854526456997, Actual: 795
Model: 5027.005174554248, Actual: 6019
Model: 1066.627298354507, Actual: 851
Model: 3646.4526287859358, Actual: 4721
Model: 970.5146272898494, Actual: 700
Model: 4691.826311291521, Actual: 8930
Model: 1982.8291090109883, Actual: 1223
Model: 566.3175879859327, Actual: 658
Model: 3685.2431515320195, Actual: 3574
Model: 3043.772511471777, Actual: 3089
Model: 1744.8106338703487, Actual: 1587
Model: 1999.150626407914, Actual: 1810
Model: 806.1981727421226, Actual: 625
Model: 797.977482414226, Actual: 520
Model: 4255.925925449594, Actual: 4571
Model: 4083.225992865987, Actual: 6619
Model: 4313.6188593779