In [2]:
# Python > 3.8
import pandas as pd
import numpy  as np

import sklearn
from sklearn import linear_model
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import cm
import seaborn as sns
import tensorflow as tf

import json
import pickle

In [2]:
def GET_METRICS_SINGLE(y_test, y_pred) -> None:
    """Print RMSE and R2 for one set of predictions.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values; passed together with ``y_test``
            to the sklearn metric functions.

    Returns:
        None. Both metrics are written to stdout.
    """
    mse_value = metrics.mean_squared_error(y_test, y_pred)
    # Both metrics are computed before anything is printed.
    report = (
        ("RMSE: {:>9,.6f} (кв. корень из среднеквадратичной ошибки)", np.sqrt(mse_value)),
        ("R2  : {:>9,.3f} (коэфф. детерминации)", metrics.r2_score(y_test, y_pred)),
    )
    for template, value in report:
        print(template.format(value))
#--------------------------------------------------------------------------

In [3]:
# Load the prepared diamonds table; the first CSV row holds the column names.
df = pd.read_csv(
    './diamonds_nan_PREPARED3.csv',  # path to the data file
    sep=',',                         # field delimiter
    header=0,                        # zero-based row number of the header line
    # pass header=None instead when the file has no header row
)
print("Размер таблицы", df.shape)
df.head(2)

Размер таблицы (46113, 10)


Unnamed: 0,x,y,z,cut,color,clarity,table,carat,depth,price
0,4.54,4.59,2.78,4,1,4,57.0,0.35,60.9,552.0
1,4.23,4.27,2.66,3,6,2,59.0,0.3,62.6,552.0


In [4]:
# Columns used as model inputs (predictors).
features = [
    'x', 'y', 'z',
    'cut', 'color', 'clarity',
    'table', 'carat', 'depth',
]
# Column to predict; kept as a list so that df[target] yields a DataFrame.
target = ['price']

In [20]:
# Separate the loaded table into the predictor frame and the target frame.
dfX, dfY = df[features], df[target]

In [6]:
valid_size = 0.3  # fraction of rows held out for testing
rand_seed = 8     # seed for the shuffling random generator

# Shuffled hold-out split of predictors and target (unscaled values).
xTrain, xTest, yTrain, yTest = train_test_split(
    dfX[features], dfY[target],
    test_size=valid_size, random_state=rand_seed, shuffle=True,
)

In [8]:
# Linear regression on three predictors: x, cut and carat.
linreg_x1x2x3 = linear_model.LinearRegression()

# fit() returns the estimator itself, so both names refer to the same fitted model.
lin_model_x1x2x3 = linreg_x1x2x3.fit(xTrain[['x', 'cut', 'carat']], yTrain[['price']])

In [9]:
# Predict prices for the held-out rows using the same three predictors as in training.
y_pred_x1x2x3 = lin_model_x1x2x3.predict(
    xTest[['x', 'cut', 'carat']]
)

In [10]:
# Report hold-out RMSE and R2 of the three-feature linear model.
GET_METRICS_SINGLE(y_test=yTest, y_pred=y_pred_x1x2x3)

RMSE: 944.514532 (кв. корень из среднеквадратичной ошибки)
R2  :     0.855 (коэфф. детерминации)


In [11]:
# Metadata record for the three-feature linear model.
# NOTE(review): "normNeed"/"standartNeed" presumably tell a consumer whether inputs
# must be normalized/standardized before prediction — confirm with the reader of m1.json.
m1_dist = {
    "type": str(type(lin_model_x1x2x3)),
    "normNeed": False,
    "standartNeed": False,
    "input": ['x', 'cut', 'carat'],
    "output": ['price'],
    # Hold-out metrics, computed on unscaled prices.
    "R2": metrics.r2_score(yTest[['price']], y_pred_x1x2x3),
    "RMSE": np.sqrt(metrics.mean_squared_error(yTest[['price']], y_pred_x1x2x3)),
    "trainData": "diamonds_nan_PREPARED3.csv",
    "modelName": "m1.dump",
}

In [12]:
# Write the m1 metadata as JSON. The context manager closes the file even if
# serialization raises, which the manual open()/close() pair did not guarantee.
with open("./m1.json", "w") as m1_file:
    json.dump(m1_dist, m1_file, indent=4)

In [13]:
# Pickle the fitted linear model. The context manager closes the file even if
# pickling raises.
with open("./m1.dump", "wb") as m1_out:
    pickle.dump(lin_model_x1x2x3, m1_out)

In [14]:
# Read the m1 metadata back from disk; the file is closed on exit from the
# `with` block, including when parsing fails.
with open("m1.json", "r") as m1_file:
    m1_dist = json.load(m1_file)

In [15]:
# Restore the pickled linear model.
# SECURITY: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open("m1.dump", "rb") as m1_out:
    m1 = pickle.load(m1_out)

In [21]:
# One-row frame holding the three inputs used by m1 (x, cut, carat).
dfX_test = pd.DataFrame(
    data=[[4.54, 4, 0.35]],
    columns=['x', 'cut', 'carat'],
)

In [22]:
# Score the hand-built sample with the model restored from disk.
y_pred = m1.predict(X=dfX_test)

In [23]:
# Display the predicted price (unscaled, since m1 was trained on raw prices).
y_pred

array([[871.24279611]])

In [24]:
# Min-max scale every predictor column.
# NOTE(review): the scaler is fitted on the full table, i.e. before the
# train/test split, so test rows influence the scaling (data leakage) —
# confirm this is acceptable for the exercise.
scalerNormX = MinMaxScaler().fit(dfX)

dfXNorm = pd.DataFrame(
    scalerNormX.transform(dfX),  # scaled cell values
    index=dfX.index,             # keep the original row labels
    columns=dfX.columns,         # keep the original column names
)
print("Размер таблицы", dfXNorm.shape)
dfXNorm.head(2)

Размер таблицы (46113, 9)


Unnamed: 0,x,y,z,cut,color,clarity,table,carat,depth
0,0.104439,0.019459,0.087421,1.0,0.166667,0.571429,0.4,0.033333,0.508523
1,0.023499,0.007928,0.083648,0.75,1.0,0.285714,0.6,0.0,0.556818


In [25]:
# Min-max scale the target column with a separate scaler, so predictions can be
# mapped back to prices through its inverse_transform.
# NOTE(review): fitted on the full table before the train/test split, so test
# targets influence the scaling — confirm this is acceptable for the exercise.
scalerNormY = MinMaxScaler().fit(dfY)

dfYNorm = pd.DataFrame(
    scalerNormY.transform(dfY),  # scaled cell values
    index=dfY.index,             # keep the original row labels
    columns=dfY.columns,         # keep the original column names
)
print("Размер таблицы", dfYNorm.shape)
dfYNorm.head(2)

Размер таблицы (46113, 1)


Unnamed: 0,price
0,0.01015
1,0.01015


In [26]:
# Sanity check: show the minimum and maximum of every scaled predictor column.
dfXNorm.describe().loc[["min", "max"], :]

Unnamed: 0,x,y,z,cut,color,clarity,table,carat,depth
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
# Sanity check: show the minimum and maximum of the scaled target column.
dfYNorm.describe().loc[["min", "max"], :]

Unnamed: 0,price
min,0.0
max,1.0


In [28]:
# Settings for the train/test split of the normalized data.
valid_size = 0.3  # fraction of rows held out for testing
rand_seed = 8     # initial state of the random number generator

In [29]:
# Shuffled hold-out split of the normalized predictors and normalized target.
xNorm_train, xNorm_test, yNorm_train, yNorm_test = train_test_split(
    dfXNorm[features],          # normalized inputs X
    dfYNorm[target],            # normalized target y
    test_size=valid_size,       # fraction of rows held out for testing
    random_state=rand_seed,     # initial state of the random number generator
    shuffle=True,               # shuffle rows before splitting
)

In [30]:
# Per-epoch loss histories; the training cell appends to them after each fit().
totalHistoryLossTrain = []
totalHistoryLossTest = []

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation is repeatable between runs of the notebook.
tf.keras.utils.set_random_seed(rand_seed)

# Fully connected regressor: inputs -> 30 sigmoid -> 30 sigmoid -> 1 linear output.
model = tf.keras.models.Sequential()

# Input width follows the feature list instead of a hard-coded 9.
model.add(tf.keras.layers.Input(shape=(len(features),)))

# Hidden layers: Dense without activation, followed by an explicit sigmoid layer.
model.add(tf.keras.layers.Dense(units=30, activation=None))
model.add(tf.keras.layers.Activation(activation=tf.keras.activations.sigmoid))

model.add(tf.keras.layers.Dense(units=30, activation=None))
model.add(tf.keras.layers.Activation(activation=tf.keras.activations.sigmoid))

# Single linear output unit for the (normalized) price.
model.add(tf.keras.layers.Dense(units=1, activation=None))

fLoss = tf.keras.losses.MeanSquaredError()
fOptimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
# Use the metric class rather than a loss object when tracking MSE as a metric.
fMetric = tf.keras.metrics.MeanSquaredError()

model.compile(
    loss=fLoss,
    optimizer=fOptimizer,
    metrics=[fMetric],
)

In [31]:
epochForTrain = 500

# Train on the normalized training split and evaluate on the normalized test
# split after every epoch. Columns are selected through `features` (the same
# list used to build the split) rather than a repeated literal list, so the
# inputs cannot drift from the feature definition.
history = model.fit(
    xNorm_train[features],
    yNorm_train,
    validation_data=(xNorm_test[features], yNorm_test),
    epochs=epochForTrain,
    batch_size=1000,
    verbose=1,
)

# Append this run's per-epoch losses to the running histories; 'val_loss' is
# only present when validation data was supplied.
totalHistoryLossTrain.extend(history.history['loss'])
totalHistoryLossTest.extend(history.history.get('val_loss', []))

Epoch 1/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 0.1937 - mean_squared_error: 0.1937 - val_loss: 0.0514 - val_mean_squared_error: 0.0514
Epoch 2/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0464 - mean_squared_error: 0.0464 - val_loss: 0.0299 - val_mean_squared_error: 0.0299
Epoch 3/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0260 - mean_squared_error: 0.0260 - val_loss: 0.0162 - val_mean_squared_error: 0.0162
Epoch 4/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0150 - mean_squared_error: 0.0150 - val_loss: 0.0116 - val_mean_squared_error: 0.0116
Epoch 5/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0110 - mean_squared_error: 0.0110 - val_loss: 0.0092 - val_mean_squared_error: 0.0092
Epoch 6/500
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0

In [32]:
# Predict on the normalized test split; columns come from `features` so the
# selection stays in sync with the list used for the split and the model input.
yNorm_pred = model.predict(xNorm_test[features])

# Metrics are computed on normalized targets, so the RMSE here is in scaled
# units, not in price units.
GET_METRICS_SINGLE(yNorm_test, yNorm_pred)

[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
RMSE:  0.038122 (кв. корень из среднеквадратичной ошибки)
R2  :     0.976 (коэфф. детерминации)


In [33]:
# Metadata record for the neural-network model.
m2_dist = {
    "type": str(type(model)),
    "normNeed": True,
    "standartNeed": False,
    # File names of the pickled min-max scalers for inputs and target.
    "scalerNormForX": "scalerNormForX.dump",
    "scalerNormForY": "scalerNormForY.dump",
    "input": ['x', 'y', 'z', 'cut', 'color', 'clarity', 'table', 'carat', 'depth'],
    "output": ['price'],
    # Hold-out metrics computed on normalized targets (RMSE is in scaled units).
    "R2": metrics.r2_score(yNorm_test, yNorm_pred),
    "RMSE": np.sqrt(metrics.mean_squared_error(yNorm_test, yNorm_pred)),
    "trainData": "diamonds_nan_PREPARED3.csv",
    # Fixed: the model is saved and loaded as "m2.keras"; "m2.dump" is never written.
    "modelName": "m2.keras",
}

In [34]:
# Write the m2 metadata as JSON. The context manager closes the file even if
# serialization raises.
with open("./m2.json", "w") as m2_file:
    json.dump(m2_dist, m2_file, indent=4)

In [35]:
# Pickle both fitted scalers. Each file is closed on exit from its `with`
# block, including when pickling raises.
with open("./scalerNormForX.dump", "wb") as snx_file:
    pickle.dump(scalerNormX, snx_file)
with open("./scalerNormForY.dump", "wb") as sny_file:
    pickle.dump(scalerNormY, sny_file)

In [36]:
# Persist the trained network to a single .keras file.
model.save(filepath="m2.keras")

In [3]:
# Read the m2 metadata back from disk; the file is closed on exit from the
# `with` block, including when parsing fails.
with open("m2.json", "r") as m2_file:
    m2_dist = json.load(m2_file)

In [4]:
# Restore the trained network from the saved .keras file.
m2 = tf.keras.models.load_model(filepath="m2.keras")

In [9]:
# Restore both pickled scalers.
# SECURITY: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open("scalerNormForX.dump", "rb") as snx_file:
    snx = pickle.load(snx_file)
with open("scalerNormForY.dump", "rb") as sny_file:
    sny = pickle.load(sny_file)

In [10]:
# One-row frame with all nine inputs used by m2, in training column order.
dfX_test = pd.DataFrame(
    data=[[4.54, 4.59, 2.78, 4, 1, 4, 57, 0.35, 60.9]],
    columns=['x', 'y', 'z', 'cut', 'color', 'clarity', 'table', 'carat', 'depth'],
)

In [11]:
# Scale the sample with the restored X scaler, then predict; the result is
# still in normalized target units.
y_pred = m2.predict(
    snx.transform(dfX_test)
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step


In [14]:
# Map the normalized prediction back to price units with the restored Y scaler.
sny.inverse_transform(
    y_pred
)

array([[647.621]], dtype=float32)