In [63]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("artDataset.csv")

# Assuming 'Price' is the target variable
X = df[['artist', 'title', 'signed', 'condition', 'period', 'movement']]  # Features
y = df['price']  # Target variable

# Encoding categorical variables if needed
label_encoders = {}
for column in X.columns:
    if X[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        X[column] = label_encoders[column].fit_transform(X[column])

# Convert target variable to numeric format
y = y.str.replace(' USD', '').astype(float)  # Remove ' USD' and convert to float

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluation
train_rmse = mean_squared_error(y_train, y_pred_train, squared=False)
test_rmse = mean_squared_error(y_test, y_pred_test, squared=False)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print("Train R^2 score:", r2_train)
print("Test R^2 score:", r2_test)






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = label_encoders[column].fit_transform(X[column])


Train RMSE: 307.23769286690396
Test RMSE: 303.92076241926225
Train R^2 score: 0.15266602724488898
Test R^2 score: 0.019716829445878825


In [105]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError
import zipfile
import os

# Extract images from zip file
with zipfile.ZipFile('archive.zip', 'r') as zip_ref:
    zip_ref.extractall('art_dataset_extracted')

# Load dataset
df = pd.read_csv("artDataset.csv")

# Assuming 'Price' is the target variable
X_textual = df[['artist', 'title', 'signed', 'condition', 'period', 'movement']].values  # Textual features
X_numeric = df.select_dtypes(include=np.number).values  # Numerical features
y = df['price'].values  # Target variable

# Encoding categorical variables
label_encoders = {}
for column in df.select_dtypes(include='object').columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Split dataset into train and test sets
X_train_textual, X_test_textual, X_train_numeric, X_test_numeric, y_train, y_test = train_test_split(
    X_textual, X_numeric, y, test_size=0.2, random_state=42
)

# Image data generator
datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

# Define the image directory
image_directory = "art_dataset_extracted/imgDataset"  # Path to the directory containing images

# Generate file paths for images
# Generate file paths for images
df['Image_Path'] = 'image_' + (df.index + 1).astype(str) + '.png'  # Assuming image file extensions are '.jpg'

# Splitting the dataset into train and validation sets
train_generator = datagen.flow_from_dataframe(
    dataframe=df,
    directory=image_directory,
    x_col='Image_Path',  # Column containing the file paths of the images
    y_col="price",
    target_size=(224, 224),
    batch_size=32,
    class_mode='raw',
    subset='training'
)

validation_generator = datagen.flow_from_dataframe(
    dataframe=df,
    directory=image_directory,
    x_col='Image_Path',  # Column containing the file paths of the images
    y_col="price",
    target_size=(224, 224),
    batch_size=32,
    class_mode='raw',
    subset='validation'
)

# CNN model
image_input = Input(shape=(224, 224, 3))
conv1 = Conv2D(32, kernel_size=(3, 3), activation='relu')(image_input)
pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
conv2 = Conv2D(64, kernel_size=(3, 3), activation='relu')(pool1)
pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
flatten = Flatten()(pool2)

# Numerical and textual input
numeric_input = Input(shape=(X_numeric.shape[1],))
textual_input = Input(shape=(X_textual.shape[1],))

# Combine all inputs
combined_input = Concatenate()([flatten, numeric_input, textual_input])

# Fully connected layers
dense1 = Dense(128, activation='relu')(combined_input)
output = Dense(1)(dense1)

# Model
model = Model(inputs=[image_input, numeric_input, textual_input], outputs=output)

# Compile model
model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=[RootMeanSquaredError()])

# # Train model
# history = model.fit(
#     [train_generator, X_train_numeric, X_train_textual],
#     y_train,
#     validation_data=([validation_generator, X_test_numeric, X_test_textual], y_test),
#     epochs=10,
#     batch_size=32
# )


Found 0 validated image filenames.
Found 0 validated image filenames.




In [104]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError
import zipfile
import os

# Extract images from zip file
with zipfile.ZipFile('archive.zip', 'r') as zip_ref:
    zip_ref.extractall('art_dataset_extracted')

# Load dataset
df = pd.read_csv("artDataset.csv")

# Assuming 'Price' is the target variable
X_textual = df[['artist', 'title', 'signed', 'condition', 'period', 'movement']].values  # Textual features
X_numeric = df.select_dtypes(include=np.number).values  # Numerical features
y = df['price'].values  # Target variable

# Encoding categorical variables
label_encoders = {}
for column in df.select_dtypes(include='object').columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Split dataset into train and test sets
X_train_textual, X_test_textual, X_train_numeric, X_test_numeric, y_train, y_test = train_test_split(
    X_textual, X_numeric, y, test_size=0.2, random_state=42
)

# Image data generator
datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

# Define the image directory
image_directory = "art_dataset_extracted/imgDataset"  # Path to the directory containing images

# Generate file paths for images
df['Image_Path'] = 'image_' + (df.index + 1).astype(str) + '.jpg'  # Assuming image file extensions are '.jpg'

# Splitting the dataset into train and validation sets
train_generator = datagen.flow_from_dataframe(
    dataframe=df,
    directory=image_directory,
    x_col='Image_Path',  # Column containing the file paths of the images
    y_col="price",
    target_size=(224, 224),
    batch_size=32,
    class_mode='raw',
    subset='training'
)

validation_generator = datagen.flow_from_dataframe(
    dataframe=df,
    directory=image_directory,
    x_col='Image_Path',  # Column containing the file paths of the images
    y_col="price",
    target_size=(224, 224),
    batch_size=32,
    class_mode='raw',
    subset='validation'
)

# CNN model
image_input = Input(shape=(224, 224, 3))
conv1 = Conv2D(32, kernel_size=(3, 3), activation='relu')(image_input)
pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
conv2 = Conv2D(64, kernel_size=(3, 3), activation='relu')(pool1)
pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
flatten = Flatten()(pool2)

# Numerical and textual input
numeric_input = Input(shape=(X_numeric.shape[1],))
textual_input = Input(shape=(X_textual.shape[1],))

# Combine all inputs
combined_input = Concatenate()([flatten, numeric_input, textual_input])

# Fully connected layers
dense1 = Dense(128, activation='relu')(combined_input)
output = Dense(1)(dense1)

# Model
model = Model(inputs=[image_input, numeric_input, textual_input], outputs=output)

# Compile model
model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=[RootMeanSquaredError()])

# Train model
history = model.fit(
    [train_generator.next()[0], X_train_numeric, X_train_textual],
    train_generator.next()[1],
    validation_data=([validation_generator.next()[0], X_test_numeric, X_test_textual], validation_generator.next()[1]),
    epochs=10,
    batch_size=32
)


Found 0 validated image filenames.
Found 0 validated image filenames.




ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Load dataset
df = pd.read_csv("artDataset.csv")

# Assuming 'Price' is the target variable
X = df[['artist', 'title', 'signed', 'condition', 'period', 'movement']]  # Features
y = df['price']  # Target variable

# Encoding categorical variables if needed
label_encoders = {}
for column in X.columns:
    if X[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        X[column] = label_encoders[column].fit_transform(X[column])

# Scale features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Convert target variable to numpy array
y = y.str.replace(' USD', '').astype(float).values  # Remove ' USD' and convert to float

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Reshape input data for LSTM
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

# Define LSTM model
model = Sequential()
model.add(LSTM(units=50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1)

# Predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Inverse scaling for predicted values
y_pred_train = scaler.inverse_transform(y_pred_train)
y_pred_test = scaler.inverse_transform(y_pred_test)

# Evaluation
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print("Train R^2 score:", r2_train)
print("Test R^2 score:", r2_test)








A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = label_encoders[column].fit_transform(X[column])



Epoch 1/50

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


ValueError: non-broadcastable output operand with shape (603,1) doesn't match the broadcast shape (603,6)

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Load dataset
df = pd.read_csv("artDataset.csv")

# Assuming 'Price' is the target variable
X = df[['artist', 'title', 'signed', 'condition', 'period', 'movement']]  # Features
y = df['price']  # Target variable

# Encoding categorical variables if needed
label_encoders = {}
for column in X.columns:
    if X[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        X[column] = label_encoders[column].fit_transform(X[column])

# Scale features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Convert target variable to numpy array
y = y.str.replace(' USD', '').astype(float).values  # Remove ' USD' and convert to float

# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Reshape input data for LSTM
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Define LSTM model
model = Sequential()
model.add(LSTM(units=50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(units=1))  # Adjusted output units to 1

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1)

# Predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Inverse scaling for predicted values
y_pred_train = scaler.inverse_transform(y_pred_train)
y_pred_test = scaler.inverse_transform(y_pred_test)

# Evaluation
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print("Train R^2 score:", r2_train)
print("Test R^2 score:", r2_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = label_encoders[column].fit_transform(X[column])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


ValueError: non-broadcastable output operand with shape (603,1) doesn't match the broadcast shape (603,6)

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping

# Load dataset
data = pd.read_csv('artDataset.csv')

# Preprocess the Data
data.drop('Unnamed: 0', axis=1, inplace=True)
data['price'] = data['price'].str.extract('(\d+\.\d+)')[0].astype(float)
data['artist'].fillna('Unknown Artist', inplace=True)
data['yearCreation'] = data['yearCreation'].str.extract('(\d{4})').fillna('Unknown')

# Encode Categorical Variables
encoder = OneHotEncoder()
categorical_columns = ['artist', 'title', 'signed', 'condition', 'period', 'movement', 'yearCreation']
encoded_data = encoder.fit_transform(data[categorical_columns])
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names_out())
data = pd.concat([data.drop(categorical_columns, axis=1), encoded_df], axis=1)

# Split the Data into Training and Testing Sets
X = data.drop('price', axis=1)
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape input data for LSTM
X_train = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))

# Define LSTM model
model = Sequential()
model.add(LSTM(units=50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(units=1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Define early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping], verbose=1)

# Predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluation
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print("Train R^2 score:", r2_train)
print("Test R^2 score:", r2_test)


AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names_out'