In [11]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Load data from CSV
file_path = 'ApplicationData/TestDFFT/TestDfft(Quartz-Corona).csv'
data = pd.read_csv(file_path)


def _as_text(column):
    """Return *column* with missing cells as '' and every other cell as str.

    read_csv turns empty cells into NaN, and TfidfVectorizer rejects NaN (and
    any other non-string) documents, so the text columns are normalised here
    before they are vectorized. Columns that already hold only strings come
    back unchanged.
    """
    return column.map(lambda cell: '' if pd.isna(cell) else str(cell))


# Assuming the first two columns contain text data; every column from the
# third onwards is carried through untouched.
text_data_col1 = _as_text(data.iloc[:, 0])
text_data_col2 = _as_text(data.iloc[:, 1])
rest_of_data = data.iloc[:, 2:]

# One TF-IDF vectorizer per text column, so each column gets its own
# vocabulary. max_features is the vocabulary cap; adjust as needed.
vectorizer_col1 = TfidfVectorizer(max_features=1000)
vectorizer_col2 = TfidfVectorizer(max_features=500)

# Fit and transform the first column, then densify the sparse result so it can
# be fed to the Dense layers below.
tfidf_sparse_col1 = vectorizer_col1.fit_transform(text_data_col1)
tfidf_features_col1 = tfidf_sparse_col1.toarray()

# Same for the second column.
tfidf_sparse_col2 = vectorizer_col2.fit_transform(text_data_col2)
tfidf_features_col2 = tfidf_sparse_col2.toarray()

# Autoencoder for the first text column:
#   TF-IDF vector -> 128 -> 10 (embedding) -> 128 -> TF-IDF vector
encoding_dim_col1 = 10
input_dim_col1 = tfidf_features_col1.shape[1]

input_layer_col1 = Input(shape=(input_dim_col1,))

# Encoder half: the second Dense layer is the bottleneck whose activations are
# later used as the embedding.
hidden_enc_col1 = Dense(128, activation='relu')(input_layer_col1)
encoded_col1 = Dense(encoding_dim_col1, activation='relu')(hidden_enc_col1)

# Decoder half: expands the bottleneck back to the input width.
hidden_dec_col1 = Dense(128, activation='relu')(encoded_col1)
decoded_col1 = Dense(input_dim_col1, activation='sigmoid')(hidden_dec_col1)

# Full model (input -> reconstruction) and the encoder-only view of it
# (input -> bottleneck). Both are built from the same layer objects, so
# fitting the autoencoder also fits the encoder.
autoencoder_col1 = Model(inputs=input_layer_col1, outputs=decoded_col1)
encoder_col1 = Model(inputs=input_layer_col1, outputs=encoded_col1)

# Train to reconstruct the input from itself; the last 20% of rows are held
# out for validation.
autoencoder_col1.compile(optimizer='adam', loss='mse')
autoencoder_col1.fit(
    x=tfidf_features_col1,
    y=tfidf_features_col1,
    epochs=50,
    batch_size=256,
    validation_split=0.2,
)

# Autoencoder for the second text column:
#   TF-IDF vector -> 128 -> 5 (embedding) -> 128 -> TF-IDF vector
encoding_dim_col2 = 5
input_dim_col2 = tfidf_features_col2.shape[1]

input_layer_col2 = Input(shape=(input_dim_col2,))

# Encoder half: the second Dense layer is the bottleneck whose activations are
# later used as the embedding.
hidden_enc_col2 = Dense(128, activation='relu')(input_layer_col2)
encoded_col2 = Dense(encoding_dim_col2, activation='relu')(hidden_enc_col2)

# Decoder half: expands the bottleneck back to the input width.
hidden_dec_col2 = Dense(128, activation='relu')(encoded_col2)
decoded_col2 = Dense(input_dim_col2, activation='sigmoid')(hidden_dec_col2)

# Full model (input -> reconstruction) and the encoder-only view of it
# (input -> bottleneck). Both are built from the same layer objects, so
# fitting the autoencoder also fits the encoder.
autoencoder_col2 = Model(inputs=input_layer_col2, outputs=decoded_col2)
encoder_col2 = Model(inputs=input_layer_col2, outputs=encoded_col2)

# Train to reconstruct the input from itself; the last 20% of rows are held
# out for validation.
autoencoder_col2.compile(optimizer='adam', loss='mse')
autoencoder_col2.fit(
    x=tfidf_features_col2,
    y=tfidf_features_col2,
    epochs=50,
    batch_size=256,
    validation_split=0.2,
)

# Extract embeddings: run every row through the trained encoders to get the
# bottleneck activations (10 values for column 1, 5 for column 2).
embeddings_col1 = encoder_col1.predict(tfidf_features_col1)
embeddings_col2 = encoder_col2.predict(tfidf_features_col2)

# Convert embeddings to DataFrames with 1-based column names
# (emb_col1_1 ... emb_col1_10, emb_col2_1 ... emb_col2_5).
embeddings_df_col1 = pd.DataFrame(
    embeddings_col1,
    columns=[f'emb_col1_{i+1}' for i in range(encoding_dim_col1)],
)
embeddings_df_col2 = pd.DataFrame(
    embeddings_col2,
    columns=[f'emb_col2_{i+1}' for i in range(encoding_dim_col2)],
)

# Combine embeddings with the rest of the data. The embedding frames carry a
# fresh 0..n-1 index, so the remaining columns are re-indexed the same way to
# line up row by row.
final_data = pd.concat(
    [embeddings_df_col1, embeddings_df_col2, rest_of_data.reset_index(drop=True)],
    axis=1,
)

# Save the combined data to CSV. The target directory is created first so a
# missing folder cannot throw away the training results at the very last step.
output_path = 'EncoderData/TestDFFT/TestDfft(Quartz-Corona)_encoding.csv'
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
final_data.to_csv(output_path, index=False)

# Report the file that was actually written.
print(f"Data with embeddings saved to '{output_path}'")


Epoch 1/50


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 0.2537 - val_loss: 0.2496
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 0.2496 - val_loss: 0.2461
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.2461 - val_loss: 0.2424
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.2424 - val_loss: 0.2387
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 0.2387 - val_loss: 0.2352
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.2352 - val_loss: 0.2316
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.2316 - val_loss: 0.2280
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 0.2280 - val_loss: 0.2241
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms