In [6]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Load the dataset. The first column is assumed to hold free text; every
# remaining column is carried through unchanged.
file_path = 'Machine.csv'
data = pd.read_csv(file_path)

# TfidfVectorizer rejects NaN and non-string documents, so blank cells are
# replaced with an empty string and every value is coerced to str first.
text_data = data.iloc[:, 0].fillna('').astype(str)
rest_of_data = data.iloc[:, 1:]

# Vectorize the text with TF-IDF, keeping at most 1000 vocabulary terms
# (adjust max_features as needed). The sparse matrix is converted to a dense
# array because it is used directly as the autoencoder's input and target.
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_features = vectorizer.fit_transform(text_data).toarray()

# Autoencoder layout: input -> 128 -> encoding_dim -> 128 -> input.
# The input width is the number of TF-IDF columns; the bottleneck width is
# the size of the embedding extracted later.
_, input_dim = tfidf_features.shape
encoding_dim = 10

input_layer = Input(shape=(input_dim,))

# Encoder half: compress down to the bottleneck.
encoder_hidden = Dense(128, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoder_hidden)

# Decoder half: expand the bottleneck back to the input width.
decoder_hidden = Dense(128, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(decoder_hidden)

# Full reconstruction model, used for training.
autoencoder = Model(inputs=input_layer, outputs=decoded)

# Sub-model sharing the same layers, stopping at the bottleneck; used to
# read out the embeddings.
encoder = Model(inputs=input_layer, outputs=encoded)

# Train the autoencoder to reconstruct its own input (input and target are
# the same TF-IDF matrix), holding out 20% of the rows for validation.
autoencoder.compile(loss='mse', optimizer='adam')
autoencoder.fit(
    x=tfidf_features,
    y=tfidf_features,
    batch_size=256,
    epochs=50,
    validation_split=0.2,
)

# Run every row through the encoder to obtain its bottleneck activations.
embeddings = encoder.predict(tfidf_features)

# Combine embeddings with the rest of the data. The non-text columns get a
# fresh 0..n-1 index so they line up row-for-row with the embeddings frame.
embedding_columns = [f'emb_{i+1}' for i in range(encoding_dim)]
embeddings_df = pd.DataFrame(embeddings, columns=embedding_columns)
final_data = pd.concat([embeddings_df, rest_of_data.reset_index(drop=True)], axis=1)

# Save the combined data to CSV. The path is held in one variable so the
# confirmation message always names the file that was actually written.
output_path = 'machine_embeddings_from_autoencoder.csv'
final_data.to_csv(output_path, index=False)

print(f"Data with embeddings saved to '{output_path}'")

Epoch 1/50


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 0.2407 - val_loss: 0.2383
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.2390 - val_loss: 0.2366
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.2373 - val_loss: 0.2350
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 0.2355 - val_loss: 0.2332
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.2336 - val_loss: 0.2313
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.2316 - val_loss: 0.2293
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.2295 - val_loss: 0.2272
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 0.2272 - val_loss: 0.2250
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms