In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re

# Load the dataset from the local file
file_path = "/content/dataset.json"
data = pd.read_json(file_path)

# Remove duplicates
data.drop_duplicates(inplace=True)

# Handle missing values
data.dropna(inplace=True)

# Clean text data in the "externalStatus" column
data['externalStatus'] = data['externalStatus'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

# Encode labels
label_encoder = LabelEncoder()
data['internalStatusEncoded'] = label_encoder.fit_transform(data['internalStatus'])

# Optionally, split the dataset into training and validation sets
X = data['externalStatus']
y = data['internalStatusEncoded']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the preprocessed data
print("Preprocessed Data:")
print(data.head())

# Export the preprocessed dataset to a CSV file
preprocessed_file_path = "preprocessed_container_events.csv"
data.to_csv(preprocessed_file_path, index=False)
print(f"Preprocessed dataset exported to {preprocessed_file_path}")

# If you want to use X_train, X_val, y_train, y_val for model training, proceed with the next steps accordingly.


FileNotFoundError: File /content/dataset.json does not exist

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Load the preprocessed dataset
data = pd.read_csv("preprocessed_container_events.csv")

# Split the dataset into features (X) and target (y)
X = data['externalStatus']
y = data['internalStatusEncoded']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

# Pad sequences to ensure uniform length
max_length = 100
X_train_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq, maxlen=max_length)
X_val_pad = tf.keras.preprocessing.sequence.pad_sequences(X_val_seq, maxlen=max_length)

# Define the model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(len(data['internalStatusEncoded'].unique()), activation='softmax')
])


# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print the model summary
model.summary()

# Train the model
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_val_pad, y_val))

# Evaluate the model
loss, accuracy = model.evaluate(X_val_pad, y_val)
print(f'Validation Loss: {loss:.4f}')
print(f'Validation Accuracy: {accuracy:.4f}')


# Save the model
model.save("trained_model.h5")



Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 72ms/step - accuracy: 0.2079 - loss: 2.6672 - val_accuracy: 0.5000 - val_loss: 2.5206
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.4241 - loss: 2.5203 - val_accuracy: 0.5000 - val_loss: 2.3305
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.4123 - loss: 2.3691 - val_accuracy: 0.5000 - val_loss: 2.1375
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.4280 - loss: 2.2109 - val_accuracy: 0.5000 - val_loss: 1.9611
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.4162 - loss: 2.1005 - val_accuracy: 0.5000 - val_loss: 1.8479
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.4631 - loss: 1.9976 - val_accuracy: 0.5000 - val_loss: 1.8084
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━



Validation Loss: 1.7964
Validation Accuracy: 0.5000


Collecting fastapi
  Downloading fastapi-0.110.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.1/92.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn
  Downloading uvicorn-0.28.1-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.5/60.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting starlette<0.37.0,>=0.36.3 (from fastapi)
  Downloading starlette-0.36.3-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.5/71.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting h11>=0.8 (from uvicorn)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, uvicorn, starlette, fastapi
Successfully installed fastapi-0.110.0 h11-0.14.0 starlette-0.36.3 uvicorn-0.28.1


In [5]:
import pickle

# Fit tokenizer on training data
tokenizer.fit_on_texts(X_train)

# Save tokenizer to file
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [10]:
uvicorn app:app --reload

SyntaxError: invalid syntax (<ipython-input-10-7f5336593619>, line 1)