In [14]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import scipy.sparse as sp
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Nadam

# Step 1: Load the training and test datasets
train_data = pd.read_csv('/content/drive/MyDrive/login/Train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/login/Test.csv')

# Step 2: Clean and preprocess ENTITY_DESCRIPTION
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove non-alphanumeric characters
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    # Convert to lowercase and strip whitespace
    text = text.lower().strip()
    return text

train_data['ENTITY_DESCRIPTION_CLEAN'] = train_data['ENTITY_DESCRIPTION'].apply(clean_text)
test_data['ENTITY_DESCRIPTION_CLEAN'] = test_data['ENTITY_DESCRIPTION'].apply(clean_text)

# Step 3: Additional feature - Length of the description
train_data['DESCRIPTION_LENGTH'] = train_data['ENTITY_DESCRIPTION_CLEAN'].apply(lambda x: len(x.split()))
test_data['DESCRIPTION_LENGTH'] = test_data['ENTITY_DESCRIPTION_CLEAN'].apply(lambda x: len(x.split()))

# Step 4: Vectorize ENTITY_DESCRIPTION using TF-IDF
tfidf = TfidfVectorizer(max_features=300, ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(train_data['ENTITY_DESCRIPTION_CLEAN'])
test_tfidf = tfidf.transform(test_data['ENTITY_DESCRIPTION_CLEAN'])

# Step 5: Scale ENTITY_LENGTH and DESCRIPTION_LENGTH separately
scaler_length = StandardScaler()
train_data['ENTITY_LENGTH_SCALED'] = scaler_length.fit_transform(train_data[['ENTITY_LENGTH']])

scaler_description = StandardScaler()
train_data['DESCRIPTION_LENGTH_SCALED'] = scaler_description.fit_transform(train_data[['DESCRIPTION_LENGTH']])
test_data['DESCRIPTION_LENGTH_SCALED'] = scaler_description.transform(test_data[['DESCRIPTION_LENGTH']])

# Step 6: Combine TF-IDF features and additional features
# Convert sparse matrices to dense only if memory allows
try:
    train_combined = sp.hstack([train_tfidf, train_data[['DESCRIPTION_LENGTH_SCALED']]]).toarray()
    test_combined = sp.hstack([test_tfidf, test_data[['DESCRIPTION_LENGTH_SCALED']]]).toarray()
except MemoryError:
    print("MemoryError: Consider reducing 'max_features' in TfidfVectorizer or using PCA.")

# Optional Step 7: Apply PCA for dimensionality reduction (if needed)
# Uncomment the following lines if you face memory issues
# pca = PCA(n_components=100, random_state=42)
# train_combined = pca.fit_transform(train_combined)
# test_combined = pca.transform(test_combined)

# Prepare the input features and target variable
X_train = train_combined
y_train = train_data['ENTITY_LENGTH_SCALED']

# Step 8: Split the data for training and validation
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 9: Define the optimized neural network model
def create_optimized_model(input_dim):
    model = Sequential()

    # Input Layer
    model.add(Dense(256, input_dim=input_dim))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.3))

    # Hidden Layer 1
    model.add(Dense(128))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.3))

    # Hidden Layer 2
    model.add(Dense(64))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.3))

    # Hidden Layer 3
    model.add(Dense(32))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.3))

    # Output Layer
    model.add(Dense(1))

    # Compile the model with Nadam optimizer and a suitable learning rate
    optimizer = Nadam(learning_rate=0.002)
    model.compile(optimizer=optimizer, loss='mean_absolute_percentage_error')

    return model

# Step 10: Create the model
input_dim = X_tr.shape[1]
model = create_optimized_model(input_dim)

# Step 11: Define callbacks for early stopping and learning rate reduction
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)

# Step 12: Train the neural network model
print("Training the neural network model...")
history = model.fit(
    X_tr, y_tr,
    epochs=10,  # As requested
    batch_size=256,  # Larger batch size for speed
    validation_data=(X_val, y_val),
    callbacks=[early_stop, lr_reduce],
    verbose=1
)

# Step 13: Predict on the validation set and test set
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(test_combined)

# Step 14: Inverse transform the predictions for ENTITY_LENGTH
predictions_original_scale = scaler_length.inverse_transform(y_test_pred).flatten()

# Step 15: Save predictions to a CSV file
test_data['PREDICTED_ENTITY_LENGTH'] = predictions_original_scale
test_data[['ENTITY_ID', 'PREDICTED_ENTITY_LENGTH']].to_csv('neural_network_predictions_optimized.csv', index=False)
print("Predictions saved to 'neural_network_predictions_optimized.csv'.")

# Step 16: Calculate MAPE for the validation data
mape = mean_absolute_percentage_error(y_val, y_val_pred)
print(f"Mean Absolute Percentage Error (MAPE) on Validation Data: {mape:.2f}%")


  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training the neural network model...
Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - loss: 429.2105 - val_loss: 120.0439 - learning_rate: 0.0020
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - loss: 172.2469 - val_loss: 177.1684 - learning_rate: 0.0020
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - loss: 135.8914 - val_loss: 110.1813 - learning_rate: 0.0020
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - loss: 115.6129 - val_loss: 124.3136 - learning_rate: 0.0020
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - loss: 112.5335 - val_loss: 109.3110 - learning_rate: 0.0020
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - loss: 103.8613 - val_loss: 115.9606 - learning_rate: 0.0020
Epoch 7/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [15]:
from google.colab import files

# Download the predictions CSV file
files.download('neural_network_predictions_optimized.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>