In [2]:
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Function to preprocess data
def preprocess_data(data, categorical_columns, numerical_columns, encoders, scaler):
    # Process datetime columns (example: 'Policy Start Date')
    if 'Policy Start Date' in data.columns:
        data['Policy Start Date'] = pd.to_datetime(data['Policy Start Date'])
        data['policy_year'] = data['Policy Start Date'].dt.year
        data['policy_month'] = data['Policy Start Date'].dt.month
        data['policy_day'] = data['Policy Start Date'].dt.day
        data.drop(columns=['Policy Start Date'], inplace=True)

    # Fill missing values
    data[categorical_columns] = data[categorical_columns].fillna('missing')
    data[numerical_columns] = data[numerical_columns].apply(lambda col: col.fillna(col.mean()))

    # Apply ordinal encoding
    for col, encoder in encoders.items():
        data[col] = encoder.transform(data[[col]])

    # Scale numerical columns
    data[numerical_columns] = scaler.transform(data[numerical_columns])

    return data

# Load data
data = pd.read_csv("train.csv").drop("id", axis=1)

# Define columns
categorical_columns = ['Gender', 'Education Level', 'Occupation', 'Location', 'Customer Feedback',
                       'Marital Status', 'Exercise Frequency', 'Smoking Status', 'Property Type', 'Policy Type']
numerical_columns = [col for col in data.columns if col not in categorical_columns + ['Premium Amount']]

# Define ordinal categories
ordinal_categories = [
    ['PhD', "Master's", "Bachelor's", 'High School'],
    ['Basic', 'Comprehensive', 'Premium'],
    ['missing', 'Poor', 'Average', 'Good'],
    ['Condo', 'Apartment', 'House'],
    ['missing', 'Employed', 'Self-Employed', 'Unemployed'],
    ['Daily', 'Weekly', 'Monthly', 'Rarely'],
    ['Urban', 'Suburban', 'Rural'],
    ['missing', 'Married', 'Divorced', 'Single'],
    ['No', 'Yes'],
    ['Female', 'Male']
]

# Initialize encoders
encoders = {}
for col, categories in zip(categorical_columns, ordinal_categories):
    encoders[col] = OrdinalEncoder(categories=[categories], handle_unknown='use_encoded_value', unknown_value=-1)

# Fit encoders on training data
for col, encoder in encoders.items():
    data[col] = encoder.fit_transform(data[[col]])

# Scale numerical columns and target separately
scaler = MinMaxScaler()
premium_scaler = MinMaxScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])
data['Premium Amount'] = premium_scaler.fit_transform(data[['Premium Amount']])

# Split data into training and testing sets
X = data.drop(columns=['Premium Amount'])
y = data['Premium Amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Build the neural network model
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))  # Input layer + hidden layer
model.add(Dense(64, activation='relu'))  # Hidden layer
model.add(Dense(1))  # Output layer (no activation function for regression)

# Compile the model
model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model
y_pred = model.predict(X_test)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
print("RMSLE:", rmsle)

# Load and preprocess test data
test_data = pd.read_csv("test.csv").drop("id", axis=1)
test_data = preprocess_data(test_data, categorical_columns, numerical_columns, encoders, scaler)

# Make predictions
predictions = model.predict(test_data)
predictions = predictions.reshape(-1, 1)
original_predictions = premium_scaler.inverse_transform(predictions)

# Save predictions to a CSV file
output = pd.DataFrame({'id': range(1, len(original_predictions) + 1), 'Premium Amount': original_predictions.flatten()})
output.to_csv('predictions.csv', index=False)

print("Predictions saved to 'predictions.csv'")


ValueError: could not convert string to float: '2023-12-23 15:21:39.134960'