In [11]:
import import_ipynb
import preprocess_model as pre

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam

In [13]:
# Short alias for the DataFrame exposed by the imported preprocessing notebook.
# In the cells visible here it is only used to count the distinct values of each
# categorical column when sizing the embedding layers.
df = pre.df_fnn


# Define the learning rate


In [None]:
# Step size passed to the Adam optimizer when the model is compiled.
learning_rate = 1e-4


# Define the model


In [None]:
# Build one one-element input per column listed in pre.Country. Each input goes
# through its own 10-dimensional embedding, which is then flattened so it can be
# concatenated with the other features.
input_layers = []
embedding_layers = []

for column in pre.Country:
    vocab_size = len(df[column].unique())  # number of distinct values in this column
    categorical_in = Input(shape=(1,), name=column)
    embedded = Embedding(input_dim=vocab_size, output_dim=10)(categorical_in)
    input_layers.append(categorical_in)
    embedding_layers.append(Flatten()(embedded))

# One additional input whose width equals the number of columns in pre.X_col.
# NOTE: the name `input` shadows the Python builtin; it is kept because the
# model-construction cell refers to it by this name.
input = Input(shape=(len(pre.X_col),), name='input')
embedding_layers.append(input)



# Concatenate all input layers


In [None]:
# Merge every flattened embedding and the trailing `input` tensor into a single
# feature tensor for the dense layers.
merge_layer = Concatenate()
concatenated = merge_layer(embedding_layers)



# Fully connected layers with dropout for regularization


In [None]:
# Hidden stack described as (units, dropout rate applied after the layer).
# A rate of None means the layer is not followed by dropout.
hidden_spec = [(256, 0.3), (128, 0.2), (64, 0.2), (32, None)]

x = concatenated
for units, drop_rate in hidden_spec:
    x = Dense(units, activation='relu')(x)
    if drop_rate is not None:
        x = Dropout(drop_rate)(x)

# Single linear unit that emits the model's prediction.
output_layer = Dense(1, activation='linear', name='output')(x)



# Model


In [None]:
# Inputs are ordered as: one input per pre.Country column, then the `input`
# tensor. The feature lists passed to fit/evaluate/predict use the same order.
model = Model(inputs=[*input_layers, input], outputs=output_layer)



# Compile the model with Mean Squared Error loss and the specified learning rate


In [None]:
# Regression setup: mean squared error loss on the single linear output unit.
# Use the `learning_rate` keyword; `lr` is a deprecated alias that current
# Keras optimizers no longer accept.
model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mean_squared_error')



# Train the model for 200 epochs (batch size 64), monitoring loss on the validation split


In [None]:
def _as_model_inputs(frame):
    """Return one entry per pre.Country column of `frame`, followed by its pre.X_col block."""
    return [frame[country] for country in pre.Country] + [frame[pre.X_col]]


# Fit on the training split and track loss on the validation split each epoch.
history = model.fit(
    _as_model_inputs(pre.X_train),
    pre.y_train,
    validation_data=(_as_model_inputs(pre.X_val), pre.y_val),
    epochs=200,
    batch_size=64,
    verbose=1,
)



# Evaluate the model on the test set


In [None]:
# Assemble the test inputs in the same order as the model's inputs, then report
# the loss (mean squared error, as configured at compile time) on the test split.
test_inputs = [pre.X_test[name] for name in pre.Country]
test_inputs.append(pre.X_test[pre.X_col])
test_loss = model.evaluate(test_inputs, pre.y_test)
print(f'Test Loss: {test_loss}')



# Plot the training history


In [None]:
# Training and validation loss per epoch on a single set of axes.
fig, ax = plt.subplots()
for key, label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[key], label=label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Mean Squared Error')
ax.legend()
plt.show()

## Make predictions on the test set and compute error metrics


In [None]:
# test = ['School enrollment, primary', 'School enrollment, tertiary', 'Primary completion rate', 'Year', 'Country']
# # print(df[test])
# new_data = pd.DataFrame({
#     'School enrollment, primary': [100.911263,100.911262512207],
#     'School enrollment, tertiary': [28.845509  ,28.8455085754395],
#     'Primary completion rate': [96.636715, 96.63671493530276],
#     'Year': [2023,2023],
#     'Country': ["Afghanistan", "Japan"],
# })

# # 100.911262512207,28.8455085754395,96.63671493530276
# # Preprocess the new data similar to the training data
# # Standardize numerical features
# new_data[pre.X_col] = pre.scaler.transform(new_data[pre.X_col])

# # Preprocess
# for country in pre.Country:
#     new_data[country] = pre.label_encoders[country].transform(new_data[country])

# # # Make predictions on the new data
# predictions_future = model.predict([new_data[country] for country in pre.Country] + [new_data[pre.X_col]])
# print(predictions_future)

# Predict on the test split; inputs follow the model's input order
# (one entry per pre.Country column, then the pre.X_col block).
predictions = model.predict(
    [*(pre.X_test[name] for name in pre.Country), pre.X_test[pre.X_col]]
)
# print(pre.y_test)
print(predictions)
# import seaborn as sns

# sns.kdeplot(pre.y_test, label='Actual', shade=True)
# sns.kdeplot(predictions.flatten(), label='Predicted', shade=True)
# plt.xlabel('Literacy rate')
# plt.ylabel('Literacy rate')
# plt.title('Distribution of Actual vs. Predicted Literacy rate')
# plt.show()
# residuals = pre.y_test - predictions.flatten()

# plt.scatter(pre.y_test, residuals)
# plt.axhline(y=0, color='red', linestyle='--')
# plt.xlabel('Actual Literacy rate')
# plt.ylabel('Residuals')
# plt.title('Residual Plot')
# plt.show()


# Score the test-set predictions with both error metrics in one pass.
mse, mae = (
    metric(pre.y_test, predictions)
    for metric in (mean_squared_error, mean_absolute_error)
)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
