In [19]:
import import_ipynb
import preprocess_model as pre

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam

In [21]:
# Load the data from the CSV file
df = pre.df

In [22]:
# pre.X_col = ['School enrollment, primary', 'School enrollment, tertiary', 'Primary completion rate', 'Year']
# pre.Country = ['pre.Country']
# y_target = 'Literacy rate'

# # Drop rows with missing target values
# df.dropna(subset=[y_target], inplace=True)

In [23]:
# # Split the data into training, validation, and test sets
# pre.X_train, pre.X_temp, pre.y_train, pre.y_temp = train_test_split(df[pre.X_col + pre.Country], df[y_target], test_size=0.4, random_state=42)
# pre.X_val, pre.X_test, pre.y_val, pre.y_test = train_test_split(pre.X_temp, pre.y_temp, test_size=0.5, random_state=42)

# print("Shapes after splitting:")
# print("pre.X_train:", pre.X_train.shape, "pre.y_train:", pre.y_train.shape)
# print("pre.X_val:", pre.X_val.shape, "pre.y_val:", pre.y_val.shape)
# print("pre.X_test:", pre.X_test.shape, "pre.y_test:", pre.y_test.shape)


In [24]:

# # Standardize the numerical features
# pre.scaler = StandardScaler()
# pre.X_train[pre.X_col] = pre.scaler.fit_transform(pre.X_train[pre.X_col])
# pre.X_val[pre.X_col] = pre.scaler.transform(pre.X_val[pre.X_col])
# pre.X_test[pre.X_col] = pre.scaler.transform(pre.X_test[pre.X_col])


In [25]:

# # Preprocess categorical features
# label_encoders = {}
# for country in pre.Country:
#     le = LabelEncoder()
#     pre.X_train[country] = le.fit_transform(pre.X_train[country])
#     pre.X_val[country] = le.transform(pre.X_val[country])
#     pre.X_test[country] = le.transform(pre.X_test[country])
#     label_encoders[country] = le


In [26]:

# Define the learning rate
learning_rate = 0.0001  


In [27]:

# Define the model
input_layers = []
embedding_layers = []

for country in pre.Country:
    input_layer = Input(shape=(1,), name=country)
    embedding_layer = Embedding(input_dim=len(df[country].unique()), output_dim=10)(input_layer)
    flatten_layer = Flatten()(embedding_layer)
    input_layers.append(input_layer)
    embedding_layers.append(flatten_layer)

input = Input(shape=(len(pre.X_col),), name='input')
embedding_layers.append(input)

# Concatenate all input layers
concatenated = Concatenate()(embedding_layers)

# Fully connected layers with dropout for regularization
x = Dense(256, activation='relu')(concatenated)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(32, activation='relu')(x)
output_layer = Dense(1, activation='linear', name='output')(x)

# Model
model = Model(inputs=input_layers + [input], outputs=output_layer)

# Compile the model with Mean Squared Error loss and the specified learning rate
model.compile(optimizer=Adam(lr=learning_rate), loss='mean_squared_error') #soft-max reg

# Train the model for more epochs
history = model.fit([pre.X_train[country] for country in pre.Country] + [pre.X_train[pre.X_col]],
                    pre.y_train, epochs=200, batch_size=64,
                    validation_data=([pre.X_val[country] for country in pre.Country] + [pre.X_val[pre.X_col]], pre.y_val),
                    verbose=1)

# Evaluate the model on the test set
test_loss = model.evaluate([pre.X_test[country] for country in pre.Country] + [pre.X_test[pre.X_col]], pre.y_test)
print(f'Test Loss: {test_loss}')

# Plot the training history
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.show()



IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# Make predictions
test = ['School enrollment, primary', 'School enrollment, tertiary', 'Primary completion rate', 'Year', 'Country']
print(df[test])
new_data = pd.DataFrame({
    'School enrollment, primary': [100.911263,100.911262512207],
    'School enrollment, tertiary': [28.845509  ,28.8455085754395],
    'Primary completion rate': [96.636715, 96.63671493530276],
    'Year': [2023,2023],
    'Country': ["Afghanistan", "Japan"],
    # other columns as needed
})
# 100.911262512207,28.8455085754395,96.63671493530276
# Preprocess the new data similar to the training data
# Standardize numerical features
new_data[pre.X_col] = pre.scaler.transform(new_data[pre.X_col])

# Preprocess categorical features
for country in pre.Country:
    new_data[country] = pre.label_encoders[country].transform(new_data[country])

# # Make predictions on the new data
predictions_future = model.predict([new_data[country] for country in pre.Country] + [new_data[pre.X_col]])
print(predictions_future)

# predictions = model.predict([pre.X_test[country] for country in pre.Country] + [pre.X_test[pre.X_col]])
# print(pre.y_test)
# print(predictions)
# import seaborn as sns

# sns.kdeplot(pre.y_test, label='Actual', shade=True)
# sns.kdeplot(predictions.flatten(), label='Predicted', shade=True)
# plt.xlabel('Literacy rate')
# plt.title('Distribution of Actual vs. Predicted Literacy rate')
# plt.show()
# residuals = pre.y_test - predictions.flatten()

# # plt.scatter(pre.y_test, residuals)
# # plt.axhline(y=0, color='red', linestyle='--')
# # plt.xlabel('Actual Literacy rate')
# # plt.ylabel('Residuals')
# # plt.title('Residual Plot')
# # plt.show()

# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# mse = mean_squared_error(pre.y_test, predictions)
# mae = mean_absolute_error(pre.y_test, predictions)

# r2 = r2_score(pre.y_test, predictions)
# print(f'Mean Squared Error: {mse}')
# print(f'Mean Absolute Error: {mae}')
# print(f'R-squared: {r2}')


      School enrollment, primary  School enrollment, tertiary   
1                     100.911263                    28.845509  \
7                     106.182419                    28.845509   
11                    100.290298                     3.755610   
30                     96.620567                    53.306129   
33                    101.619812                    53.590759   
...                          ...                          ...   
1263                  117.423347                    28.506929   
1273                   97.419968                    21.506201   
1282                  112.737328                     9.135250   
1283                  114.000847                    10.333910   
1306                   80.937019                     8.826330   

      Primary completion rate  Year      Country  
1                   96.636715  2021  Afghanistan  
7                   96.636715  2015  Afghanistan  
11                  96.636715  2011  Afghanistan  
30             

ValueError: y contains previously unseen labels: 'Japan'