In [11]:
import os

from huggingface_hub import login
from transformers import AutoTokenizer, AutoModel

# Authenticate against the Hugging Face Hub with a token taken from the
# environment; credentials must never be embedded in a notebook. Any token
# that has ever been saved inside a shared notebook should be treated as
# leaked and revoked in the Hub account settings.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before running this cell."
    )
login(token=hf_token)

# Gated checkpoint: the authenticated account needs access to this repository.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.36s/it]


In [12]:
def design_prompt_classification(country):
    """Build the prompt used to probe the model for a country's HDI group.

    Args:
        country: Value interpolated at the end of the prompt.

    Returns:
        The prompt string; `country` is its final piece of text.
    """
    prompt = f"Tell me about the Human Development Groups (Low, Medium, High, Very High) of {country}"
    return prompt

def get_hidden_states(prompt):
    """Return the last-token hidden state of `prompt` at three depths.

    Uses the notebook-level `tokenizer`, `model` and `torch` names, which
    must exist by the time this function is called.

    Args:
        prompt: Text to run through the model.

    Returns:
        A tuple `(first, middle, final)` of NumPy arrays taken from entry 0,
        entry `len(hidden_states) // 2` and the last entry of
        `output.hidden_states`, each indexed at `[0, -1]`.
        Assuming the usual (batch, sequence, hidden) layout, that is the
        vector of the final token of the single prompt in the batch.
    """
    # Put the inputs on the model's device so the forward pass also works
    # when the model has been moved to a GPU.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

    with torch.no_grad():
        output = model(**inputs, output_hidden_states=True)
    hidden_states = output.hidden_states

    def _last_token_vector(layer_index):
        # Bring the tensor back to host memory before converting: calling
        # .numpy() directly raises for tensors that live on an accelerator.
        return hidden_states[layer_index][0, -1].detach().cpu().numpy()

    # NOTE(review): by the Transformers convention entry 0 is the output of
    # the embedding layer, so the "first" vector presumably depends only on
    # the prompt's final token rather than on its context - confirm this is
    # the intended probe.
    middle_layer_index = len(hidden_states) // 2

    return (
        _last_token_vector(0),
        _last_token_vector(middle_layer_index),
        _last_token_vector(-1),
    )

In [13]:
import pandas as pd
from transformers import pipeline
import torch
import numpy as np

# Human Development Index table from the Kaggle input mount.
file_path = "/kaggle/input/human-development-index-dataset/Human Development Index - Full.csv"
df = pd.read_csv(file_path)

# Classification target, one row per entry of the source table.
labels_df = df[['Country', 'Human Development Groups']]

# Per-country vectors for each probed depth, keyed by country name.
first_layer_embeddings, middle_layer_embeddings, final_layer_embeddings = {}, {}, {}

for country in df['Country']:
    prompt = design_prompt_classification(country)
    first_vec, middle_vec, final_vec = get_hidden_states(prompt)
    first_layer_embeddings[country] = first_vec
    middle_layer_embeddings[country] = middle_vec
    final_layer_embeddings[country] = final_vec

In [14]:
def flatten_embedding(embedding):
    """Collapse `embedding` to one dimension via its own `flatten()` method.

    Args:
        embedding: Any object exposing `flatten()` (e.g. a NumPy array).

    Returns:
        Whatever `embedding.flatten()` returns.
    """
    flattened = embedding.flatten()
    return flattened

# Flattened counterpart of every stored embedding, keyed by the same names.
first_layer_flat_embeddings = dict(
    (name, flatten_embedding(vector)) for name, vector in first_layer_embeddings.items()
)
middle_layer_flat_embeddings = dict(
    (name, flatten_embedding(vector)) for name, vector in middle_layer_embeddings.items()
)
final_layer_flat_embeddings = dict(
    (name, flatten_embedding(vector)) for name, vector in final_layer_embeddings.items()
)

def embeddings_to_dataframe(embeddings_dict, layer_name):
    """Turn a `{country: vector}` mapping into a wide feature table.

    Args:
        embeddings_dict: Mapping from country name to a 1-D sequence of
            feature values; every vector becomes one row.
        layer_name: Prefix used for the generated feature column names.

    Returns:
        A DataFrame whose first column is `Country` (the mapping keys),
        followed by `{layer_name}_Feature_{i}` columns, one per vector
        component.
    """
    frame = pd.DataFrame.from_dict(embeddings_dict, orient='index')
    # Take the feature count from the data instead of assuming a hidden size
    # of 4096, so embeddings of any width can be tabulated.
    feature_columns = [f'{layer_name}_Feature_{i}' for i in range(frame.shape[1])]
    frame = frame.reset_index()
    frame.columns = ['Country'] + feature_columns
    return frame

# Build the per-layer feature tables from the flattened vectors. The
# flattened dictionaries were being computed but never consumed; feeding
# them in guarantees each country contributes exactly one 1-D row.
first_layer_df = embeddings_to_dataframe(first_layer_flat_embeddings, 'FirstLayer')
middle_layer_df = embeddings_to_dataframe(middle_layer_flat_embeddings, 'MiddleLayer')
final_layer_df = embeddings_to_dataframe(final_layer_flat_embeddings, 'FinalLayer')

In [15]:
# Country -> development-group labels taken from the source table.
labels_df = df[['Country', 'Human Development Groups']]

# Left-join the labels onto each layer's feature table so every embedded
# country keeps its row.
first_layer_combined_df, middle_layer_combined_df, final_layer_combined_df = (
    pd.merge(layer_df, labels_df, on='Country', how='left')
    for layer_df in (first_layer_df, middle_layer_df, final_layer_df)
)

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def prepare_and_train(df_combined, random_state=0):
    """Train a random forest on one layer's features and report accuracy.

    Args:
        df_combined: DataFrame holding a `Country` column, a
            `Human Development Groups` label column and feature columns.
        random_state: Seed shared by the train/test split and the forest.
            The default keeps the split identical to the previous
            hard-coded seed and makes the classifier reproducible.

    Returns:
        Accuracy of the fitted classifier on the held-out 30% of the rows.
    """
    X = df_combined.drop(columns=['Country', 'Human Development Groups']).values
    y_encoded = LabelEncoder().fit_transform(df_combined['Human Development Groups'])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.3, random_state=random_state
    )

    # An unseeded forest gives a different accuracy on every run, which makes
    # the layer-to-layer comparison impossible to reproduce.
    classifier = RandomForestClassifier(random_state=random_state)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Fit one classifier per probed layer, in first -> middle -> final order.
first_layer_accuracy = prepare_and_train(first_layer_combined_df)
middle_layer_accuracy = prepare_and_train(middle_layer_combined_df)
final_layer_accuracy = prepare_and_train(final_layer_combined_df)

# Report the held-out accuracies, one line per layer.
accuracy_report = [
    f'First Layer Accuracy: {first_layer_accuracy:.2f}',
    f'Middle Layer Accuracy: {middle_layer_accuracy:.2f}',
    f'Final Layer Accuracy: {final_layer_accuracy:.2f}',
]
for report_line in accuracy_report:
    print(report_line)

First Layer Accuracy: 0.47
Middle Layer Accuracy: 0.68
Final Layer Accuracy: 0.64


In [17]:
def design_prompt_regression(country):
    """Build the prompt used to probe the model for a country's 2021 HDI value.

    Args:
        country: Value interpolated into the prompt before the year suffix.

    Returns:
        The prompt string.
    """
    prompt = f"Tell me about the Human Development Index value of {country} for 2021"
    return prompt

In [18]:
# Start from empty stores so the regression-prompt vectors replace the
# classification-prompt ones.
first_layer_embeddings, middle_layer_embeddings, final_layer_embeddings = {}, {}, {}

for country in df['Country']:
    prompt = design_prompt_regression(country)
    first_vec, middle_vec, final_vec = get_hidden_states(prompt)
    first_layer_embeddings[country] = first_vec
    middle_layer_embeddings[country] = middle_vec
    final_layer_embeddings[country] = final_vec

# Flatten every stored vector so each country maps to a single 1-D row.
first_layer_flat_embeddings = {key: flatten_embedding(val) for key, val in first_layer_embeddings.items()}
middle_layer_flat_embeddings = {key: flatten_embedding(val) for key, val in middle_layer_embeddings.items()}
final_layer_flat_embeddings = {key: flatten_embedding(val) for key, val in final_layer_embeddings.items()}

# Tabulate the flattened vectors; the flattened dictionaries were being
# built and then ignored in favour of the raw ones.
first_layer_df = embeddings_to_dataframe(first_layer_flat_embeddings, 'FirstLayer')
middle_layer_df = embeddings_to_dataframe(middle_layer_flat_embeddings, 'MiddleLayer')
final_layer_df = embeddings_to_dataframe(final_layer_flat_embeddings, 'FinalLayer')

In [19]:
# Regression target: the 2021 HDI value of each country.
hdi_column = 'Human Development Index (2021)'
regression_df = df[['Country', hdi_column]]

# Attach the target to each layer's features, then discard the rows whose
# target is missing.
first_layer_combined_df, middle_layer_combined_df, final_layer_combined_df = (
    pd.merge(layer_df, regression_df, on='Country', how='left').dropna(subset=[hdi_column])
    for layer_df in (first_layer_df, middle_layer_df, final_layer_df)
)

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

def prepare_and_train_regression(df_combined):
    """Fit a linear regression on one layer's features and score it.

    Args:
        df_combined: DataFrame holding a `Country` column, a
            `Human Development Index (2021)` target column and feature columns.

    Returns:
        Tuple `(mse, r2)` computed on the held-out 30% of the rows
        (split seeded with `random_state=0`).
    """
    target_column = 'Human Development Index (2021)'
    features = df_combined.drop(columns=['Country', target_column]).values
    targets = df_combined[target_column]

    features_train, features_test, targets_train, targets_test = train_test_split(
        features, targets, test_size=0.3, random_state=0
    )

    regressor = LinearRegression()
    regressor.fit(features_train, targets_train)
    predictions = regressor.predict(features_test)

    return mean_squared_error(targets_test, predictions), r2_score(targets_test, predictions)

# Fit one regressor per probed layer, in first -> middle -> final order.
first_layer_mse, first_layer_r2 = prepare_and_train_regression(first_layer_combined_df)
middle_layer_mse, middle_layer_r2 = prepare_and_train_regression(middle_layer_combined_df)
final_layer_mse, final_layer_r2 = prepare_and_train_regression(final_layer_combined_df)

# Errors first, then goodness of fit, one line per layer.
regression_report = [
    f'First Layer MSE: {first_layer_mse:.2f}',
    f'Middle Layer MSE: {middle_layer_mse:.2f}',
    f'Final Layer MSE: {final_layer_mse:.2f}',
    f'First Layer R2 Score: {first_layer_r2:.2f}',
    f'Middle Layer R2 Score: {middle_layer_r2:.2f}',
    f'Final Layer R2 Score: {final_layer_r2:.2f}',
]
for report_line in regression_report:
    print(report_line)

First Layer MSE: 0.02
Middle Layer MSE: 0.00
Final Layer MSE: 0.00
First Layer R2 Score: -0.00
Middle Layer R2 Score: 0.93
Final Layer R2 Score: 0.85
