In [1]:
%%capture
!pip install transformers
!pip install sentence-transformers
!pip install faiss-cpu
!pip install torch
!pip install datasets
!pip install bitsandbytes

In [7]:
import pandas as pd
import torch
import warnings
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

warnings.filterwarnings('ignore')

In [11]:
# Suppress warnings and install required libraries
import warnings

warnings.filterwarnings('ignore')

# Cell to install dependencies
install_dependencies = """
%%capture
!pip install transformers sentence-transformers faiss-cpu torch datasets bitsandbytes
"""

# Import necessary modules
import pandas as pd
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from datasets import load_dataset, Dataset

In [12]:
# Load and preprocess dataset
dataset_path = "/kaggle/input/dummydataset"
full_dataset = load_dataset(dataset_path)

# Extract and clean the training data
training_data = full_dataset['train']
df = pd.DataFrame(training_data)
cleaned_df = df.dropna(subset=['Age'])
processed_dataset = Dataset.from_pandas(cleaned_df)

# Sample 2% of the data randomly
sample_size = int(len(processed_dataset) * 0.02)
sampled_data = processed_dataset.shuffle(seed=42).select(range(sample_size))

# Display subset size and a sample entry
print(f"Subset training data size: {len(sampled_data)}")
print(sampled_data[0])

Subset training data size: 100
{'Unnamed: 0': 1535, 'Name': 'qTFDaBFm', 'Address': 'swkgswND St, aXjfjnUQDX, CHN 822717', 'Salary': 134390.36, 'DOJ': '2008-11-07', 'DOB': '1971-12-09', 'Age': 52, 'Sex': 'Male', 'Dependents': 4.0, 'HRA': 9906.964, 'DA': 31717.43289537712, 'PF': 19932.93514744525, 'Gross Salary': 156081.82174793188, 'Insurance': 'Both', 'Marital Status': 'Married', 'In Company Years': 15, 'Year of Experience': 31, 'Department': 'IT', 'Position': 'Technical Lead'}


In [13]:
# Import Hugging Face Hub to access the models
from huggingface_hub import login

# Access token for authentication
access_token = "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"

# Load the LLAMA model and tokenizer with 4-bit quantization
llama_model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
llama_model = AutoModelForCausalLM.from_pretrained(
    llama_model_name,
    token=access_token,
    load_in_4bit=True,
    output_hidden_states=True
)
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name, token=access_token)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
# Function to extract embeddings from specified model layers
def get_layer_embeddings(text, model, tokenizer, layer):
    tokenized_input = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokenized_input, output_hidden_states=True)
    all_layers = output.hidden_states
    if not all_layers or layer >= len(all_layers):
        raise ValueError("Invalid layer index or no hidden states returned by the model.")
    return all_layers[layer][0, -1, :].cpu().numpy()  # Return final token's embedding

# Extract embeddings for different layers of interest
target_layers = [0, 16, 31]  # Example layers: first, middle, last
layer_embeddings = {layer: [] for layer in target_layers}

for entry in sampled_data:
    incident_info = (
        f"Name: {entry['Name']} \n"
        f"Address: {entry['Address']} \n"
        f"Salary: {entry['Salary']} \n"
        f"DOJ: {entry['DOJ']} \n"
        f"Age: {entry['Age']} \n"
        f"Sex: {entry['Sex']} \n"
        f"Insurance: {entry['Insurance']} \n"
        f"Marital Status: {entry['Marital Status']} \n"
        f"Position: {entry['Position']} \n"
        f"Sentiment:"
    )
    for layer in target_layers:
        try:
            embedding = get_layer_embeddings(incident_info, llama_model, llama_tokenizer, layer)
            layer_embeddings[layer].append(embedding)
        except ValueError as e:
            print(f"Error extracting embeddings from layer {layer}: {e}")

# Convert embeddings to numpy arrays
for layer in target_layers:
    layer_embeddings[layer] = np.array(layer_embeddings[layer])

In [15]:
# Import libraries for classification and evaluation
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Prepare feature sets and labels for classification
X_features = {layer: torch.tensor(layer_embeddings[layer]) for layer in target_layers}
y_labels = [sample['Sex'] for sample in sampled_data]

# Encode class labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_labels)

# Train Random Forest classifiers for each layer's embeddings
layer_classifiers = {}
for layer in target_layers:
    rf_classifier = RandomForestClassifier()
    rf_classifier.fit(X_features[layer], y_encoded)
    layer_classifiers[layer] = rf_classifier

# Evaluate each classifier
for layer in target_layers:
    y_predicted = layer_classifiers[layer].predict(X_features[layer])
    accuracy = accuracy_score(y_encoded, y_predicted)
    print(f"Accuracy for layer {layer}: {accuracy}")

Accuracy for layer 0: 0.36
Accuracy for layer 16: 1.0
Accuracy for layer 31: 1.0


In [16]:
# Regression models for numerical prediction (e.g., Year)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Extract and preprocess Year data for regression
y_age = [sample['Age'] for sample in sampled_data]
y_numeric = np.array([int(age) for age in y_age])

# Train and evaluate linear regression models
regression_models = {}
for layer in target_layers:
    regressor = LinearRegression()
    regressor.fit(layer_embeddings[layer], y_numeric)
    regression_models[layer] = regressor

# Compute and print Mean Squared Error for each layer
for layer in target_layers:
    y_pred = regression_models[layer].predict(layer_embeddings[layer])
    mse = mean_squared_error(y_numeric, y_pred)
    print(f"Mean Squared Error for layer {layer}: {mse}")

Mean Squared Error for layer 0: 131.8572265625
Mean Squared Error for layer 16: 5.37109375e-05
Mean Squared Error for layer 31: 0.0


**Observations:**

1. Layer 0 shows significantly lower accuracy and higher MSE compared to Layer 16 and Layer 31. This suggests that the embeddings from Layer 0 do not effectively capture or encode the relevant information needed for accurate predictions. The high MSE indicates large errors in predictions when using these embeddings.

2. Layer 16 and Layer 31 both achieve perfect accuracy (1.0) and have very low to zero MSE, indicating that the embeddings at these layers are highly effective in predicting the target fields. The perfect accuracy and minimal MSE suggest that these layers have successfully encoded the necessary information for the task.

**Comparison Across Different Layers**

**1. Initial Layer (Layer 0):**

Performance: Poor
Reason: Initial layers might capture more generic, less task-specific features which are not yet refined or sufficiently informative for accurate predictions.

**2. Mid-Layer (Layer 16):**

Performance: Excellent
Reason: Mid-layers often capture more abstract, task-relevant features as they are a product of both initial transformations and later refinements. This layer seems to be well-tuned for the prediction task, as indicated by perfect accuracy and low MSE.

**3. Final Layer (Layer 31):**

Performance: Excellent
Reason: Final layers are expected to have the most refined embeddings with the best representation of the data for the specific task, leading to perfect accuracy and zero MSE in this case.

**Discussion**
**Findings:**

**1. Encoding Information:**

The significant improvement in performance from Layer 0 to Layers 16 and 31 reflects the LLM's ability to progressively encode and refine information. Initial layers (Layer 0) lack the depth and specificity needed for accurate prediction, while mid and final layers (Layer 16 and Layer 31) appear to encode more relevant and refined features.

**2. Patterns and Anomalies:**

The performance jump from Layer 0 to Layers 16 and 31 suggests a clear learning progression within the model. Initial layers may serve as a foundation, but it's in the intermediate to final layers where complex patterns and relationships are captured. The consistent performance in Layers 16 and 31 indicates that these layers are optimized for the task at hand, showing that they have been effectively trained to encode the target information.

**3. Implications:**

The results indicate that to achieve high accuracy in predictions, embeddings from deeper layers of the model are crucial. This is consistent with the common understanding that deeper layers of neural networks capture more abstract and task-specific representations. The lack of variance in performance between Layer 16 and Layer 31 suggests that, for this specific task, once the embeddings are sufficiently refined, further layers might not substantially alter the performance.