In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import accelerate

from tqdm import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [58]:
dataset = pd.read_csv('../data/16k_Movies.csv')
dataset.drop(columns=['Unnamed: 0'], inplace=True)
dataset.dropna(inplace=True)
dataset.drop_duplicates(subset=['Title'], inplace=True)
dataset.reset_index(drop=True, inplace=True)
dataset = dataset.iloc[:100]
dataset

Unnamed: 0,Title,Release Date,Description,Rating,No of Persons Voted,Directed by,Written by,Duration,Genres
0,Dekalog (1988),"Mar 22, 1996",This masterwork by Krzysztof Kieślowski is one...,7.4,118,Krzysztof Kieslowski,"Krzysztof Kieslowski, Krzysztof Piesiewicz",9 h 32 m,Drama
1,Three Colors: Red,"Nov 23, 1994",Krzysztof Kieslowski closes his Three Colors t...,8.3,241,Krzysztof Kieslowski,"Krzysztof Kieslowski, Krzysztof Piesiewicz, Ag...",1 h 39 m,"Drama,Mystery,Romance"
2,The Conformist,"Oct 22, 1970","Set in Rome in the 1930s, this re-release of B...",7.3,106,Bernardo Bertolucci,"Alberto Moravia, Bernardo Bertolucci",1 h 47 m,Drama
3,Tokyo Story,"Mar 13, 1972",Yasujiro Ozu’s Tokyo Story follows an aging co...,8.1,147,Yasujirô Ozu,"Kôgo Noda, Yasujirô Ozu",2 h 16 m,Drama
4,The Leopard (re-release),"Aug 13, 2004","Set in Sicily in 1860, Luchino Visconti's spec...",7.8,85,Luchino Visconti,"Giuseppe Tomasi di Lampedusa, Suso Cecchi D'Am...",3 h 7 m,"Drama,History"
...,...,...,...,...,...,...,...,...,...
95,Sita Sings the Blues,"Dec 25, 2009",Sita is a goddess separated from her beloved L...,7.2,48,Nina Paley,"Nina Paley, Valmiki",1 h 22 m,"Animation,Comedy,Fantasy,Musical,Romance"
96,Badlands,"Oct 15, 1973",Loosely based on the Starkweather-Fugate killi...,8.3,178,Terrence Malick,Terrence Malick,1 h 34 m,"Action,Crime,Drama"
97,Sankofa,"May 28, 1993",Mona (Oyafunmike Ogunlano) is a Black American...,5.9,7,Haile Gerima,Haile Gerima,2 h 5 m,Drama
98,No Bears,"Dec 23, 2022",Two parallel love stories in which the partner...,7.5,22,Jafar Panahi,Jafar Panahi,1 h 46 m,"Drama,Romance"


In [59]:
dataset.columns

Index(['Title', 'Release Date', 'Description', 'Rating', 'No of Persons Voted',
       'Directed by', 'Written by', 'Duration', 'Genres'],
      dtype='object')

In [64]:
for i, director in enumerate(dataset['Directed by']):
    dataset.loc[i, 'Directed by'] = director.strip()
    if '\n' in director:
        dataset.loc[i, 'Directed by'] = director.split('\n')[0].strip()

In [65]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=['Directed by']), dataset['Directed by'], test_size=.2)

X_train = X_train.astype(str)
X_test = X_test.astype(str)
y_train = y_train.astype(str)
y_test = y_test.astype(str)

In [66]:
print(f'No. of training samples: {len(X_train)}')
print(f'No. of testing samples: {len(X_test)}')
print(f'No. of features: {X_train.shape[1]}')
print(f'Features: {X_train.columns.to_list()}')
print(f'No. of classes: {len(y_train.unique())}')
print(f'Classes: {y_train.unique()}')

No. of training samples: 80
No. of testing samples: 20
No. of features: 8
Features: ['Title', 'Release Date', 'Description', 'Rating', 'No of Persons Voted', 'Written by', 'Duration', 'Genres']
No. of classes: 70
Classes: ['Jean-Pierre Melville' 'Hirokazu Koreeda' 'Martin Scorsese'
 'Marcel Carné' 'Julien Duvivier' 'Laura Citarella' 'Noah Baumbach'
 'Haile Gerima' 'Tom McCarthy' 'Andrew Haigh' 'Damien Chazelle'
 'Gillo Pontecorvo' 'Terrence Malick' 'Michael Haneke' 'Fritz Lang'
 'Alfonso Cuarón' 'Isao Takahata' 'Yasujirô Ozu' 'Céline Sciamma'
 'Christopher Nolan' 'Krzysztof Kieslowski' 'Jennifer Peedom'
 'Barbet Schroeder' 'Charlotte Wells' 'Peter Jackson' 'Ethan Coen,'
 'Luchino Visconti' 'Bong Joon Ho' 'George Lucas' 'Spike Lee'
 'Charles Burnett' 'Steve McQueen' 'Greta Gerwig' 'Kathryn Bigelow'
 'Nicolas Roeg' 'Kar-Wai Wong' 'Luca Guadagnino' 'Radu Jude' 'JR,'
 'David Lean' 'Paul Thomas Anderson' 'Brad Bird,' 'Steven Spielberg'
 'Olivier Assayas' 'Ang Lee' 'Claire Simon' 'Jules Dass

In [98]:
# y_train = y_train.astype(int).to_numpy()
# y_test = y_test.astype(int).to_numpy()

In [3]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_use_double_quant=True,
    bnb_8bit_quant_type='nf8',  # Can be 'nf4' or 'fp4'
    bnb_8bit_compute_dtype=torch.bfloat16  # Adjust compute type if needed
)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map='auto',
    quantization_config=quantization_config,
    output_hidden_states=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear8bitLt(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [7]:
def get_model_size(model):
    total_params = sum(p.numel() for p in model.parameters())
    total_size_bytes = total_params * 2
    total_size_gb = total_size_bytes / (1024 ** 3)
    print(f"Model size: {total_size_gb:.2f} GB")

get_model_size(model)

Model size: 14.96 GB


In [8]:
model.device

device(type='cuda', index=0)

In [9]:
def prompt_llm(prompt, max_tokens=20):
    messages = [
        {"role": "system", "content": "You are chatbot that tells who is the director of the given film."},
        {"role": "user", "content": prompt},
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=max_tokens,
            eos_token_id=terminators,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            output_hidden_states=True,  # Enable hidden states
            return_dict_in_generate=True  # Ensure hidden states are included in the output
        )

    generated_tokens = outputs.sequences[0][input_ids.shape[-1]:]
    
    hidden_states = outputs.hidden_states  # Shape: (num_layers, batch_size, seq_len, hidden_size)
    num_layers = len(hidden_states)
    # return hidden_states
    
    # Last Token Embedding
    first_layer_hidden_state = hidden_states[-1][0][:, -1, :].squeeze().cpu().numpy()
    middle_layer_hidden_state = hidden_states[-1][num_layers // 2][:, -1, :].squeeze().cpu().numpy()
    last_layer_hidden_state = hidden_states[-1][-1][:, -1, :].squeeze().cpu().numpy()

    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    del input_ids
    torch.cuda.empty_cache()

    return {
        'prompt': prompt,
        'response': response_text,
        'first_layer_embedding': first_layer_hidden_state,  # Embedding from the first layer
        'middle_layer_embedding': middle_layer_hidden_state,  # Embedding from the middle layer
        'last_layer_embedding': last_layer_hidden_state  # Embedding from the last layer
    }

In [69]:
dataset.iloc[10]

Title                                                    Army of Shadows
Release Date                                                Apr 28, 2006
Description            Making its U.S. debut, Jean-Pierre Melville's ...
Rating                                                               7.7
No of Persons Voted                                                   67
Directed by                                         Jean-Pierre Melville
Written by                           Joseph Kessel, Jean-Pierre Melville
Duration                                                        2 h 25 m
Genres                                                         Drama,War
Name: 10, dtype: object

In [71]:
prompt_llm(f"Who is the director of the film '{dataset.iloc[10]['Title']}' released on {dataset.iloc[10]['Release Date']}?", max_tokens=40)['response']

"The director of the film 'Army of Shadows' released on Apr 28, 2006 is Jean-Pierre Melville."

In [None]:
for x in X_train.iterrows():
    prompt = f"{', '.join([': '.join(i) for i in zip(x[1].index, x[1].values)])}"
    output = prompt_llm(prompt)
    print(output)
    print(len(output))
    print(len(output[0]))
    print(output[0][0].shape)
    break

In [76]:
train_embeddings_first = []
train_embeddings_middle = []
train_embeddings_last = []
train_prompts = []
train_responses = []

for x in tqdm(X_train.iterrows(), total=len(X_train), desc='Generating embeddings for training data'):
    prompt = f"Who is the director of the film '{x[1]['Title']}' released on {x[1]['Release Date']}?"
    output = prompt_llm(prompt)
    train_prompts.append(output['prompt'])
    train_responses.append(output['response'])
    train_embeddings_first.append(output['first_layer_embedding'])
    train_embeddings_middle.append(output['middle_layer_embedding'])
    train_embeddings_last.append(output['last_layer_embedding'])

train_embeddings_first = np.array(train_embeddings_first)
train_embeddings_middle = np.array(train_embeddings_middle)
train_embeddings_last = np.array(train_embeddings_last)

Generating embeddings for training data: 100%|██████████| 80/80 [05:04<00:00,  3.81s/it]


In [77]:
test_embeddings_first = []
test_embeddings_middle = []
test_embeddings_last = []
test_prompts = []
test_responses = []

for x in tqdm(X_test.iterrows(), total=len(X_test), desc='Generating embeddings for testing data'):
    prompt = f"Who is the director of the film '{x[1]['Title']}' released on {x[1]['Release Date']}?"
    output = prompt_llm(prompt)
    test_prompts.append(output['prompt'])
    test_responses.append(output['response'])
    test_embeddings_first.append(output['first_layer_embedding'])
    test_embeddings_middle.append(output['middle_layer_embedding'])
    test_embeddings_last.append(output['last_layer_embedding'])

test_embeddings_first = np.array(test_embeddings_first)
test_embeddings_middle = np.array(test_embeddings_middle)
test_embeddings_last = np.array(test_embeddings_last)

Generating embeddings for testing data: 100%|██████████| 20/20 [01:16<00:00,  3.80s/it]


In [78]:
# cl_model_first = LogisticRegression(max_iter=1000)
# cl_model_middle = LogisticRegression(max_iter=1000)
# cl_model_last = LogisticRegression(max_iter=1000)

cl_model_first = RandomForestClassifier()
cl_model_middle = RandomForestClassifier()
cl_model_last = RandomForestClassifier()

In [86]:
# y_train = y_train.to_numpy(dtype='object')
# y_test = y_test.to_numpy(dtype='object')

AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [87]:
cl_model_first.fit(train_embeddings_first, y_train)
cl_model_middle.fit(train_embeddings_middle, y_train)
cl_model_last.fit(train_embeddings_last, y_train)

In [88]:
y_pred_train_first = cl_model_first.predict(train_embeddings_first)
y_pred_train_middle = cl_model_middle.predict(train_embeddings_middle)
y_pred_train_last = cl_model_last.predict(train_embeddings_last)

y_train, y_pred_train_first, y_pred_train_middle, y_pred_train_last

(array(['Jean-Pierre Melville', 'Hirokazu Koreeda', 'Martin Scorsese',
        'Marcel Carné', 'Julien Duvivier', 'Laura Citarella',
        'Noah Baumbach', 'Haile Gerima', 'Tom McCarthy', 'Andrew Haigh',
        'Damien Chazelle', 'Gillo Pontecorvo', 'Terrence Malick',
        'Michael Haneke', 'Fritz Lang', 'Alfonso Cuarón',
        'Terrence Malick', 'Isao Takahata', 'Yasujirô Ozu',
        'Céline Sciamma', 'Christopher Nolan', 'Krzysztof Kieslowski',
        'Jennifer Peedom', 'Barbet Schroeder', 'Charlotte Wells',
        'Peter Jackson', 'Ethan Coen,', 'Luchino Visconti', 'Bong Joon Ho',
        'George Lucas', 'Spike Lee', 'Charles Burnett', 'Laura Citarella',
        'Steve McQueen', 'Greta Gerwig', 'Kathryn Bigelow',
        'Kathryn Bigelow', 'Nicolas Roeg', 'Kar-Wai Wong',
        'Luca Guadagnino', 'Radu Jude', 'Krzysztof Kieslowski', 'JR,',
        'David Lean', 'Paul Thomas Anderson', 'Alfonso Cuarón',
        'Brad Bird,', 'Steven Spielberg', 'Olivier Assayas', 'Ang Le

In [89]:
print(f'First Layer Accuracy: {accuracy_score(y_train, y_pred_train_first)}')
print(f'Middle Layer Accuracy: {accuracy_score(y_train, y_pred_train_middle)}')
print(f'Last Layer Accuracy: {accuracy_score(y_train, y_pred_train_last)}')

print(f'First Layer F1 Score: {f1_score(y_train, y_pred_train_first, average="weighted")}')
print(f'Middle Layer F1 Score: {f1_score(y_train, y_pred_train_middle, average="weighted")}')
print(f'Last Layer F1 Score: {f1_score(y_train, y_pred_train_last, average="weighted")}')

First Layer Accuracy: 0.45
Middle Layer Accuracy: 1.0
Last Layer Accuracy: 1.0
First Layer F1 Score: 0.3620436507936508
Middle Layer F1 Score: 1.0
Last Layer F1 Score: 1.0


In [90]:
y_pred_test_first = cl_model_first.predict(test_embeddings_first)
y_pred_test_middle = cl_model_middle.predict(test_embeddings_middle)
y_pred_test_last = cl_model_last.predict(test_embeddings_last)

y_test, y_pred_test_first, y_pred_test_middle, y_pred_test_last

(array(['Guillermo del Toro', 'David Fincher', 'John Lasseter',
        'Alexander Nanau', 'Michael Apted', 'Jasmila Zbanic', 'Raoul Peck',
        'Thien An Pham', 'Chantal Akerman', 'Luis Buñuel', 'Robert Altman',
        'Francis Ford Coppola', 'Peter Bogdanovich', 'Jim Sheridan',
        'Alexander Payne', 'Kenneth Lonergan', 'Jafar Panahi',
        'Aleksey German', 'Benny Safdie,', 'Nina Paley'], dtype=object),
 array(['Hayao Miyazaki', 'François Truffaut', 'Richard Linklater',
        'Richard Linklater', 'Claire Simon', 'Mike Leigh', 'Fritz Lang',
        'Cristian Mungiu', 'Marcel Carné', 'Cristian Mungiu', 'Mike Leigh',
        'Kathryn Bigelow', 'Bernardo Bertolucci', 'Kathryn Bigelow',
        'Richard Linklater', 'Steve McQueen', 'Richard Linklater',
        'Fritz Lang', 'Kathryn Bigelow', 'Marcel Carné'], dtype=object),
 array(['Krzysztof Kieslowski', 'Kathryn Bigelow', 'Terrence Malick',
        'JR,', 'Luca Guadagnino', 'Ingmar Bergman', 'Ethan Coen,',
        'Laura C

In [83]:
print(f'First Layer Accuracy: {accuracy_score(y_test, y_pred_test_first)}')
print(f'Middle Layer Accuracy: {accuracy_score(y_test, y_pred_test_middle)}')
print(f'Last Layer Accuracy: {accuracy_score(y_test, y_pred_test_last)}')

print(f'First Layer F1 Score: {f1_score(y_test, y_pred_test_first, average="weighted")}')
print(f'Middle Layer F1 Score: {f1_score(y_test, y_pred_test_middle, average="weighted")}')
print(f'Last Layer F1 Score: {f1_score(y_test, y_pred_test_last, average="weighted")}')

First Layer Accuracy: 0.0
Middle Layer Accuracy: 0.0
Last Layer Accuracy: 0.0
First Layer F1 Score: 0.0
Middle Layer F1 Score: 0.0
Last Layer F1 Score: 0.0
