# Sheet 2

In [1]:
import os
from datasets import load_dataset
import pandas as pd

# Load it once to get all available splits
huggingface_dataset = load_dataset("pixelpandacreative/dazzle-0001")

# Now get a list of all available keys (split names)
dataset_keys = list(huggingface_dataset.keys())

dataset = {}
for key in dataset_keys:
    parquet_file = f'/data/notebook_files/{key}.parquet'
    
    if os.path.exists(parquet_file):
        # load the parquet file if it already exists
        dataset[key] = pd.read_parquet(parquet_file)
    else:
        # download the dataset and save as a parquet file if it does not exist
        df = pd.DataFrame(huggingface_dataset[key])
        df.to_parquet(parquet_file)
        dataset[key] = df

print(dataset)

{'train':             public_identifier  \
0                        none   
1                        a-a-   
2                  a-a-bailey   
3                 a-a-cabrera   
4      a-a-de-oliveira-manuel   
...                       ...   
19995        aaron-dominowski   
19996            aaron-dominy   
19997      aaron-don-williams   
19998              aaron-dona   
19999            aaron-donald   

                                         profile_pic_url  \
0      https://static-exp1.licdn.com/sc/h/244xhbkr7g4...   
1      https://media-exp1.licdn.com/dms/image/C4E03AQ...   
2      https://media-exp1.licdn.com/dms/image/C4E03AQ...   
3      https://media-exp1.licdn.com/dms/image/C4E03AQ...   
4      https://media-exp1.licdn.com/dms/image/C4E03AQ...   
...                                                  ...   
19995  https://static-exp1.licdn.com/sc/h/244xhbkr7g4...   
19996  https://media-exp1.licdn.com/dms/image/C4E03AQ...   
19997  https://static-exp1.licdn.com/sc/h/244xhbkr7g4

In [2]:
from pandas import json_normalize

all_fields = pd.DataFrame()

# Print the full structure including all nested columns from ‘dataset’
for key in dataset.keys():
    print(f'--- Structure for {key} ---')
    dataset[key].info(verbose=True)
    
    dicts_df = dataset[key].applymap(lambda x: x if isinstance(x, dict) else {str(x): x})
    
    # Flatten the nested columns and store in a temporary DataFrame
    flattened_df = pd.json_normalize(dicts_df)
    
    # Append the flattened DataFrame to `all_fields`
    all_fields = all_fields.append(flattened_df)

all_fields.to_parquet('/data/notebook_files/all_fields.parquet')

--- Structure for train ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 40 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   public_identifier             20000 non-null  object 
 1   profile_pic_url               20000 non-null  object 
 2   background_cover_image_url    20000 non-null  object 
 3   first_name                    20000 non-null  object 
 4   last_name                     19998 non-null  object 
 5   full_name                     20000 non-null  object 
 6   occupation                    17878 non-null  object 
 7   headline                      18794 non-null  object 
 8   summary                       12592 non-null  object 
 9   country                       20000 non-null  object 
 10  country_full_name             20000 non-null  object 
 11  city                          19326 non-null  object 
 12  state                         13

  all_fields = all_fields.append(flattened_df)


In [3]:
from collections import defaultdict

# Fix indents 
import pyarrow.parquet as pq
import ast

# Specify the Parquet dataset location
file_path = '/data/notebook_files/train.parquet'

# Create a ParquetDataset Press command achy instance
dataset = pq.ParquetDataset(file_path)

# Initiate an empty dictionary for flattened records
flattened_records = defaultdict(list)

# Use per-row logic to flatten the experiences column
for row in dataset.read().to_pandas().itertuples(index=False):
    experiences = ast.literal_eval(row.experiences) if type(row.experiences) == str else row.experiences

if experiences.any():  # Check if the experiences list is not empty
 for exp in experiences:
            # Copy non-experience fields
  for key, value in row._asdict().items():
    if key != 'experiences':
     flattened_records[key].append(value)

    # Flatten experience fields
    for exp_key, exp_val in exp.items():
            flattened_records[f'experiences_{exp_key}'].append(exp_val)

                    # Handle missing 'experiences_ends_at'
            if 'ends_at' not in exp:
                        flattened_records['experiences_ends_at'].append('present')
            else:  # If the experiences list is empty, still copy the non-experience fields
              for key, value in row._asdict().items():
                    if key != 'experiences':
                        flattened_records[key].append(value)
                    # Add placeholders for experience fields
                    flattened_records['experiences_company'].append(None)
                    flattened_records['experiences_title'].append(None)
                    flattened_records['experiences_starts_at'].append(None)
                    flattened_records['experiences_ends_at'].append(None)

# Write the dictionary directly to Parquet using PyArr

In [4]:
from transformers import pipeline
import pandas as pd
import numpy as np

# Specify the Parquet dataset location
file_path = '/data/notebook_files/train.parquet'

# Load the dataset into a DataFrame 
df = pd.read_parquet(file_path)

# Display column names to verify if "occupation" exists
print(df.columns)

# Get the top 1000 job titles based on their occurrence
top_job_titles = df['occupation'].value_counts().nlargest(1000).index.to_list()

# Initialize the feature extraction pipeline
feature_extractor = pipeline('feature-extraction', model='distilbert-base-uncased')

# Create an empty matrix to hold the embeddings
embeddings = np.empty((0, 768), float)

# Iterate over the top 1000 job titles, performing the feature extraction on each
for job_title in top_job_titles:
    embedding = feature_extractor(job_title)
    embeddings = np.append(embeddings, embedding[0], axis=0)  # Expecting embedding[0] to be a 1D vector

# Print the embeddings
print(embeddings)

Index(['public_identifier', 'profile_pic_url', 'background_cover_image_url',
       'first_name', 'last_name', 'full_name', 'occupation', 'headline',
       'summary', 'country', 'country_full_name', 'city', 'state',
       'experiences', 'education', 'languages', 'accomplishment_organisations',
       'accomplishment_publications', 'accomplishment_honors_awards',
       'accomplishment_patents', 'accomplishment_courses',
       'accomplishment_projects', 'accomplishment_test_scores',
       'volunteer_work', 'certifications', 'connections', 'people_also_viewed',
       'recommendations', 'activities', 'similarly_named_profiles', 'articles',
       'groups', 'skills', 'inferred_salary', 'github', 'facebook', 'gender',
       'birth_date', 'industry', 'interests'],
      dtype='object')
[[-0.26664466 -0.08274557 -0.32174781 ... -0.17905299  0.24714933
   0.2419768 ]
 [-0.24033551  0.10329705 -0.039351   ... -0.41274789 -0.08833237
  -0.06715534]
 [-0.20224419  0.10520907 -0.22203828 ...

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Convert numpy array into a DataFrame
embeddings_df = pd.DataFrame(embeddings)

# Save the DataFrame into a Parquet file
embeddings_df.to_parquet('/data/notebook_files/embeddings.parquet', index=False)

ValueError: ValueError: parquet must have string column names