# Use Case 2: Structuring Unstructured Data

Code authored by: Shaw Talebi

Video link: https://youtu.be/3JsgtpX_rpU <br>
Blog link: https://towardsdatascience.com/3-ai-use-cases-that-are-not-a-chatbot-f4f328a2707a

### imports

In [1]:
import polars as pl
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


### load data

In [2]:
# Load the resumes CSV into a polars DataFrame.
df = pl.read_csv("data/resumes_augmented.csv")

### prep data

In [3]:
# Clean the "Resume" text in one chained expression:
#   1. strip_chars() with no argument trims whitespace from both ends;
#   2. the regex then drops everything up to and including the first newline,
#      i.e. the first line, which carries the name. Only the first match is
#      replaced, and a resume containing no newline is left unchanged.
df = df.with_columns(
    pl.col("Resume")
    .str.strip_chars()
    .str.replace(r'^[^\n]*\n', '')
    .alias("Resume")
)

In [4]:
# Privacy: put a 0-based row number in the "Name" column instead of a name.
df = df.with_columns(pl.Series("Name", list(range(df.height))))

### Generating Embeddings

In [5]:
# Embedding model used for the resumes. The commented-out line is an
# alternative checkpoint kept for reference.
# model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(model_name_or_path=model_name)



In [6]:
# Embed every resume. The result is used below as a 2-D array:
# one row per resume, one column per embedding dimension.
embedding_arr = model.encode(df["Resume"].to_list())

# One float column per embedding dimension: Embedding-0, Embedding-1, ...
schema_dict = {f'Embedding-{i}': float for i in range(embedding_arr.shape[1])}

# orient="row" is passed explicitly. When it is omitted, polars infers the
# orientation by comparing the schema length with the array shape, which is
# ambiguous if the number of resumes happens to equal the embedding width and
# could silently transpose the data.
df_embeddings = pl.DataFrame(embedding_arr, schema=schema_dict, orient="row")

# Append the embedding columns to the right of the existing resume columns.
df = pl.concat([df, df_embeddings], how='horizontal')

In [7]:
# Preview the first five rows of the combined dataframe.
df.head(n=5)

Name,Resume,exp_level,Embedding-0,Embedding-1,Embedding-2,Embedding-3,Embedding-4,Embedding-5,Embedding-6,Embedding-7,Embedding-8,Embedding-9,Embedding-10,Embedding-11,Embedding-12,Embedding-13,Embedding-14,Embedding-15,Embedding-16,Embedding-17,Embedding-18,Embedding-19,Embedding-20,Embedding-21,Embedding-22,Embedding-23,Embedding-24,Embedding-25,Embedding-26,Embedding-27,Embedding-28,Embedding-29,Embedding-30,Embedding-31,Embedding-32,Embedding-33,…,Embedding-731,Embedding-732,Embedding-733,Embedding-734,Embedding-735,Embedding-736,Embedding-737,Embedding-738,Embedding-739,Embedding-740,Embedding-741,Embedding-742,Embedding-743,Embedding-744,Embedding-745,Embedding-746,Embedding-747,Embedding-748,Embedding-749,Embedding-750,Embedding-751,Embedding-752,Embedding-753,Embedding-754,Embedding-755,Embedding-756,Embedding-757,Embedding-758,Embedding-759,Embedding-760,Embedding-761,Embedding-762,Embedding-763,Embedding-764,Embedding-765,Embedding-766,Embedding-767
i64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,"""Data & Analytics Leader Lubboc…",5,-0.145309,0.297774,-0.000305,0.012916,0.090752,-0.015084,-0.113963,-0.051909,0.000356,0.036491,0.084914,-0.024203,-0.00889,0.143781,-0.122697,-0.050741,-0.071565,-0.014304,0.002142,0.077232,0.072558,-0.084106,0.068583,-0.126012,0.00711,-0.022222,-0.093983,0.078837,0.078672,0.025793,0.028212,0.058342,0.004703,0.045366,…,-0.060133,0.340738,0.091177,-0.085834,-0.020368,0.203635,-0.115464,-0.053299,-0.0133,-0.005066,0.069525,0.151514,-0.068514,-0.015893,-0.071308,0.048105,0.009991,0.127171,0.006124,-0.027106,-0.08621,-0.109924,0.083972,-0.075978,0.039498,-0.096002,-0.050898,0.155162,0.09774,-0.004555,0.078997,-0.038528,0.100873,0.023524,0.102765,-0.057972,-0.032112
1,"""AI/ML Data Scientist | Passion…",5,-0.18058,0.266733,0.003029,-0.019758,0.139987,-0.014073,0.067242,-0.027488,0.192167,0.046374,0.017618,0.148419,0.053368,0.253952,-0.074314,-0.063605,-0.026669,0.10324,0.102313,0.042091,0.129558,-0.079425,0.072485,-0.038063,-0.126314,-0.104152,-0.037971,0.000461,0.025357,0.026016,0.065926,-0.104646,0.029816,0.050765,…,-0.046485,0.091016,0.090034,0.103039,-0.055804,-0.126557,-0.012091,-0.068539,-0.031265,-0.063245,0.110759,0.031252,-0.020652,-0.04162,-0.127778,0.047364,-0.111438,-0.070242,0.003464,-0.17552,-0.020314,-0.092564,0.13779,-0.004003,0.004403,-0.207595,-0.166176,0.032812,-0.020121,0.128239,-0.019487,-0.09244,-0.075839,0.041534,-0.013134,0.013094,-0.127653
2,"""AI | Machine Learning | Data S…",5,-0.134793,0.190827,0.003539,-0.029401,0.164353,0.004289,0.19708,-0.130383,0.17565,0.02623,0.061826,-0.061218,0.023536,0.354023,0.052247,-0.059917,-0.013633,-0.040548,-0.00207,0.048283,0.10905,-0.089975,0.044013,0.015057,-0.085836,-0.077551,-0.034109,-0.008148,0.061832,-0.010138,0.073756,-0.076712,0.106451,-0.058882,…,0.013235,0.164419,0.11071,0.103071,0.079208,-0.275036,-0.023325,-0.073866,0.000583,0.039853,0.119304,-0.055936,-0.029225,-0.050778,-0.051295,0.023356,-0.018853,0.136633,-0.016791,-0.170485,-0.006889,-0.173121,0.042028,-0.100252,-0.025195,-0.151706,-0.048992,0.026046,0.026691,0.165579,0.056522,-0.089286,0.004786,0.064061,0.031188,-0.065916,-0.060417
3,"""Staff Data Scientist | Gen-AI …",5,-0.092479,0.214494,0.001039,-0.074435,0.135741,0.006723,0.083083,-0.050211,0.215038,-0.008108,0.020558,0.087591,-0.065646,0.222953,-0.051499,0.017689,-0.093303,0.073354,-0.180645,0.062729,0.058208,-0.044257,0.084877,-0.030756,-0.096932,-0.108332,-0.012627,0.091054,0.099713,0.052615,0.010536,-0.079987,-0.044301,0.06629,…,-0.038721,0.19331,0.072767,0.216869,-0.046042,-0.146964,-0.110477,-0.042867,-0.041797,-0.021769,0.111559,0.059306,-0.084834,-0.020393,-0.141657,0.104711,0.010105,0.032513,-0.00887,-0.105544,0.039329,-0.05539,0.085662,0.050851,-0.048142,-0.166495,-0.058678,0.017174,0.042527,0.115724,0.028265,-0.070562,0.07337,0.055735,0.042163,0.043192,-0.071806
4,"""Principal Data Scientist @ Wal…",4,-0.103386,0.243209,0.006515,-0.041919,0.180505,0.02335,0.061281,-0.046345,0.181609,-0.028613,0.091647,-0.024719,-0.01575,0.274301,-0.059843,0.01471,0.013136,0.014584,-0.04748,0.065718,0.043272,-0.041704,0.067709,-0.054167,-0.112671,-0.115934,-0.076242,-0.016606,0.054981,0.02602,0.076148,-0.089251,0.07152,0.056113,…,-0.012516,0.060239,0.08834,-0.097451,-0.070028,-0.006287,0.014427,-0.055792,0.025938,-0.015007,0.122045,0.121847,-0.06916,-0.052547,-0.014747,0.147139,-0.072065,-0.110122,0.023633,-0.183421,0.026936,-0.124461,-0.005087,0.004262,0.040126,-0.171985,-0.032532,0.148718,0.155715,0.002663,-0.053827,-0.04091,0.009788,0.033336,0.068966,-0.017775,-0.111932


In [8]:
# Persist the resumes together with their embedding columns as CSV.
# NOTE(review): the file name spells "augemented" (sic); it is kept as-is here
# because other code may read this exact path - confirm before renaming.
df.write_csv("data/resumes_augemented_structured.csv")