In [1]:
!pip install pandas numpy scikit-learn




In [8]:
import pandas as pd

# Toy corpus: five short job postings, each keyed by an integer job_id.
data = dict(
    job_id=[1, 2, 3, 4, 5],
    job_description=[
        'Software engineer with experience in C++ and Blockchain.',
        'Data scientist skilled in statistics and data visualization.',
        'Frontend developer experienced with React and JavaScript.',
        'Backend developer with knowledge of databases and server management.',
        'Full stack developer with experience in both frontend and backend technologies.',
    ],
)

# One row per posting; columns follow the key order of `data`.
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,job_id,job_description
0,1,Software engineer with experience in C++ and B...
1,2,Data scientist skilled in statistics and data ...
2,3,Frontend developer experienced with React and ...
3,4,Backend developer with knowledge of databases ...
4,5,Full stack developer with experience in both f...


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the job descriptions: one row per posting,
# one column per vocabulary term.
# NOTE(review): the vectorizer runs with its defaults, so the default
# token_pattern (tokens of 2+ word characters) yields no feature for "C++",
# and no stop-word list is applied, so words such as "with", "in" and "and"
# stay in the vocabulary and contribute to the similarities computed later.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=df['job_description'])

# Densify the fitted matrix so the weights render in the cell output.
tfidf_matrix.toarray()

array([[0.22111946, 0.        , 0.46404402, 0.        , 0.        ,
        0.        , 0.        , 0.46404402, 0.37438779, 0.        ,
        0.        , 0.        , 0.31077569, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.46404402, 0.        , 0.        , 0.        ,
        0.        , 0.2614343 ],
       [0.16177774, 0.        , 0.        , 0.        , 0.67901749,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.22737296, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.33950874, 0.        ,
        0.33950874, 0.        , 0.        , 0.33950874, 0.        ,
        0.33950874, 0.        ],
       [0.22111946, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.31077569, 0.        , 0.        , 0.46404402,
        0.37438779, 0.        , 0.        , 0.46404402, 0.        ,
        0.        , 0.        , 0.46404402, 0.    

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all TF-IDF rows; entry [i, j] compares
# posting i with posting j.
cosine_sim = cosine_similarity(X=tfidf_matrix)
cosine_sim

array([[1.        , 0.10643419, 0.11724171, 0.09801936, 0.28053341],
       [0.10643419, 1.        , 0.03577221, 0.02990718, 0.08434813],
       [0.11724171, 0.03577221, 1.        , 0.17876587, 0.28053341],
       [0.09801936, 0.02990718, 0.17876587, 1.        , 0.23453858],
       [0.28053341, 0.08434813, 0.28053341, 0.23453858, 1.        ]])

In [11]:
# Label both axes with job_id so each cell reads as
# "similarity between job <row> and job <column>".
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    columns=df['job_id'],
    index=df['job_id'],
)
cosine_sim_df

job_id,1,2,3,4,5
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.106434,0.117242,0.098019,0.280533
2,0.106434,1.0,0.035772,0.029907,0.084348
3,0.117242,0.035772,1.0,0.178766,0.280533
4,0.098019,0.029907,0.178766,1.0,0.234539
5,0.280533,0.084348,0.280533,0.234539,1.0


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-Words baseline: represent each posting by raw term counts.
# NOTE(review): CountVectorizer runs with its defaults here, so the default
# token_pattern (tokens of 2+ word characters) yields no feature for "C++"
# and no stop-word list is applied.
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(raw_documents=df['job_description'])

# Pairwise cosine similarity between the count vectors.
cosine_sim_bow = cosine_similarity(X=count_matrix)

# Same job_id labelling on both axes as the TF-IDF similarity table.
cosine_sim_bow_df = pd.DataFrame(
    data=cosine_sim_bow,
    columns=df['job_id'],
    index=df['job_id'],
)
cosine_sim_bow_df

job_id,1,2,3,4,5
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.0,0.239046,0.285714,0.251976,0.455842
2,0.239046,1.0,0.119523,0.105409,0.190693
3,0.285714,0.119523,1.0,0.377964,0.455842
4,0.251976,0.105409,0.377964,1.0,0.402015
5,0.455842,0.190693,0.455842,0.402015,1.0


In [14]:
# Put the two similarity tables next to each other; the outer column level
# names the representation each block of columns came from.
comparison_df = pd.concat(
    objs=[cosine_sim_df, cosine_sim_bow_df],
    keys=['TF-IDF', 'Bag-of-Words'],
    axis='columns',
)
comparison_df

Unnamed: 0_level_0,TF-IDF,TF-IDF,TF-IDF,TF-IDF,TF-IDF,Bag-of-Words,Bag-of-Words,Bag-of-Words,Bag-of-Words,Bag-of-Words
job_id,1,2,3,4,5,1,2,3,4,5
job_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1,1.0,0.106434,0.117242,0.098019,0.280533,1.0,0.239046,0.285714,0.251976,0.455842
2,0.106434,1.0,0.035772,0.029907,0.084348,0.239046,1.0,0.119523,0.105409,0.190693
3,0.117242,0.035772,1.0,0.178766,0.280533,0.285714,0.119523,1.0,0.377964,0.455842
4,0.098019,0.029907,0.178766,1.0,0.234539,0.251976,0.105409,0.377964,1.0,0.402015
5,0.280533,0.084348,0.280533,0.234539,1.0,0.455842,0.190693,0.455842,0.402015,1.0
