In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
import pickle

In [2]:
# PostgreSQL connection details
db_url = 'postgresql://terrox:1205@localhost:5432/majduri'
engine = create_engine(db_url)

In [3]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7fdebfeef170>

In [4]:
df = pd.read_sql("SELECT * FROM table_1", engine)

In [5]:
df.head()

Unnamed: 0,skills,salary_range,qualifications,experience,job_title,job_id,location,latitude,longitude,work_type,job_posting_date,preference,contact_person,contact,combined
0,warehouse_management crane_operation loading/u...,"₹12,000 - ₹30,000",NILL or 10th,4_to_11_years,dock_worker,1089840000000000,osmanabad,15.6066,74.4717,Part-Time,24-04-2022,female,Vihaan Kumar,+91-6338887517,warehouse_management crane_operation loading/u...
1,order_taking cash_handling multitasking menu_k...,"₹8,000 - ₹20,000",NILL or 10th,2_to_7_years,waiter,398454000000000,latur,18.1643,72.8138,Part-Time,19-12-2022,female,Vihaan Mehta,+91-8495848693,order_taking cash_handling multitasking menu_k...
2,farming_techniques irrigation basic_machinery_...,"₹10,000 - ₹25,000",NILL or 10th,0_to_9_years,agricultural_worker,481640000000000,nandurbar,20.0181,78.7996,Full-Time,14-09-2022,male,Priya Reddy,+91-9800897411,farming_techniques irrigation basic_machinery_...
3,safety_practices electrical_systems welding ve...,"₹15,000 - ₹30,000",ITI or 10th,9_to_10_years,mechanic,688193000000000,akola,16.5614,78.2276,Full-Time,25-02-2023,female,Neha Gupta,+91-7104739198,safety_practices electrical_systems welding ve...
4,teamwork cooking_techniques time_management fo...,"₹12,000 - ₹25,000",NILL or 10th,4_to_6_years,cook,117058000000000,wardha,21.812,73.2671,Part-Time,11-10-2022,female,Siddharth Gupta,+91-9723674124,teamwork cooking_techniques time_management fo...


# VECTORIZER

In [67]:
vectorizer = TfidfVectorizer(stop_words='english')

weights = {"skills" : 5,
           "location" : 1.3,
           "experience" : 3,
           "preference" : 2.5,
           }

# assigning weight to a column to indicate its priority

In [68]:
df["combined"] = df["skills"] + " " + df["experience"] + " " + df['location'] + " " + df["preference"]

df.iloc[0].iloc[-1]

# combined string in the order -> skills location experience preference (for vectorization)

'warehouse_management crane_operation loading/unloading 4_to_11_years osmanabad female'

In [69]:
tfidf_matrix = vectorizer.fit_transform(df["combined"])

# creating a vectorized matrix of size of the dataframe (size of df, total features)

In [70]:
tfidf_matrix.shape

(9999, 212)

In [71]:
feature_names = vectorizer.get_feature_names_out()

len(feature_names)

212

In [72]:
weight_vector = np.ones(len(feature_names))

# creating an array of 1s of the size of feature_names

In [73]:
skills_words = " ".join(df["skills"].dropna()).split()
experience_words = " ".join(df["experience"].dropna().astype(str)).split()
location_words = " ".join(df["location"].dropna()).split()
preference_words = " ".join(df["preference"].dropna()).split()

# creating a map of a word and its category
word_category_map = {}

for word in skills_words:
    word_category_map[word] = "skills"

for word in experience_words:
    word_category_map[word] = "experience"

for word in location_words:
    word_category_map[word] = "location"

for word in preference_words:
    word_category_map[word] = "preference"
    
# 
for i, term in enumerate(feature_names):
    category = word_category_map.get(term)  # Get the category of the term
    if category:
        weight_vector[i] *= weights[category] # apply the weight


"""
kinda have to explain this one, no?
here, size of ifidf_matrix is (9999, 193) ie 10000 rows 193 columns, and weight_matrix (193, 1)

dataset has 10000 rows, and our feature_names size is 193, thus we can multiply them (vector product)
"""
tfidf_matrix = tfidf_matrix.multiply(weight_vector)

tfidf_matrix

<COOrdinate sparse matrix of dtype 'float64'
	with 70860 stored elements and shape (9999, 212)>

In [74]:
def location_similarity(loc1, loc2):
    return fuzz.ratio(loc1, loc2) / 100 # value between 0-1

In [75]:
def recommend_job(skills, experience, location, preference, top_n=30):
    query = "SELECT * FROM table_1"
    df = pd.read_sql(query, engine)

    combined_str = ' '.join(skills) + " " + experience + " " + location + " " + preference
    user_vector = vectorizer.transform([combined_str])
    
    tfidf_similarities = cosine_similarity(user_vector, tfidf_matrix).flatten()

    location_scores = df["location"].apply(lambda job_loc: location_similarity(job_loc, location))

    refined_scores = tfidf_similarities * location_scores

    top_indices = refined_scores.argsort()[-top_n:][::-1]

    return df.iloc[top_indices][["job_title", "skills", "salary_range", "location", "preference", "experience"]]

In [76]:
recommend_job(['cleaning', 'laundry', 'cooking'], "3_to_4_years", "mumbai", "female")

Unnamed: 0,job_title,skills,salary_range,location,preference,experience
9698,maid,cleaning laundry organization,"₹8,000 - ₹25,000",mumbai,female,5_to_10_years
1102,maid,laundry time_management cleaning,"₹8,000 - ₹25,000",mumbai,both,8_to_10_years
3920,maid,time_management organization attention_to_detail,"₹8,000 - ₹25,000",mumbai,both,3_to_4_years
5429,maid,housekeeping cleaning organization laundry tim...,"₹8,000 - ₹25,000",mumbai,female,9_to_11_years
1606,maid,time_management attention_to_detail laundry cl...,"₹8,000 - ₹25,000",mumbai,male,6_to_10_years
5207,maid,time_management laundry attention_to_detail cl...,"₹8,000 - ₹25,000",mumbai,male,4_to_8_years
3168,maid,laundry cleaning housekeeping organization,"₹8,000 - ₹25,000",mumbai,male,7_to_12_years
5744,maid,laundry time_management housekeeping cleaning ...,"₹8,000 - ₹25,000",mumbai,female,3_to_5_years
9319,maid,laundry attention_to_detail housekeeping clean...,"₹8,000 - ₹25,000",mumbai,female,1_to_7_years
6698,maid,laundry time_management organization cleaning ...,"₹8,000 - ₹25,000",mumbai,female,2_to_9_years


In [50]:
def add_job(job_title, skills, salary, location, preference, experience):

    skills_str = ' '.join(skills)
    
    query = text(f"""
    INSERT INTO table_1 (job_title, skills, salary_range, location, preference, experience)
    VALUES ('{job_title}','{skills_str}', '{salary}', '{location}', '{preference}', '{experience}')
    """)
    
    with engine.connect() as conn:
        conn.execute(query, {
            "skills" : skills_str,
            "job_title" : job_title,
            "salary_range" : salary,
            "location" : location,
            "preference" : preference,
            "experience" : experience
        })
        conn.commit()

In [51]:
add_job('maid', ['cleaning', 'laundry', 'time_management', 'housekeeping'],'₹10,000 - ₹35,000', 'thane', 'female', '0_to_1_years')

In [21]:
df.head()

NameError: name 'df' is not defined