In [1]:
import pandas as pd
import numpy as np
import requests
import pickle
import torch
import math
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path_lawyer = './data/lsp_profiles.csv'
data_path_user = './data/user_profiles.csv'

In [3]:
df_lsp = pd.read_csv(data_path_lawyer)
df_lsp_transf = df_lsp.copy(deep=False)
df_lsp

Unnamed: 0,name,area_of_expertise,location,availability,years_of_experience,languages_spoken,fee_structure
0,Natalie Riley,Corporate Law,Hyderabad,Part-time,17,Hindi,9528
1,Anthony Dougherty,Corporate Law,Nagpur,Part-time,27,Urdu,39983
2,Derek Odom,Corporate Law,Thane,Full-time,3,Konkani,35628
3,Allen King,Corporate Law,Mumbai,Full-time,13,Sanskrit,3030
4,Sandra Mitchell,Corporate Law,Thane,Part-time,6,Tamil,21786
...,...,...,...,...,...,...,...
95,Daniel Watts Jr.,Intellectual Property,Chennai,Full-time,8,Kannada,45509
96,Christopher Woods,Intellectual Property,Delhi,Part-time,27,Punjabi,27766
97,Jose Wilson,Corporate Law,Delhi,Full-time,28,Sanskrit,13117
98,Henry Myers,Criminal Law,Pune,Full-time,1,Marathi,41642


In [4]:
df_user = pd.read_csv(data_path_user)
df_user_transf = df_user.copy(deep=False)
df_user

Unnamed: 0,name,legal_needs,location,availability,experience_level,preferred_language,budget_constraints
0,Laura Bryan,Divorce,Lucknow,Part-time,1.0,Punjabi,32870
1,Kaitlyn Hall,Criminal Defense,Firozabad,Full-time,0.5,Malayalam,36239
2,Robert Greene,Divorce,Indore,Part-time,0.0,Hindi,21363
3,Julian Dorsey,Divorce,Kanpur,Part-time,0.5,Gujarati,25662
4,Wesley Joseph,Divorce,Thane,Part-time,0.5,English,36906
...,...,...,...,...,...,...,...
195,Laura Steele,Contract Review,Delhi,Full-time,0.0,Tamil,45691
196,James Johnson,Contract Review,Mumbai,Full-time,0.0,Kannada,26330
197,Patrick Powers,Divorce,Bangalore,Part-time,0.0,Kashmiri,16894
198,Diana Lopez,Contract Review,Chennai,Full-time,0.0,Bengali,13278


In [5]:
def get_lat_long(location):
    # Define the URL for the Nominatim API request with your string address
    # address = "Bangalore"
    encoded_address = requests.utils.quote(location)  # URL encode the address
    url = f"https://nominatim.openstreetmap.org/search?q={encoded_address}&format=json"

    # Make the GET request
    response = requests.get(url)

    # Check if the request was successful (HTTP status code 200)
    if response.status_code == 200:
        data = response.json()
        # Process the response data to extract latitude and longitude
        if len(data) > 0:
            lat = data[0]['lat']
            lon = data[0]['lon']
            return float(lat), float(lon)
        else:
            print("Location not found.")
            return None
    else:
        print("Error: Unable to retrieve data from Nominatim API.")
        return None

In [6]:
# # Uncomment to precompute
# lat_long_mapping = {}

# for city in set(df_lsp['location']):
#     lat_long_mapping[city] = get_lat_long(city)
    
# for city in set(df_user['location']):
#     lat_long_mapping[city] = get_lat_long(city)
    
# with open('lat_long_mapping.pkl', 'wb') as fp:
#     pickle.dump(lat_long_mapping, fp)
#     print('dictionary saved successfully to file')

In [7]:
# Read dictionary pkl file
with open('lat_long_mapping.pkl', 'rb') as fp:
    lat_long_mapping = pickle.load(fp)

In [8]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode the area of expertise of the lawyer 
# Also encode the legal needs of the user
df_lsp_transf['area_of_expertise'] = df_lsp_transf['area_of_expertise'].apply(lambda phrase: 
                                                                model.encode(phrase, convert_to_tensor=True)
                                                               )
df_user_transf['legal_needs'] = df_user_transf['legal_needs'].apply(lambda phrase: 
                                                      model.encode(phrase, convert_to_tensor=True))

# Encode the years of experience of a lawyer into the vector
min_years = df_lsp_transf['years_of_experience'].min()
max_years = df_lsp_transf['years_of_experience'].max()
df_lsp_transf['years_of_experience'] = (df_lsp_transf['years_of_experience'] - min_years) / (max_years - min_years)

# Encode the availability of the lawyer into the vector
df_lsp_transf['availability'] = df_lsp_transf['availability'].apply(lambda job_type: 
                                                      1 if job_type == "Full-time" else 0)
df_user_transf['availability'] = df_user_transf['availability'].apply(lambda job_type: 
                                                        1 if job_type == "Full-time" else 0)

# Encode the languages spoken and preferred language into a vector
df_lsp_transf['languages_spoken'] = df_lsp_transf['languages_spoken'].apply(lambda phrase: 
                                                              model.encode(phrase, convert_to_tensor=True)
                                                             )
df_user_transf['preferred_language'] = df_user_transf['preferred_language'].apply(lambda phrase: 
                                                                    model.encode(phrase, convert_to_tensor=True)
                                                                   )

# Encode the location into latitude and longitude
df_lsp_transf['location'] = df_lsp_transf['location'].apply(lambda location: 
                                              lat_long_mapping[location])
df_user_transf['location'] = df_user_transf['location'].apply(lambda location: 
                                                lat_long_mapping[location])


In [9]:
phrase1 = "Divorce"
phrase2 = 'Criminal law'

embedding1 = model.encode(phrase1, convert_to_tensor=True)
embedding2 = model.encode(phrase2, convert_to_tensor=True)
print(util.pytorch_cos_sim(embedding1, embedding2).item())

0.2004368007183075


In [15]:
user_id = 1
user = df_user_transf.iloc[user_id]
similarity_dict = {}

# Ranking criteria for features of the model
# 1. area of expertise and location 
# 2. fee structure and years of experience
# 3. language spoken
# 4. Availability
weights = {
    "name": 0, 
    "area_of_expertise": 6, 
    "location": 4, 
    "availability": 1, 
    "years_of_experience": 3, 
    "languages_spoken": 2, 
    "fee_structure": 3
}

for index in range(len(df_lsp_transf)):
    similarity = 0
    
    for (lsp_col, user_col) in zip(df_lsp.columns, df_user.columns):
        
        lsp_feature = df_lsp_transf.iloc[index][lsp_col]
        user_feature = user[user_col]
        
        if lsp_col == 'name':
            continue
        
        if isinstance(lsp_feature, tuple):
            lsp_feature = torch.tensor(lsp_feature)
            
        if isinstance(user_feature, tuple):
            user_feature = torch.tensor(user_feature)
        
        if isinstance(lsp_feature, (np.int64, np.float64, int)):
            # Calculate distance
            distance = (lsp_feature - user_feature)
            
            # map it to range (-1, 1) with sigmoid
            mapped_distance = math.tanh(distance)
            similarity += weights[lsp_col] * mapped_distance
        
        else:
            similarity += weights[lsp_col] * util.pytorch_cos_sim(lsp_feature, user_feature).item()
    
    similarity_dict[index] = similarity

In [16]:
print("User:")
print(df_user.iloc[user_id])

sorted_dict_desc = dict(sorted(similarity_dict.items(), key=lambda item: item[1], reverse=True))

print("\nLawyers Recommended:")
for key in sorted_dict_desc.keys():
    print('----------------')
    print(df_lsp.iloc[key])

User:
name                      Kaitlyn Hall
legal_needs           Criminal Defense
location                     Firozabad
availability                 Full-time
experience_level                   0.5
preferred_language           Malayalam
budget_constraints               36239
Name: 1, dtype: object

Lawyers Recommended:
----------------
name                    Lisa Snyder
area_of_expertise      Criminal Law
location                  Firozabad
availability              Full-time
years_of_experience               7
languages_spoken          Malayalam
fee_structure                 40843
Name: 40, dtype: object
----------------
name                   Mr. Joe Green
area_of_expertise       Criminal Law
location                   Ahmedabad
availability               Full-time
years_of_experience               12
languages_spoken             Konkani
fee_structure                  39246
Name: 23, dtype: object
----------------
name                    Evan Rivera
area_of_expertise      Crimina