### Dataset

In [None]:
import pandas as pd
import random

num_tuples = 100000

user_ids = [random.randint(1000, 9999) for _ in range(num_tuples)]
expert_ids = [random.randint(2000, 2999) for _ in range(num_tuples)]
years_of_exp = [random.randint(1, 30) for _ in range(num_tuples)]
fees_per_hour = [random.randint(100, 500) for _ in range(num_tuples)]
ratings = [round(random.uniform(3.0, 5.0), 1) for _ in range(num_tuples)]
specialization_domain = [random.choice(["Corporate Law", "Intellectual Property", "Family Law", "Criminal Law", "Real Estate Law", "Immigration Law", "Personal Injury Law", "Environmental Law", "Tax Law", "Employment Law", "Contract Law", "Medical Malpractice Law", "Divorce Law", "Estate Planning Law", "Bankruptcy Law", "Personal Bankruptcy Law", "International Law"]) for _ in range(num_tuples)]
locations = [random.choice(["Delhi", "Mumbai", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune", "Ahmedabad", "Jaipur", "Lucknow", "Chandigarh"]) for _ in range(num_tuples)]  # Indian cities
num_documents_reviewed = [random.randint(50, 300) for _ in range(num_tuples)]
average_response_time = [round(random.uniform(1.0, 5.0), 1) for _ in range(num_tuples)]
num_clients_served = [random.randint(10, 500) for _ in range(num_tuples)]
user_domain = [random.choice(["Divorce", "Criminal", "Immigration", "Corporate", "Real Estate", "Family", "Tax", "Employment", "Contract", "Personal Injury"]) for _ in range(num_tuples)]
user_fee_capability = [random.randint(100, 500) for _ in range(num_tuples)]

data = {
    'User_ID': user_ids,
    'Expert_ID': expert_ids,
    'Years_of_Exp': years_of_exp,
    'Fees_per_Hour': fees_per_hour,
    'Rating': ratings,
    'Specialization_Domain': specialization_domain,
    'Location': locations,
    'Num_Documents_Reviewed': num_documents_reviewed,
    'Average_Response_Time': average_response_time,
    'Num_Clients_Served': num_clients_served,
    'User_Domain': user_domain,
    'User_Fee_Capability': user_fee_capability,
}

df = pd.DataFrame(data)

def estimate_percentage_match(row):
    if row['User_Domain'] == row['Specialization_Domain'] and abs(row['Fees_per_Hour'] - row['User_Fee_Capability']) < 50:
        return round(random.uniform(0.7, 1.0), 2)
    else:
        return round(random.uniform(0.0, 0.3), 2)

df['Percentage_Match'] = df.apply(estimate_percentage_match, axis=1)

df.to_csv('legal_data.csv', index=False)


### Model Implementation

In [None]:
import pandas as pd
import random

data = {
    'Lawyer_ID': range(1, 51),
    'Name': [f'Lawyer_{i}' for i in range(1, 51)],
    'Domain': [random.choice(['Criminal', 'Family', 'Business', 'Real Estate']) for _ in range(50)],
    'Fees': [random.randint(1000, 5000) for _ in range(50)],
}

lawyers_df = pd.DataFrame(data)


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

encoder = OneHotEncoder(sparse=False)
domain_encoded = encoder.fit_transform(lawyers_df[['Domain']])

scaler = StandardScaler()
fees_scaled = scaler.fit_transform(lawyers_df[['Fees']])


In [None]:
import numpy as np

features = np.concatenate([domain_encoded, fees_scaled], axis=1)

def calculate_similarity(user_profile, lawyer_profiles):
    return np.dot(lawyer_profiles, user_profile.reshape(-1, 1)).flatten()

user_profile = np.array([1, 0, 0, 1, 3000])

similarity_scores = calculate_similarity(user_profile, features)

top_lawyers_indices = np.argsort(similarity_scores)[::-1][:5]
top_lawyers = lawyers_df.iloc[top_lawyers_indices]

print("Top 5 Recommended Lawyers:")
print(top_lawyers)


Top 5 Recommended Lawyers:
    Lawyer_ID       Name       Domain  Fees
7           8   Lawyer_8     Business  4984
35         36  Lawyer_36  Real Estate  4915
1           2   Lawyer_2  Real Estate  4861
21         22  Lawyer_22     Business  4804
27         28  Lawyer_28     Business  4755


In [None]:
new_data = {
    'Name': ['New_Lawyer_1', 'New_Lawyer_2', 'New_Lawyer_3', 'New_Lawyer_4', 'New_Lawyer_5',
             'New_Lawyer_6', 'New_Lawyer_7', 'New_Lawyer_8', 'New_Lawyer_9', 'New_Lawyer_10'],
    'Domain': ['Criminal', 'Family', 'Business', 'Real Estate', 'Criminal',
               'Family', 'Business', 'Real Estate', 'Criminal', 'Family'],
    'Fees': [2500, 3500, 4500, 4000, 3000, 3200, 2800, 3800, 4200, 3700],
}

new_lawyers_df = pd.DataFrame(new_data)

new_domain_encoded = encoder.transform(new_lawyers_df[['Domain']])
new_fees_scaled = scaler.transform(new_lawyers_df[['Fees']])

new_features = np.concatenate([new_domain_encoded, new_fees_scaled], axis=1)

new_similarity_scores = np.dot(new_features, features.T)
top_5_lawyers_indices = np.argsort(new_similarity_scores, axis=1)[:, -5:]

for i, top_indices in enumerate(top_5_lawyers_indices):
    print(f"Top 5 Recommended Lawyers for USer {i}:")
    print(lawyers_df.iloc[top_indices])
    print()


Top 5 Recommended Lawyers for USer 0:
    Lawyer_ID       Name    Domain  Fees
28         29  Lawyer_29    Family  1073
9          10  Lawyer_10  Criminal  2015
13         14  Lawyer_14  Criminal  1521
20         21  Lawyer_21  Criminal  1398
43         44  Lawyer_44  Criminal  1277

Top 5 Recommended Lawyers for USer 1:
    Lawyer_ID       Name  Domain  Fees
37         38  Lawyer_38  Family  3528
29         30  Lawyer_30  Family  3808
46         47  Lawyer_47  Family  3854
18         19  Lawyer_19  Family  4383
33         34  Lawyer_34  Family  4519

Top 5 Recommended Lawyers for USer 2:
    Lawyer_ID       Name    Domain  Fees
12         13  Lawyer_13  Business  4326
32         33  Lawyer_33  Business  4352
27         28  Lawyer_28  Business  4755
21         22  Lawyer_22  Business  4804
7           8   Lawyer_8  Business  4984

Top 5 Recommended Lawyers for USer 3:
    Lawyer_ID       Name       Domain  Fees
47         48  Lawyer_48  Real Estate  3691
2           3   Lawyer_3  Real 

### Another Method

In [None]:

lawyers_data = [
    {
        "name": "Lawyer A",
        "domain": "Criminal Defense",
        "fee_capability": "High",
        "expertise": "Criminal Law",
        "years_of_exp": 10,
        "rating": 4.8,
        "num_documents_reviewed": 100,
        "average_response_time": 2,
        "num_clients_served": 50,
    },
    {
        "name": "Lawyer B",
        "domain": "Family Law",
        "fee_capability": "Medium",
        "expertise": "Family Law",
        "years_of_exp": 8,
        "rating": 4.9,
        "num_documents_reviewed": 120,
        "average_response_time": 1,
        "num_clients_served": 60,
    },
    {
        "name": "Lawyer C",
        "domain": "Criminal Defense",
        "fee_capability": "High",
        "expertise": "Criminal Law",
        "years_of_exp": 12,
        "rating": 4.7,
        "num_documents_reviewed": 80,
        "average_response_time": 3,
        "num_clients_served": 45,
    },
]


user_domain = "Criminal Defense"
user_fee_capability = "High"

filtered_lawyers = [lawyer for lawyer in lawyers_data if lawyer["domain"] == user_domain and lawyer["fee_capability"] == user_fee_capability]

filtered_lawyers.sort(key=lambda x: (-x["rating"], -x["years_of_exp"], -x["num_documents_reviewed"], x["average_response_time"], -x["num_clients_served"]))

top_5_lawyers = filtered_lawyers[:5]

for i, lawyer in enumerate(top_5_lawyers, start=1):
    print(f"Top {i} Lawyer: {lawyer['name']}")
    print(f"Expertise: {lawyer['expertise']}")
    print(f"Years of Experience: {lawyer['years_of_exp']} years")
    print(f"Rating: {lawyer['rating']}")
    print(f"Number of Documents Reviewed: {lawyer['num_documents_reviewed']}")
    print(f"Average Response Time: {lawyer['average_response_time']} hours")
    print(f"Number of Clients Served: {lawyer['num_clients_served']}")
    print()


Top 1 Lawyer: Lawyer A
Expertise: Criminal Law
Years of Experience: 10 years
Rating: 4.8
Number of Documents Reviewed: 100
Average Response Time: 2 hours
Number of Clients Served: 50

Top 2 Lawyer: Lawyer C
Expertise: Criminal Law
Years of Experience: 12 years
Rating: 4.7
Number of Documents Reviewed: 80
Average Response Time: 3 hours
Number of Clients Served: 45



### ML

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import random

In [None]:
def generate_lawyer():
    name = f"Lawyer {chr(random.randint(65, 90))}"
    domain = random.choice(["Criminal Defense", "Family Law", "Immigration Law", "Real Estate", "Employment Law", "Estate Planning", "Intellectual Property"])
    fee_capability = random.choice(["Low", "Medium", "High"])
    expertise = domain
    years_of_exp = random.randint(1, 20)
    rating = round(random.uniform(3.0, 5.0), 1)
    num_documents_reviewed = random.randint(50, 150)
    average_response_time = round(random.uniform(1.0, 4.0), 1)
    num_clients_served = random.randint(20, 100)

    return {
        "name": name,
        "domain": domain,
        "fee_capability": fee_capability,
        "expertise": expertise,
        "years_of_exp": years_of_exp,
        "rating": rating,
        "num_documents_reviewed": num_documents_reviewed,
        "average_response_time": average_response_time,
        "num_clients_served": num_clients_served,
    }

In [None]:
lawyers_data = [generate_lawyer() for _ in range(5000)]
df = pd.DataFrame(lawyers_data)

In [None]:
df.head

<bound method NDFrame.head of           name           domain fee_capability        expertise  years_of_exp  \
0     Lawyer F  Estate Planning           High  Estate Planning            19   
1     Lawyer W      Real Estate            Low      Real Estate             1   
2     Lawyer O  Estate Planning         Medium  Estate Planning            18   
3     Lawyer D  Estate Planning            Low  Estate Planning             8   
4     Lawyer F  Immigration Law         Medium  Immigration Law             5   
...        ...              ...            ...              ...           ...   
4995  Lawyer X  Immigration Law            Low  Immigration Law            16   
4996  Lawyer O       Family Law            Low       Family Law            17   
4997  Lawyer F      Real Estate           High      Real Estate            17   
4998  Lawyer J      Real Estate           High      Real Estate             2   
4999  Lawyer E      Real Estate            Low      Real Estate             6  

In [None]:
features = [
    "years_of_exp",
    "rating",
    "num_documents_reviewed",
    "average_response_time",
    "num_clients_served",
]


In [None]:
user_domain = input("Enter the domain: ")
user_fee_capability = input("Enter the fee capability(Low, Medium, High): ")

Enter the domain: Real Estate
Enter the fee capability (Low, Medium, High): Low


In [None]:
filtered_lawyers = df[(df["domain"]== user_domain) & (df["fee_capability"] == user_fee_capability)].copy()
filtered_lawyers

Unnamed: 0,name,domain,fee_capability,expertise,years_of_exp,rating,num_documents_reviewed,average_response_time,num_clients_served
1,Lawyer W,Real Estate,Low,Real Estate,1,4.0,63,2.6,87
35,Lawyer X,Real Estate,Low,Real Estate,6,3.6,138,1.8,37
72,Lawyer Z,Real Estate,Low,Real Estate,5,5.0,141,2.6,71
109,Lawyer C,Real Estate,Low,Real Estate,20,4.3,88,2.0,43
164,Lawyer B,Real Estate,Low,Real Estate,7,4.1,101,2.1,84
...,...,...,...,...,...,...,...,...,...
4904,Lawyer J,Real Estate,Low,Real Estate,11,3.5,137,2.2,85
4918,Lawyer B,Real Estate,Low,Real Estate,18,3.5,129,3.6,68
4945,Lawyer W,Real Estate,Low,Real Estate,18,4.8,61,1.4,79
4988,Lawyer C,Real Estate,Low,Real Estate,9,4.5,121,3.2,22


In [None]:
if filtered_lawyers.empty:
    print("No lawyers match the provided criteriaa")
else:
    model = RandomForestRegressor(random_state=42)
    model.fit(filtered_lawyers[features], filtered_lawyers.index)

    filtered_lawyers.loc[:, "predicted_rank"] = model.predict(filtered_lawyers[features])
    filtered_lawyers['percentile_rank'] = (1 - filtered_lawyers['predicted_rank'].rank(pct=True)).round(2)
    filtered_lawyers.sort_values(by="predicted_rank", ascending=True, inplace=True)

    top_5_suggested_lawyers = filtered_lawyers.head(20)
    print("\nTop 5 Suggested Lawyers:  ")
    for i, row in top_5_suggested_lawyers.iterrows():
        print(f"Name: {row['name']}")
        print(f"Expertise: {row['expertise']}")
        print(f"Years of Experience: {row['years_of_exp']} years")
        print(f"Rating: {row['rating']}")
        print(f"Number of Documents Reviewed: {row['num_documents_reviewed']}")
        print(f"Average Response Time: {row['average_response_time']} hours")
        print(f"Number of Clients Served: {row['num_clients_served']}")
        print(f"Normalized Rank: {row['percentile_rank']:.2f}")
        print()



Top 5 Suggested Lawyers:
Name: Lawyer Z
Expertise: Real Estate
Years of Experience: 5 years
Rating: 5.0
Number of Documents Reviewed: 141
Average Response Time: 2.6 hours
Number of Clients Served: 71
Normalized Rank: 1.00

Name: Lawyer W
Expertise: Real Estate
Years of Experience: 1 years
Rating: 4.0
Number of Documents Reviewed: 63
Average Response Time: 2.6 hours
Number of Clients Served: 87
Normalized Rank: 0.99

Name: Lawyer B
Expertise: Real Estate
Years of Experience: 7 years
Rating: 4.1
Number of Documents Reviewed: 101
Average Response Time: 2.1 hours
Number of Clients Served: 84
Normalized Rank: 0.99

Name: Lawyer B
Expertise: Real Estate
Years of Experience: 3 years
Rating: 4.2
Number of Documents Reviewed: 87
Average Response Time: 2.9 hours
Number of Clients Served: 75
Normalized Rank: 0.98

Name: Lawyer Q
Expertise: Real Estate
Years of Experience: 8 years
Rating: 4.3
Number of Documents Reviewed: 75
Average Response Time: 2.9 hours
Number of Clients Served: 73
Normalized