In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [23]:
# here I am reading the CSV fiel and converting it to dataframe of pandas for ease of use
df = pd.read_csv("city-wise-govt-clinic.csv")
print(df.shape)
df.head()

(498, 10)


Unnamed: 0,CityCode,ClinicCenterCode,Category,DoctorCount,Latitude,Longitude,ClinicName,CityName,ClinicNumber,ClinicCenterAddress
0,10,AH01,Allopathy,4,23.035601,72.56946002,,Ahmedabad,079-26577393,"1st Floor, S.R.House, Opp. Mount Carmel School..."
1,10,AH02,Allopathy,2,23.012685,72.523626\n ...,,Ahmedabad,079-25693262,"Room no.1,2 and 3 (Ground floor), Aayakar Bhaw..."
2,10,AH04,Allopathy,4,23.067056,72.555033\n ...,,Ahmedabad,079-27478393,"Govt Qtr No. M/49/289-290, Near sindur party p..."
3,10,AH06,Allopathy,0,23.035601,72.56946002,,Ahmedabad,079-26577393,"3rd Floor, S.R.House, Opp. Mount Carmel Schoo..."
4,10,AH09,Allopathy,3,22.998966,72.601363,,Ahmedabad,079-25467055,"2nd Floor, Business Square, Opp. Punjab Nation..."


In [24]:
df.replace(['*', 'NA', 'null', 'NULL', 'NaN'], np.nan, inplace=True)
# replace this with null value for proper cleaning

In [25]:
df.isnull().sum()

# here we can see no ClinicName is present in any column so we will delete that useless column,
# very few latitutde and lngitude are missing for now the lapse can be ignored because we have a city colum,n
# we delete every row where ClinicNumber is missing beacuse it seems the uniquely identifing column i.e. similar to Primary Key,
# some rows also have missing address or number we shall let it be ,
# But if any row misses both address and contact number, we will remove them too
# Many might overlook but some clinics have 0 doctors we shall remove them too

Unnamed: 0,0
CityCode,0
ClinicCenterCode,8
Category,0
DoctorCount,0
Latitude,30
Longitude,15
ClinicName,498
CityName,0
ClinicNumber,17
ClinicCenterAddress,9


In [26]:
df.drop(columns=['ClinicName'], inplace=True)

In [27]:
# drop rows where
df = df.dropna(subset=['ClinicCenterCode'])

In [28]:
# Drop rows where both are missing
df = df.dropna(subset=['ClinicNumber', 'ClinicCenterAddress'], how='all')

In [29]:
# Remove rows where DoctorCount is 0
df = df[df['DoctorCount'] != 0].copy()

In [30]:
df.head()

Unnamed: 0,CityCode,ClinicCenterCode,Category,DoctorCount,Latitude,Longitude,CityName,ClinicNumber,ClinicCenterAddress
0,10,AH01,Allopathy,4,23.035601,72.56946002,Ahmedabad,079-26577393,"1st Floor, S.R.House, Opp. Mount Carmel School..."
1,10,AH02,Allopathy,2,23.012685,72.523626\n ...,Ahmedabad,079-25693262,"Room no.1,2 and 3 (Ground floor), Aayakar Bhaw..."
2,10,AH04,Allopathy,4,23.067056,72.555033\n ...,Ahmedabad,079-27478393,"Govt Qtr No. M/49/289-290, Near sindur party p..."
4,10,AH09,Allopathy,3,22.998966,72.601363,Ahmedabad,079-25467055,"2nd Floor, Business Square, Opp. Punjab Nation..."
5,10,VDA,Allopathy,2,22.302678,73.224506\n ...,Ahmedabad,0265-2415555,"Panigate Telephone exchange Compound, Ground f..."


In [31]:
# Convert numeric columns for better understanding of the ChatBot
df['CityCode'] = pd.to_numeric(df['CityCode'], errors='coerce').astype('Int64')
df['DoctorCount'] = pd.to_numeric(df['DoctorCount'], errors='coerce').astype('Int64')
df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
df['Longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')

In [32]:
# Standardize text
df.loc[:, 'Category'] = df['Category'].str.strip().str.title()
df.loc[:, 'CityName'] = df['CityName'].str.strip().str.title()
df.loc[:, 'ClinicCenterAddress'] = df['ClinicCenterAddress'].str.strip()

In [33]:
# Encoding Category Clomuns

encoder = LabelEncoder()

# Encode category
df['Category_Code'] = encoder.fit_transform(df['Category'])

# Save mapping for reference
category_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("Category Mapping:", category_mapping)

Category Mapping: {'Allopathy': np.int64(0), 'Ayurveda': np.int64(1), 'Homeopathy': np.int64(2), 'Siddha': np.int64(3), 'Unani': np.int64(4)}


In [34]:

df.drop(columns=['Category'], inplace=True)
df.head()

Unnamed: 0,CityCode,ClinicCenterCode,DoctorCount,Latitude,Longitude,CityName,ClinicNumber,ClinicCenterAddress,Category_Code
0,10,AH01,4,23.035601,72.56946,Ahmedabad,079-26577393,"1st Floor, S.R.House, Opp. Mount Carmel School...",0
1,10,AH02,2,23.012685,72.523626,Ahmedabad,079-25693262,"Room no.1,2 and 3 (Ground floor), Aayakar Bhaw...",0
2,10,AH04,4,23.067056,72.555033,Ahmedabad,079-27478393,"Govt Qtr No. M/49/289-290, Near sindur party p...",0
4,10,AH09,3,22.998966,72.601363,Ahmedabad,079-25467055,"2nd Floor, Business Square, Opp. Punjab Nation...",0
5,10,VDA,2,22.302678,73.224506,Ahmedabad,0265-2415555,"Panigate Telephone exchange Compound, Ground f...",0


In [37]:
# Save cleaned dataset
df.to_csv("cleandata.csv", index=False)

In [38]:
'''
Furthermore, we can use the latitude and longitude to determine the locality of the clinic in the city,
with this latitude and longitude the chabto will also be able to suggest the nearest government clinic in the city.
'''

'\nFurthermore, we can use the latitude and longitude to determine the locality of the clinic in the city,\nwith this latitude and longitude the chabto will also be able to suggest the nearest government clinic in the city.\n'