In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('data\doctor_recommendation_system.csv') 

In [4]:
df.head()

Unnamed: 0,Doctor Name,Specialty,Hospital,Insurance Provider,Gender,Medical Condition,City,Address,Age,Years of Experience,Consultation Fees ($),Billing Amount ($),Rating,Number of Reviews
0,Dr. Patel,Dermatology,General Hospital,United Healthcare,Male,COPD,San Jose,123 Main St,68,36,472,7551,1.6,167
1,Dr. Martinez,Orthopedics,Health Clinic,Aetna,Male,Depression,Phoenix,404 Birch St,69,8,335,557,1.3,239
2,Dr. Williams,Psychiatry,Health Clinic,Aetna,Female,Cancer,Philadelphia,606 Walnut St,53,4,431,915,4.6,230
3,Dr. Martinez,Orthopedics,City Hospital,Aetna,Male,Arthritis,Phoenix,456 Elm St,28,24,417,9783,2.0,162
4,Dr. Smith,Endocrinology,Health Clinic,United Healthcare,Female,Heart Disease,San Jose,101 Oak St,37,2,65,2339,3.0,217


In [5]:
df.shape

(5000, 14)

### 2. Dataset Information

### 3. Data Checks to perform 
* Check Missing 
* Check Duplicates
* Check data type
* Check the number of unique values of each column
* Check statistics of dataset
* Check various categories present in the different categorical column

### 3.1 Check Missing Values 

In [6]:
df.isna().sum()

Doctor Name              0
Specialty                0
Hospital                 0
Insurance Provider       0
Gender                   0
Medical Condition        0
City                     0
Address                  0
Age                      0
Years of Experience      0
Consultation Fees ($)    0
Billing Amount ($)       0
Rating                   0
Number of Reviews        0
dtype: int64

There are no missing values in the dataset

### 3.2 Check Duplicates

In [7]:
print(df.duplicated().sum()) 

0


There are no duplicate values in the dataset

### 3.3 Check data types

In [8]:
# Check Null and Dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Doctor Name            5000 non-null   object 
 1   Specialty              5000 non-null   object 
 2   Hospital               5000 non-null   object 
 3   Insurance Provider     5000 non-null   object 
 4   Gender                 5000 non-null   object 
 5   Medical Condition      5000 non-null   object 
 6   City                   5000 non-null   object 
 7   Address                5000 non-null   object 
 8   Age                    5000 non-null   int64  
 9   Years of Experience    5000 non-null   int64  
 10  Consultation Fees ($)  5000 non-null   int64  
 11  Billing Amount ($)     5000 non-null   int64  
 12  Rating                 5000 non-null   float64
 13  Number of Reviews      5000 non-null   int64  
dtypes: float64(1), int64(5), object(8)
memory usage: 547.0+ 

### 3.4 Checking the number of unique values of each column

In [9]:
df.nunique()

Doctor Name                10
Specialty                  10
Hospital                    5
Insurance Provider          6
Gender                      2
Medical Condition          10
City                       10
Address                    10
Age                        51
Years of Experience        40
Consultation Fees ($)     451
Billing Amount ($)       3922
Rating                     41
Number of Reviews         501
dtype: int64

### 3.5 Check statistics of data set

In [10]:
df.describe()

Unnamed: 0,Age,Years of Experience,Consultation Fees ($),Billing Amount ($),Rating,Number of Reviews
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,44.7924,20.3912,274.3628,5040.9368,2.99138,252.4868
std,14.662711,11.64325,131.06698,2858.798044,1.151844,144.124246
min,20.0,1.0,50.0,103.0,1.0,0.0
25%,32.0,10.0,158.75,2550.75,2.0,128.0
50%,45.0,20.0,275.0,5057.0,3.0,253.0
75%,57.0,31.0,390.0,7487.5,4.0,377.0
max,70.0,40.0,500.0,10000.0,5.0,500.0


<b>3.6 Insight</b>
* From above description of numerical data, we can see that the mean age of doctors is 44,
  the mean years of experience in 20, consultation fees is approximately $270 and billing amount is $5000.
  

### Checking the data types

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Doctor Name            5000 non-null   object 
 1   Specialty              5000 non-null   object 
 2   Hospital               5000 non-null   object 
 3   Insurance Provider     5000 non-null   object 
 4   Gender                 5000 non-null   object 
 5   Medical Condition      5000 non-null   object 
 6   City                   5000 non-null   object 
 7   Address                5000 non-null   object 
 8   Age                    5000 non-null   int64  
 9   Years of Experience    5000 non-null   int64  
 10  Consultation Fees ($)  5000 non-null   int64  
 11  Billing Amount ($)     5000 non-null   int64  
 12  Rating                 5000 non-null   float64
 13  Number of Reviews      5000 non-null   int64  
dtypes: float64(1), int64(5), object(8)
memory usage: 547.0+ 

### 3.7 Exploring Data

In [12]:
print("Categories in specialty: ", end=" ")
print(df['Specialty'].unique())
print('\n')
print("Categories in hospital: ", end=" ")
print(df['Hospital'].unique())
print('\n')
print("Categories in hospital: ", end=" ")
print(df['Hospital'].unique())
print('\n')
print("Categories in insurance: ", end=" ")
print(df['Insurance Provider'].unique())
print('\n')
print("Categories in medical condition: ", end=" ")
print(df['Medical Condition'].unique())


Categories in specialty:  ['Dermatology' 'Orthopedics' 'Psychiatry' 'Endocrinology'
 'Gastroenterology' 'Oncology' 'Cardiology' 'Ophthalmology' 'Pediatrics'
 'Neurology']


Categories in hospital:  ['General Hospital' 'Health Clinic' 'City Hospital' 'Specialty Hospital'
 'Regional Medical Center']


Categories in hospital:  ['General Hospital' 'Health Clinic' 'City Hospital' 'Specialty Hospital'
 'Regional Medical Center']


Categories in insurance:  ['United Healthcare' 'Aetna' 'Medicaid' 'Cigna' 'Medicare' 'Blue Cross']


Categories in medical condition:  ['COPD' 'Depression' 'Cancer' 'Arthritis' 'Heart Disease' 'Diabetes'
 'Migraine' 'Hypertension' 'Asthma' 'Obesity']


In [13]:
# define numerical & categorical columns

numeric_features = [feature for feature in df.columns if df[feature].dtype !=0]
categorical_features = [feature for feature in df.columns if df[feature].dtype == '0']

# print columns
print('We have {} numerical features: {}'.format(len(numeric_features),numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))


We have 14 numerical features: ['Doctor Name', 'Specialty', 'Hospital', 'Insurance Provider', 'Gender', 'Medical Condition', 'City', 'Address', 'Age', 'Years of Experience', 'Consultation Fees ($)', 'Billing Amount ($)', 'Rating', 'Number of Reviews']

We have 0 categorical features : []


### 3.8 Adding columns for "Total Score" and "Average"

### 4. Exploring Data (Visualization)

### 4.1 Visualize average score distribution to make some conclusion

* Histogram
* Kernal Distribution Function (KDE)

### 4.1.1 Histogram & KDE