In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer

In [2]:
train = pd.read_excel('data/DoctorFee/Final_Train.xlsx')
test = pd.read_excel('data/DoctorFee/Final_Test.xlsx')

In [3]:
train.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees
0,"BHMS, MD - Homeopathy",24 years experience,100%,"Kakkanad, Ernakulam",Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100
1,"BAMS, MD - Ayurveda Medicine",12 years experience,98%,"Whitefield, Bangalore",Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350
2,"MBBS, MS - Otorhinolaryngology",9 years experience,,"Mathikere - BEL, Bangalore",ENT Specialist,,300
3,"BSc - Zoology, BAMS",12 years experience,,"Bannerghatta Road, Bangalore",Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250
4,BAMS,20 years experience,100%,"Keelkattalai, Chennai",Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250


In [4]:
round(train.isnull().sum()/len(train)*100,2)

Qualification          0.00
Experience             0.00
Rating                55.39
Place                  0.42
Profile                0.00
Miscellaneous_Info    43.95
Fees                   0.00
dtype: float64

In [5]:
round(test.isnull().sum()/len(test)*100,2)

Qualification          0.00
Experience             0.00
Rating                54.86
Place                  0.30
Profile                0.00
Miscellaneous_Info    41.97
dtype: float64

In [6]:
# Extract years of experience
train["Experience"] = train["Experience"].str.split()
train["Experience"] = train["Experience"].str[0].astype("int")


test["Experience"] = test["Experience"].str.split()
test["Experience"] = test["Experience"].str[0].astype("int")

In [7]:
# Extract cities and state
train["Place"].fillna("Unknown,Unknown",inplace=True)
train["Place"] = train["Place"].str.split(",")
train["City"] = train["Place"].str[-1]
train["Place"] = train["Place"].str[0]


test["Place"].fillna("Unknown,Unknown",inplace=True)
test["Place"] = test["Place"].str.split(",")
test["City"] = test["Place"].str[-1]
test["Place"] = test["Place"].str[0]

In [8]:
# Seperate Ratings into bins
train["Rating"].fillna("-99%",inplace=True)
train["Rating"] = train["Rating"].str[:-1].astype("int")

bins = [-99,0,10,20,30,40,50,60,70,80,90,100]
labels = [i for i in range(11)]
train["Rating"] = pd.cut(train["Rating"],bins=bins,labels=labels,include_lowest=True)


# Seperate Ratings into bins
test["Rating"].fillna("-99%",inplace=True)
test["Rating"] = test["Rating"].str[:-1].astype("int")

bins = [-99,0,10,20,30,40,50,60,70,80,90,100]
labels = [i for i in range(11)]
test["Rating"] = pd.cut(test["Rating"],bins=bins,labels=labels,include_lowest=True)

In [9]:
train["Rating"].value_counts().sort_index()

0     3302
1        1
2        0
3        0
4        4
5        3
6       19
7       32
8       98
9      280
10    2222
Name: Rating, dtype: int64

In [10]:
test["Rating"].value_counts().sort_index()

0     1090
1        0
2        0
3        0
4        0
5        0
6        9
7       14
8       27
9      108
10     739
Name: Rating, dtype: int64

In [11]:
# Extract relevant qualification
train["Qualification"]=train["Qualification"].str.split(",")
Qualification ={}
for x in train["Qualification"].values:
    for each in x:
        each = each.strip()
        if each in Qualification:
            Qualification[each]+=1
        else:
            Qualification[each]=1

In [12]:
train.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees,City
0,"[BHMS, MD - Homeopathy]",24,10,Kakkanad,Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100,Ernakulam
1,"[BAMS, MD - Ayurveda Medicine]",12,10,Whitefield,Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350,Bangalore
2,"[MBBS, MS - Otorhinolaryngology]",9,0,Mathikere - BEL,ENT Specialist,,300,Bangalore
3,"[BSc - Zoology, BAMS]",12,0,Bannerghatta Road,Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250,Bangalore
4,[BAMS],20,10,Keelkattalai,Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250,Chennai


### Pre-processing

In [13]:
#Identifying Top 10 used qualification
most_qua = sorted(Qualification.items(),key=lambda x:x[1],reverse=True)[:10]
final_qua =[]
for tup in most_qua:
    final_qua.append(tup[0])

In [14]:
for title in final_qua:
    train[title]=0
    
for x,y in zip(train["Qualification"].values,np.array([idx for idx in range(len(train))])):
    for q in x:
        q = q.strip()
        if q in final_qua:
            train[q][y] = 1
            
#df.drop("Qualification",axis=1,inplace=True)

In [15]:
train

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees,City,MBBS,BDS,BAMS,BHMS,MD - Dermatology,MS - ENT,Venereology & Leprosy,MD - General Medicine,Diploma in Otorhinolaryngology (DLO),MD - Homeopathy
0,"[BHMS, MD - Homeopathy]",24,10,Kakkanad,Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100,Ernakulam,0,0,0,1,0,0,0,0,0,1
1,"[BAMS, MD - Ayurveda Medicine]",12,10,Whitefield,Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350,Bangalore,0,0,1,0,0,0,0,0,0,0
2,"[MBBS, MS - Otorhinolaryngology]",9,0,Mathikere - BEL,ENT Specialist,,300,Bangalore,1,0,0,0,0,0,0,0,0,0
3,"[BSc - Zoology, BAMS]",12,0,Bannerghatta Road,Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250,Bangalore,0,0,1,0,0,0,0,0,0,0
4,[BAMS],20,10,Keelkattalai,Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250,Chennai,0,0,1,0,0,0,0,0,0,0
5,[BAMS],8,0,Porur,Ayurveda,,100,Chennai,0,0,1,0,0,0,0,0,0,0
6,[BHMS],42,0,Karol Bagh,Homeopath,,200,Delhi,0,0,0,1,0,0,0,0,0,0
7,[BDS],10,10,Arekere,Dentist,Dental Fillings Crowns and Bridges Fixing Impa...,200,Bangalore,0,1,0,0,0,0,0,0,0,0
8,"[MBBS, MD - General Medicine]",14,0,Old City,General Medicine,,100,Hyderabad,1,0,0,0,0,0,0,1,0,0
9,"[BSc, BDS]",23,0,Athani,Dentist,,100,Ernakulam,0,1,0,0,0,0,0,0,0,0


In [16]:
test["Qualification"]=test["Qualification"].str.split(",")
Qualification ={}
for x in test["Qualification"].values:
    for each in x:
        each = each.strip()
        if each in Qualification:
            Qualification[each]+=1
        else:
            Qualification[each]=1

In [17]:
most_qua = sorted(Qualification.items(),key=lambda x:x[1],reverse=True)[:10]
final_qua =[]
for tup in most_qua:
    final_qua.append(tup[0])

In [18]:
for title in final_qua:
    test[title]=0
    
for x,y in zip(test["Qualification"].values,np.array([idx for idx in range(len(test))])):
    for q in x:
        q = q.strip()
        if q in final_qua:
            test[q][y] = 1

In [19]:
test

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,City,MBBS,BDS,BHMS,BAMS,MD - Dermatology,MS - ENT,MD - General Medicine,Venereology & Leprosy,Diploma in Otorhinolaryngology (DLO),DDVL
0,[MBBS],35,0,Ghatkopar East,General Medicine,,Mumbai,1,0,0,0,0,0,0,0,0,0
1,"[MBBS, Diploma in Otorhinolaryngology (DLO)]",31,0,West Marredpally,ENT Specialist,,Hyderabad,1,0,0,0,0,0,0,0,1,0
2,"[MBBS, DDVL]",40,7,KK Nagar,Dermatologists,"70% 4 Feedback KK Nagar, Chennai",Chennai,1,0,0,0,0,0,0,0,0,1
3,[BAMS],0,0,New Ashok Nagar,Ayurveda,,Delhi,0,0,0,1,0,0,0,0,0,0
4,"[BDS, MDS - Conservative Dentistry & Endodont...",16,10,Kanakpura Road,Dentist,General Dentistry Conservative Dentistry Cosme...,Bangalore,0,1,0,0,0,0,0,0,0,0
5,"[BDS, MDS]",14,9,Velachery,Dentist,Acrylic Partial Denture Impaction / Impacted T...,Chennai,0,1,0,0,0,0,0,0,0,0
6,"[MBBS, Diploma in Otorhinolaryngology (DLO)]",23,10,Frazer Town,ENT Specialist,"94% 6 Feedback Frazer Town, Bangalore",Bangalore,1,0,0,0,0,0,0,0,1,0
7,"[BDS, MDS - Pedodontics]",9,10,Attapur,Dentist,RCT - Root Canal Treatment Ceramic Veneers / C...,Hyderabad,0,1,0,0,0,0,0,0,0,0
8,"[MD - Ayurveda Medicine, BAMS, Yoga Teachers...",11,10,Banashankari,Ayurveda,"99% 203 Feedback Banashankari, Bangalore",Bangalore,0,0,0,1,0,0,0,0,0,0
9,[BHMS],44,0,Mayur Vihar Ph-I,Homeopath,,Delhi,0,0,1,0,0,0,0,0,0,0


In [20]:
train.drop("Qualification",axis=1,inplace=True)
test.drop("Qualification",axis=1,inplace=True)

In [21]:
train["City"].value_counts()

 Bangalore             1258
 Mumbai                1219
 Delhi                 1185
 Hyderabad              951
 Chennai                855
 Coimbatore             228
 Ernakulam              153
 Thiruvananthapuram      86
Unknown                  25
e                         1
Name: City, dtype: int64

In [22]:
train["City"][3980] = "Unknown"
train["Place"][3980] = "Unknown"

In [23]:
test['City'].value_counts()

 Bangalore             420
 Delhi                 417
 Mumbai                389
 Hyderabad             333
 Chennai               287
 Coimbatore             70
 Ernakulam              48
 Thiruvananthapuram     17
Unknown                  6
Name: City, dtype: int64

In [24]:
# Get dummies
train = pd.get_dummies(train,columns=["City","Profile"],prefix=["City","Profile"])

In [25]:
# Get dummies
test = pd.get_dummies(test,columns=["City","Profile"],prefix=["City","Profile"])

In [26]:
train["Miscellaneous_Info"]

0                    100% 16 Feedback Kakkanad, Ernakulam
1                   98% 76 Feedback Whitefield, Bangalore
2                                                     NaN
3       Bannerghatta Road, Bangalore ₹250 Available on...
4                   100% 4 Feedback Keelkattalai, Chennai
5                                                     NaN
6                                                     NaN
7       Dental Fillings Crowns and Bridges Fixing Impa...
8                                                     NaN
9                                                     NaN
10                                                    NaN
11                                                    NaN
12      98% 14 Feedback Coimbatore Racecourse, Coimbatore
13          Dental Crowns Facet Dental Dental prophylaxis
14                                                    NaN
15                                                    NaN
16                                                    NaN
17            

In [27]:
train.drop("Miscellaneous_Info",axis=1,inplace=True)

In [28]:
train.head()

Unnamed: 0,Experience,Rating,Place,Fees,MBBS,BDS,BAMS,BHMS,MD - Dermatology,MS - ENT,...,City_ Hyderabad,City_ Mumbai,City_ Thiruvananthapuram,City_Unknown,Profile_Ayurveda,Profile_Dentist,Profile_Dermatologists,Profile_ENT Specialist,Profile_General Medicine,Profile_Homeopath
0,24,10,Kakkanad,100,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,12,10,Whitefield,350,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,9,0,Mathikere - BEL,300,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,12,0,Bannerghatta Road,250,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,20,10,Keelkattalai,250,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [29]:
test.drop("Miscellaneous_Info",axis=1,inplace=True)

In [30]:
test.head()

Unnamed: 0,Experience,Rating,Place,MBBS,BDS,BHMS,BAMS,MD - Dermatology,MS - ENT,MD - General Medicine,...,City_ Hyderabad,City_ Mumbai,City_ Thiruvananthapuram,City_Unknown,Profile_Ayurveda,Profile_Dentist,Profile_Dermatologists,Profile_ENT Specialist,Profile_General Medicine,Profile_Homeopath
0,35,0,Ghatkopar East,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,31,0,West Marredpally,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,40,7,KK Nagar,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,New Ashok Nagar,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,16,10,Kanakpura Road,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [56]:
X = train.drop("Fees",axis=1)
y = train["Fees"]

# Encoding
enc = OrdinalEncoder()
X = enc.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

# feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X)

### Supported Vector Machine

In [33]:
def score(y_pred,y):
    y_pred = np.log(y_pred)
    y = np.log(y)
    return 1 - ((np.sum((y_pred-y)**2))/len(y))**1/2

In [34]:
# Define own scorer
scorer = make_scorer(score,greater_is_better=True)

In [35]:
# support vector machine 
from sklearn.svm import SVR
m = SVR(gamma="scale")
m.fit(scaler.transform(X_train),y)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [57]:
# Prediction
y_pred = m.predict(scaler.transform(X_test))
score(y_pred,y_test)

0.7703987768124592

#### Predicting Test values and need to submit

In [59]:
# Encoding
enc = OrdinalEncoder()
X = enc.fit_transform(test)

# feature scaling
scaler = StandardScaler()
X_act_test = scaler.fit_transform(X)

In [61]:
y_test_pred = m.predict(scaler.transform(X_act_test))
print(y_test_pred)

[281.35798686 292.31217896 268.62781962 ... 268.68898028 248.58821208
 294.02063843]
