In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df_train = pd.read_excel('Final_Train.xlsx')
df_train.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees
0,"BHMS, MD - Homeopathy",24 years experience,100%,"Kakkanad, Ernakulam",Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100
1,"BAMS, MD - Ayurveda Medicine",12 years experience,98%,"Whitefield, Bangalore",Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350
2,"MBBS, MS - Otorhinolaryngology",9 years experience,,"Mathikere - BEL, Bangalore",ENT Specialist,,300
3,"BSc - Zoology, BAMS",12 years experience,,"Bannerghatta Road, Bangalore",Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250
4,BAMS,20 years experience,100%,"Keelkattalai, Chennai",Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250


In [3]:
np.shape(df_train)

(5961, 7)

In [4]:
df_test = pd.read_excel('Final_Test.xlsx')
np.shape(df_test)

(1987, 6)

In [5]:
# df_test['Len']  = df_test['Qualification'].apply(lambda x : len(x))
# result = df_test.sort_values(['Len'], ascending = False)
# result.iloc[0]['Qualification']

In [6]:
df_train['City'] = df_train['Place'].str.split(', ').str[-1]
df_train['Area'] = df_train['Place'].str.split(', ').str[0]


df_test['City'] = df_test['Place'].str.split(', ').str[-1]
df_test['Area'] = df_test['Place'].str.split(', ').str[0]

df_train.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees,City,Area
0,"BHMS, MD - Homeopathy",24 years experience,100%,"Kakkanad, Ernakulam",Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100,Ernakulam,Kakkanad
1,"BAMS, MD - Ayurveda Medicine",12 years experience,98%,"Whitefield, Bangalore",Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350,Bangalore,Whitefield
2,"MBBS, MS - Otorhinolaryngology",9 years experience,,"Mathikere - BEL, Bangalore",ENT Specialist,,300,Bangalore,Mathikere - BEL
3,"BSc - Zoology, BAMS",12 years experience,,"Bannerghatta Road, Bangalore",Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250,Bangalore,Bannerghatta Road
4,BAMS,20 years experience,100%,"Keelkattalai, Chennai",Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250,Chennai,Keelkattalai


In [7]:
count = df_train.groupby(['City']).count()
count.Qualification

City
Bangalore             1258
Chennai                855
Coimbatore             228
Delhi                 1185
Ernakulam              153
Hyderabad              951
Mumbai                1219
Thiruvananthapuram      86
e                        1
Name: Qualification, dtype: int64

In [8]:
val = df_train[df_train['City'] == 'e'].index
val

Int64Index([3980], dtype='int64')

In [9]:
df_train = df_train.drop([3980])

In [10]:
df_train['Experience'] = df_train['Experience'].apply(lambda x : re.findall(r'^\d\d?', x)[0]).astype(int)

df_test['Experience'] = df_test['Experience'].apply(lambda x : re.findall(r'^\d\d?', x)[0]).astype(int)

In [11]:
df_train['Rating'].fillna('0%',inplace = True)
df_train['Rating'] = df_train['Rating'].str[0:-1].astype(float)

df_test['Rating'].fillna('0%',inplace = True)
df_test['Rating'] = df_test['Rating'].str[0:-1].astype(float)

df_train.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees,City,Area
0,"BHMS, MD - Homeopathy",24,100.0,"Kakkanad, Ernakulam",Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100,Ernakulam,Kakkanad
1,"BAMS, MD - Ayurveda Medicine",12,98.0,"Whitefield, Bangalore",Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350,Bangalore,Whitefield
2,"MBBS, MS - Otorhinolaryngology",9,0.0,"Mathikere - BEL, Bangalore",ENT Specialist,,300,Bangalore,Mathikere - BEL
3,"BSc - Zoology, BAMS",12,0.0,"Bannerghatta Road, Bangalore",Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250,Bangalore,Bannerghatta Road
4,BAMS,20,100.0,"Keelkattalai, Chennai",Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250,Chennai,Keelkattalai


In [12]:
df_train['Profile'].unique()

array(['Homeopath', 'Ayurveda', 'ENT Specialist', 'Dentist',
       'General Medicine', 'Dermatologists'], dtype=object)

In [13]:
temp = pd.get_dummies(df_train[['Profile', 'City']], prefix_sep = '_', prefix = ['Profile', 'City'])
df_train = pd.concat([df_train, temp], axis=1, join = 'inner')
df_train.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees,City,Area,Profile_Ayurveda,...,Profile_General Medicine,Profile_Homeopath,City_Bangalore,City_Chennai,City_Coimbatore,City_Delhi,City_Ernakulam,City_Hyderabad,City_Mumbai,City_Thiruvananthapuram
0,"BHMS, MD - Homeopathy",24,100.0,"Kakkanad, Ernakulam",Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100,Ernakulam,Kakkanad,0,...,0,1,0,0,0,0,1,0,0,0
1,"BAMS, MD - Ayurveda Medicine",12,98.0,"Whitefield, Bangalore",Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350,Bangalore,Whitefield,1,...,0,0,1,0,0,0,0,0,0,0
2,"MBBS, MS - Otorhinolaryngology",9,0.0,"Mathikere - BEL, Bangalore",ENT Specialist,,300,Bangalore,Mathikere - BEL,0,...,0,0,1,0,0,0,0,0,0,0
3,"BSc - Zoology, BAMS",12,0.0,"Bannerghatta Road, Bangalore",Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250,Bangalore,Bannerghatta Road,1,...,0,0,1,0,0,0,0,0,0,0
4,BAMS,20,100.0,"Keelkattalai, Chennai",Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250,Chennai,Keelkattalai,1,...,0,0,0,1,0,0,0,0,0,0


In [14]:
temp = pd.get_dummies(df_test[['Profile', 'City']], prefix_sep = '_', prefix = ['Profile', 'City'])
df_test = pd.concat([df_test, temp], axis=1, join = 'inner')
df_test.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,City,Area,Profile_Ayurveda,Profile_Dentist,...,Profile_General Medicine,Profile_Homeopath,City_Bangalore,City_Chennai,City_Coimbatore,City_Delhi,City_Ernakulam,City_Hyderabad,City_Mumbai,City_Thiruvananthapuram
0,MBBS,35,0.0,"Ghatkopar East, Mumbai",General Medicine,,Mumbai,Ghatkopar East,0,0,...,1,0,0,0,0,0,0,0,1,0
1,"MBBS, Diploma in Otorhinolaryngology (DLO)",31,0.0,"West Marredpally, Hyderabad",ENT Specialist,,Hyderabad,West Marredpally,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"MBBS, DDVL",40,70.0,"KK Nagar, Chennai",Dermatologists,"70% 4 Feedback KK Nagar, Chennai",Chennai,KK Nagar,0,0,...,0,0,0,1,0,0,0,0,0,0
3,BAMS,0,0.0,"New Ashok Nagar, Delhi",Ayurveda,,Delhi,New Ashok Nagar,1,0,...,0,0,0,0,0,1,0,0,0,0
4,"BDS, MDS - Conservative Dentistry & Endodontics",16,100.0,"Kanakpura Road, Bangalore",Dentist,General Dentistry Conservative Dentistry Cosme...,Bangalore,Kanakpura Road,0,1,...,0,0,1,0,0,0,0,0,0,0


In [15]:
def sortQual(text):
    arr = re.sub(r'\(.*?\)', lambda x: x.group().replace(",","-"), text) # to replace ',' with '-' inside brackets only
    return (sorted(arr.lower().replace(" ","").split(",")))

In [16]:
df_train['Qualification'] = df_train['Qualification'].apply(lambda x : sortQual(x))

In [17]:
df_test['Qualification'] = df_test['Qualification'].apply(lambda x : sortQual(x))

In [18]:
df_test.iloc[1287]['Qualification']

['advancedendotonticcourseonrootcanal',
 'advancedretreatmentcourseinrct',
 'bds',
 'certificateincosmeticdentistry',
 'certificationinprosthodontics&periodontology',
 'certifiedadvancedcourseinwisdomtoothextraction',
 'certifiedcourseinoralsurgery',
 'diplomaincosmeticdentistry',
 'endodonticsandaestheticdentistry',
 'fellowofacademyofgeneraleducation(fage)',
 'fellowshipinaestheticdentistry(fad)',
 'fellowshipinclinicalcosmetology',
 'mba-hospitalmanagement',
 'mida',
 'pgdiplomainclinicalcosmetology(pgdcc)',
 'pgdiplomainconservative',
 'postgraduatecertificateinendodontics(pgce)']

In [19]:
qual_dict = {}

train_quals = []
test_quals = []

train_quals.append(list(df_train['Qualification'].values))
test_quals.append(list(df_test['Qualification'].values))

for quals in train_quals[0]:
    for item in quals:
        if item in qual_dict:
            qual_dict[item] += 1
        else:
            qual_dict[item] = 1
            
for quals in test_quals[0]:
    for item in quals:
        if item in qual_dict:
            qual_dict[item] += 1
        else:
            qual_dict[item] = 1
            
# qual_dict['Empty'] = 0
    

In [20]:
qual_df = pd.DataFrame.from_dict(dict(sorted(qual_dict.items(), key = lambda x : x[1] , reverse = True)), orient = 'index').reset_index()
qual_df.columns = ['Qualification', 'Count']
qual_df.tail()

Unnamed: 0,Qualification,Count
890,fellowshipindiabetesmanagement,1
891,frcp-internalmedicine,1
892,fais,1
893,fiages,1
894,md/ms-obstetrics&gynaecology,1


In [21]:
x = qual_df[qual_df['Qualification'] == '39yearsexperience'].index
qual_df = qual_df.drop(x)

In [22]:
qual_df["Code"] = qual_df["Qualification"].astype('category').cat.codes
qual_df.head()

Unnamed: 0,Qualification,Count,Code
0,mbbs,3788,579
1,bds,1790,31
2,bams,1007,28
3,bhms,998,33
4,md-dermatology,802,615


In [23]:
# scaler = StandardScaler()
# qual_df['Code'] = scaler.fit_transform(qual_df['Code'].values.reshape(-1,1))

In [24]:
qual_df.tail()

Unnamed: 0,Qualification,Count,Code
890,fellowshipindiabetesmanagement,1,424
891,frcp-internalmedicine,1,500
892,fais,1,364
893,fiages,1,482
894,md/ms-obstetrics&gynaecology,1,660


In [25]:
conversion_dict = dict(zip(qual_df['Qualification'], qual_df['Code']))
conversion_dict['39yearsexperience'] = -1

In [26]:
stat_test = df_test.Qualification.apply(lambda x: len(x))
stat_test.idxmax()
df_test[df_test['Qualification'].str.len()>10]['Qualification']
stat_test.idxmax()
df_test.iloc[1287]['Qualification']

['advancedendotonticcourseonrootcanal',
 'advancedretreatmentcourseinrct',
 'bds',
 'certificateincosmeticdentistry',
 'certificationinprosthodontics&periodontology',
 'certifiedadvancedcourseinwisdomtoothextraction',
 'certifiedcourseinoralsurgery',
 'diplomaincosmeticdentistry',
 'endodonticsandaestheticdentistry',
 'fellowofacademyofgeneraleducation(fage)',
 'fellowshipinaestheticdentistry(fad)',
 'fellowshipinclinicalcosmetology',
 'mba-hospitalmanagement',
 'mida',
 'pgdiplomainclinicalcosmetology(pgdcc)',
 'pgdiplomainconservative',
 'postgraduatecertificateinendodontics(pgce)']

In [27]:
stat_train = df_train.Qualification.apply(lambda x: len(x))
stat_train.max()

10

In [28]:
train_quals = []
for i in range(stat_train.max()):
    col_name = 'Qual_' + str(i+1)
    train_quals.append(col_name)
    col_i = df_train['Qualification'].str[i]
    df_train[col_name] = col_i
#     df_train[col_name] = df_train[col_name].fillna("Empty")
    
    
    
    
df_train.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees,City,Area,Profile_Ayurveda,...,Qual_1,Qual_2,Qual_3,Qual_4,Qual_5,Qual_6,Qual_7,Qual_8,Qual_9,Qual_10
0,"[bhms, md-homeopathy]",24,100.0,"Kakkanad, Ernakulam",Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100,Ernakulam,Kakkanad,0,...,bhms,md-homeopathy,,,,,,,,
1,"[bams, md-ayurvedamedicine]",12,98.0,"Whitefield, Bangalore",Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350,Bangalore,Whitefield,1,...,bams,md-ayurvedamedicine,,,,,,,,
2,"[mbbs, ms-otorhinolaryngology]",9,0.0,"Mathikere - BEL, Bangalore",ENT Specialist,,300,Bangalore,Mathikere - BEL,0,...,mbbs,ms-otorhinolaryngology,,,,,,,,
3,"[bams, bsc-zoology]",12,0.0,"Bannerghatta Road, Bangalore",Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250,Bangalore,Bannerghatta Road,1,...,bams,bsc-zoology,,,,,,,,
4,[bams],20,100.0,"Keelkattalai, Chennai",Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250,Chennai,Keelkattalai,1,...,bams,,,,,,,,,


In [29]:
test_quals = []
for j in range(stat_test.max()):
    col_name = 'Qual_' + str(j+1)
    test_quals.append(col_name)
    col_j = df_test['Qualification'].str[j]
    df_test[col_name] = col_j
#     df_test[col_name] = df_test[col_name].fillna('Empty')
    
df_test

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,City,Area,Profile_Ayurveda,Profile_Dentist,...,Qual_8,Qual_9,Qual_10,Qual_11,Qual_12,Qual_13,Qual_14,Qual_15,Qual_16,Qual_17
0,[mbbs],35,0.0,"Ghatkopar East, Mumbai",General Medicine,,Mumbai,Ghatkopar East,0,0,...,,,,,,,,,,
1,"[diplomainotorhinolaryngology(dlo), mbbs]",31,0.0,"West Marredpally, Hyderabad",ENT Specialist,,Hyderabad,West Marredpally,0,0,...,,,,,,,,,,
2,"[ddvl, mbbs]",40,70.0,"KK Nagar, Chennai",Dermatologists,"70% 4 Feedback KK Nagar, Chennai",Chennai,KK Nagar,0,0,...,,,,,,,,,,
3,[bams],0,0.0,"New Ashok Nagar, Delhi",Ayurveda,,Delhi,New Ashok Nagar,1,0,...,,,,,,,,,,
4,"[bds, mds-conservativedentistry&endodontics]",16,100.0,"Kanakpura Road, Bangalore",Dentist,General Dentistry Conservative Dentistry Cosme...,Bangalore,Kanakpura Road,0,1,...,,,,,,,,,,
5,"[bds, mds]",14,90.0,"Velachery, Chennai",Dentist,Acrylic Partial Denture Impaction / Impacted T...,Chennai,Velachery,0,1,...,,,,,,,,,,
6,"[diplomainotorhinolaryngology(dlo), mbbs]",23,94.0,"Frazer Town, Bangalore",ENT Specialist,"94% 6 Feedback Frazer Town, Bangalore",Bangalore,Frazer Town,0,0,...,,,,,,,,,,
7,"[bds, mds-pedodontics]",9,94.0,"Attapur, Hyderabad",Dentist,RCT - Root Canal Treatment Ceramic Veneers / C...,Hyderabad,Attapur,0,1,...,,,,,,,,,,
8,"[bams, m.d.inkayachikista, md-ayurvedamedicine...",11,99.0,"Banashankari, Bangalore",Ayurveda,"99% 203 Feedback Banashankari, Bangalore",Bangalore,Banashankari,1,0,...,,,,,,,,,,
9,[bhms],44,0.0,"Mayur Vihar Ph-I, Delhi",Homeopath,,Delhi,Mayur Vihar Ph-I,0,0,...,,,,,,,,,,


In [30]:
for i in train_quals:
    df_train.replace({i: conversion_dict}, inplace=True)
    df_train[i] = df_train[i].fillna(-1)

In [31]:
for i in test_quals:
    df_test.replace({i: conversion_dict}, inplace=True)
    df_test[i] = df_test[i].fillna(-1)

In [32]:
df_train.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees,City,Area,Profile_Ayurveda,...,Qual_1,Qual_2,Qual_3,Qual_4,Qual_5,Qual_6,Qual_7,Qual_8,Qual_9,Qual_10
0,"[bhms, md-homeopathy]",24,100.0,"Kakkanad, Ernakulam",Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100,Ernakulam,Kakkanad,0,...,33,623,-1,-1,-1,-1,-1,-1,-1,-1
1,"[bams, md-ayurvedamedicine]",12,98.0,"Whitefield, Bangalore",Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350,Bangalore,Whitefield,1,...,28,605,-1,-1,-1,-1,-1,-1,-1,-1
2,"[mbbs, ms-otorhinolaryngology]",9,0.0,"Mathikere - BEL, Bangalore",ENT Specialist,,300,Bangalore,Mathikere - BEL,0,...,579,739,-1,-1,-1,-1,-1,-1,-1,-1
3,"[bams, bsc-zoology]",12,0.0,"Bannerghatta Road, Bangalore",Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250,Bangalore,Bannerghatta Road,1,...,28,40,-1,-1,-1,-1,-1,-1,-1,-1
4,[bams],20,100.0,"Keelkattalai, Chennai",Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250,Chennai,Keelkattalai,1,...,28,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [33]:
df_train['Area'].unique()[:10]

array(['Kakkanad', 'Whitefield', 'Mathikere - BEL', 'Bannerghatta Road',
       'Keelkattalai', 'Porur', 'Karol Bagh', 'Arekere', 'Old City',
       'Athani'], dtype=object)

In [34]:
len(df_test['Area'].unique())

590

In [35]:
area_list = []
set1 = set(df_train['Area'].unique())
set2 = set(df_test['Area'].unique())
temp = list(set1.union(set2))
temp = temp[1:]

In [36]:
area_df = pd.DataFrame(temp, columns = ['Area'])
area_df['Codes'] = area_df['Area'].astype('category').cat.codes
# scaler = StandardScaler()
# area_df['Codes'] = scaler.fit_transform(area_df['Codes'].values.reshape(-1,1))
area_codes_dict = dict(zip(area_df['Area'], area_df['Codes']))

In [37]:
df_train.replace({'Area': area_codes_dict}, inplace=True)
df_test.replace({'Area': area_codes_dict}, inplace=True)

df_train['Area'] = df_train['Area'].fillna(-1)
df_test['Area'] = df_test['Area'].fillna(-1)

In [38]:
df_train.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees,City,Area,Profile_Ayurveda,...,Qual_1,Qual_2,Qual_3,Qual_4,Qual_5,Qual_6,Qual_7,Qual_8,Qual_9,Qual_10
0,"[bhms, md-homeopathy]",24,100.0,"Kakkanad, Ernakulam",Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100,Ernakulam,324.0,0,...,33,623,-1,-1,-1,-1,-1,-1,-1,-1
1,"[bams, md-ayurvedamedicine]",12,98.0,"Whitefield, Bangalore",Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350,Bangalore,925.0,1,...,28,605,-1,-1,-1,-1,-1,-1,-1,-1
2,"[mbbs, ms-otorhinolaryngology]",9,0.0,"Mathikere - BEL, Bangalore",ENT Specialist,,300,Bangalore,494.0,0,...,579,739,-1,-1,-1,-1,-1,-1,-1,-1
3,"[bams, bsc-zoology]",12,0.0,"Bannerghatta Road, Bangalore",Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250,Bangalore,74.0,1,...,28,40,-1,-1,-1,-1,-1,-1,-1,-1
4,[bams],20,100.0,"Keelkattalai, Chennai",Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250,Chennai,367.0,1,...,28,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [39]:
df_train['Reviewers'] = df_train['Miscellaneous_Info'].str.split('% ').str[1].str.split(' ').str[0]
df_train['Reviewers'] = df_train['Reviewers'].fillna(0)
df_train['Rev_Len'] = df_train['Reviewers'].str.len()
df_train.loc[df_train['Rev_Len']>3, 'Reviewers'] = 0
df_train.loc[df_train['Reviewers']==',', 'Reviewers'] = 0
df_train['Reviewers'] = df_train['Reviewers'].astype(int)
# scaler = StandardScaler()
# df_train['Reviewers'] = scaler.fit_transform(df_train['Reviewers'].values.reshape(-1,1))
df_train['Reviewers'].head()

0    16
1    76
2     0
3     0
4     4
Name: Reviewers, dtype: int32

In [40]:
df_test['Reviewers'] = df_test['Miscellaneous_Info'].str.split('% ').str[1].str.split(' ').str[0]
df_test['Reviewers'] = df_test['Reviewers'].fillna(0)
df_test['Rev_Len'] = df_test['Reviewers'].str.len()
df_test.loc[df_test['Rev_Len']>3, 'Reviewers'] = 0
df_test.loc[df_test['Reviewers']==',', 'Reviewers'] = 0
# df_test.loc[df_test['Reviewers']== None, 'Reviewers'] = 0
df_test['Reviewers'] = df_test['Reviewers'].astype(int)
# df_test['Reviewers'] = scaler.transform(df_test['Reviewers'].values.reshape(-1,1))
df_test['Reviewers'].head()

0    0
1    0
2    4
3    0
4    0
Name: Reviewers, dtype: int32

In [41]:
selected_cols = [i for i in df_train.columns if i not in ['Qualification', 'Place', 'Miscellaneous_Info', 'City', 'Fees', 'Profile', 'Rev_Len', 'Experience', 'Rating']]

In [42]:
from sklearn.model_selection import train_test_split
X = df_train[selected_cols]
y = df_train['Fees']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [43]:
# from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

## Using SVR 

### First splitting the training set and running a mock analysis on that and then training df_train and predicting from data of df_test 

In [44]:
from sklearn.svm import SVR
reg = SVR(gamma = 0.1)
reg.fit(X_train,y_train)
ans = reg.predict(scaler.transform(X_test))


In [45]:
def score(y_pred,y):
    y_pred = np.log(y_pred)
    y = np.log(y)
    return 1 - ((np.sum((y_pred-y)**2))/len(y))**1/2

In [46]:
score(ans,y_test)

0.7806934326272256

In [47]:
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
scorer = make_scorer(score,greater_is_better=True)

In [48]:
from sklearn.model_selection import GridSearchCV

parameters = {"C":[0.1,1,10],"kernel":["linear","rbf","poly"]}
reg2 = GridSearchCV(reg,param_grid=parameters,scoring=scorer, n_jobs=-1,cv=4)

In [49]:
reg2.fit(X_train,y_train)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma=0.1, kernel='rbf', max_iter=-1,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.1, 1, 10],
                         'kernel': ['linear', 'rbf', 'poly']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(score), verbose=0)

In [50]:
ans2 = reg2.predict(scaler.transform(X_test))

In [51]:
score(ans2, y_test)

0.7974440980036446

In [52]:
df_test.columns

Index(['Qualification', 'Experience', 'Rating', 'Place', 'Profile',
       'Miscellaneous_Info', 'City', 'Area', 'Profile_Ayurveda',
       'Profile_Dentist', 'Profile_Dermatologists', 'Profile_ENT Specialist',
       'Profile_General Medicine', 'Profile_Homeopath', 'City_Bangalore',
       'City_Chennai', 'City_Coimbatore', 'City_Delhi', 'City_Ernakulam',
       'City_Hyderabad', 'City_Mumbai', 'City_Thiruvananthapuram', 'Qual_1',
       'Qual_2', 'Qual_3', 'Qual_4', 'Qual_5', 'Qual_6', 'Qual_7', 'Qual_8',
       'Qual_9', 'Qual_10', 'Qual_11', 'Qual_12', 'Qual_13', 'Qual_14',
       'Qual_15', 'Qual_16', 'Qual_17', 'Reviewers', 'Rev_Len'],
      dtype='object')

In [73]:
from sklearn.model_selection import GridSearchCV

scaler = StandardScaler()
X = scaler.fit_transform(df_train[selected_cols])
y = df_train['Fees']
X_test = scaler.transform(df_test[selected_cols])

reg3 = SVR(gamma = 0.1)
# reg3.fit(X_train,y_train)

parameters = {"C":[0.1,1,10],"kernel":["linear","rbf","poly"]}
reg4 = GridSearchCV(reg3,param_grid=parameters,scoring='r2', n_jobs=-1,cv=4)
reg4.fit(X,y)
ans = reg4.predict(X_test)

In [74]:
ansX = reg4.predict(X)
print("Training data score {}".format(score(y, ansX)))

Training data score 0.8083753797991509


## Predicted fees with SVR

In [75]:
s = pd.Series(ans)
s.values[:10]

array([198.53460804, 282.04034274, 295.30976087, 263.15395655,
       217.82807676, 200.4293666 , 366.12052028, 200.71236587,
       285.44985127, 295.82539138])

## Using KNN Regressor

### First splitting the training set and running a mock analysis on that and then training df_train and predicting from data of df_test 

In [56]:
from sklearn.model_selection import train_test_split
X = df_train[selected_cols]
y = df_train['Fees']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

poly = PolynomialFeatures(2)

In [57]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_train_2 = poly.fit_transform(X_train)

In [58]:
neigh = KNeighborsRegressor(n_neighbors=20)
neigh.fit(X_train_2, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                    weights='uniform')

In [59]:
X_test = scaler.transform(X_test)
X_test_2 = poly.transform(X_test)
ans = neigh.predict(X_test_2)

In [61]:
score(ans,y_test)

0.7864151768622009

In [62]:
# from sklearn.model_selection import GridSearchCV

scaler = StandardScaler()
X = scaler.fit_transform(df_train[selected_cols])
y = df_train['Fees']
X_test = scaler.transform(df_test[selected_cols])

neigh = KNeighborsRegressor(n_neighbors=20)
neigh.fit(X, y)

ans = neigh.predict(X)

In [63]:
score(ans,y)

0.8097553069684934

In [64]:
from sklearn.model_selection import train_test_split
X = df_train[selected_cols]
y = df_train['Fees']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

poly = PolynomialFeatures(3)

In [65]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_train_2 = poly.fit_transform(X_train)

X_test = scaler.transform(X_test)
X_test_2 = scaler.transform(X_test)

In [66]:
parameters = {'n_neighbors':np.arange(2,20)}

neigh = KNeighborsRegressor()

reg = GridSearchCV(neigh,param_grid=parameters,scoring=scorer, n_jobs=-1,cv=4)
reg.fit(X,y)
ans = reg.predict(X_test)
reg.best_estimator_

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=16, p=2,
                    weights='uniform')

In [67]:
score(ans,y_test)

0.6917030328049012

In [77]:
scaler = StandardScaler()
poly = PolynomialFeatures(3)


X = scaler.fit_transform(df_train[selected_cols])
X_2 = poly.fit_transform(X)
y = df_train['Fees']
X_test = scaler.transform(df_test[selected_cols])
X_test_2 = poly.transform(X_test)

neigh = KNeighborsRegressor()
parameters = {'n_neighbors':np.arange(2,20)}

reg = GridSearchCV(neigh,param_grid=parameters,scoring=scorer, n_jobs=-1,cv=4)
reg.fit(X_2,y)



GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'n_neighbors': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(score), verbose=0)

In [78]:
ans = reg.predict(X_2)
print("Traing data score {}".format(score(ans,y)))

Traing data score 0.810330433916746


## Predicted fees with KNN Regressor

In [80]:
ans = reg.predict(X_test_2)

In [81]:
s = pd.Series(ans)
s.values[:10]

array([217.22222222, 319.44444444, 344.44444444, 311.11111111,
       244.44444444, 236.11111111, 363.88888889, 227.77777778,
       358.33333333, 250.        ])