In [None]:
#importing the required libraries
import pandas as pd
import numpy as np


**Reading all the datasets**

In [None]:
# Load the training and test splits, then report each frame's (rows, columns) shape
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
size_train, size_test = df_train.shape, df_test.shape
print("Size of training dataset : ",size_train)
print("Size of test dataset : ",size_test)

Size of training dataset :  (32165, 8)
Size of test dataset :  (10760, 7)


**Arranging the datasets in the same order**

In [None]:
# Feature frame: every column of df_train except the last one.
# The first column stays in front; the remaining columns are sorted by name so
# that the train and test frames end up with the same column order.
x_train = df_train.iloc[:, 0:size_train[1]-1]
leading_col = x_train.iloc[:, :1]
dummy = x_train.iloc[:, 1:]
dummy = dummy.reindex(sorted(dummy.columns), axis=1)
x_train = pd.concat([leading_col, dummy], axis=1)
# x_train.head(20)

In [None]:
# Target vector: the last column of the training frame (the one excluded from x_train)
y_train=df_train.iloc[:,-1]
# y_train.head(20)

In [None]:
# Work on a copy of the test frame; keep its first column in front and sort the
# remaining columns by name, mirroring the layout used for the training features.
x_test = df_test.copy()
leading_col = x_test.iloc[:, :1]
dummy = x_test.iloc[:, 1:]
dummy = dummy.reindex(sorted(dummy.columns), axis=1)
x_test = pd.concat([leading_col, dummy], axis=1)
# x_test.head(20)

In [None]:
# Both feature frames must have the same number of columns
assert x_train.shape[1] == x_test.shape[1]

In [None]:
# Element-wise comparison of the two column indexes, collapsed into one flag
columns_match = x_test.columns == x_train.columns
boo = all(columns_match)
print("Let's confirm that the columns in test and train are arranged in same order: ",boo)

Let's confirm that the columns in test and train are arranged in same order:  True



**Checking the data type of each column**


In [None]:
# Per-column dtypes of the training features
datatype=x_train.dtypes
print(datatype)

patient_id                     int64
drug_approved_by_UIC          object
effectiveness_rating           int64
name_of_drug                  object
number_of_times_prescribed     int64
review_by_patient             object
use_case_for_drug             object
dtype: object


In [None]:
# Per-column dtypes of the test features (reuses the name `datatype`)
datatype=x_test.dtypes
print(datatype)

**Checking if there is any missing data**

In [None]:
# Number of missing values in each training feature column
x_train.isna().sum()

patient_id                    0
drug_approved_by_UIC          0
effectiveness_rating          0
name_of_drug                  0
number_of_times_prescribed    0
review_by_patient             0
use_case_for_drug             0
dtype: int64

In [None]:
# Number of missing values in each test feature column
x_test.isna().sum()

patient_id                    0
drug_approved_by_UIC          0
effectiveness_rating          0
name_of_drug                  0
number_of_times_prescribed    0
review_by_patient             0
use_case_for_drug             0
dtype: int64

In [None]:
# Number of missing target values
y_train.isna().sum()

0

- We can see that there are no missing values in the dataframes, but some columns may still contain anomalous values.

**Dealing with 'drug_approved_by_UIC' column** 

In [None]:
# Parse the approval-date strings into datetimes. No explicit format is passed,
# so pandas infers it.
# NOTE(review): the values shown elsewhere in this notebook look like '20-May-12';
# consider passing an explicit format once the format is confirmed for every row.
date_train=pd.to_datetime(x_train['drug_approved_by_UIC'])
# date_train.head(20)

In [None]:
'''We may want to drop the date column but it may have some impact on outcomes.
Let's make a new column which contains the succeding number of days for the drug 
after the approval. If we had the date on which patient has written the review
(date of prescription) then it would have added more value to find the succeding 
number of days for the drug after the approval'''
# NOTE(review): the feature is measured against the wall-clock date at run time,
# so its raw values change every day the notebook is executed.
elapsed = pd.Timestamp('today') - date_train
days_from_today = elapsed.dt.days.rename('days_of_approval_from_today')
x_train = pd.concat([x_train, days_from_today], axis=1)
x_train['year_of_approval'] = pd.DatetimeIndex(date_train).year
# The raw date string is no longer needed once the numeric features exist
x_train = x_train.drop(columns=['drug_approved_by_UIC'])
# x_train.head(20)

In [None]:
# Parse the test-set approval-date strings into datetimes (format inferred by pandas)
date_test=pd.to_datetime(x_test['drug_approved_by_UIC'])
# date_test.head(20)

In [None]:
# Same date-derived features as for the training frame: days elapsed since
# approval (relative to the run date) and the approval year.
elapsed = pd.Timestamp('today') - date_test
days_from_today2 = elapsed.dt.days.rename('days_of_approval_from_today')
x_test = pd.concat([x_test, days_from_today2], axis=1)
x_test['year_of_approval'] = pd.DatetimeIndex(date_test).year
x_test = x_test.drop(columns=['drug_approved_by_UIC'])
# x_test.head(20)

In [None]:
# Pairwise correlation between the candidate features.
# 'name_of_drug' and 'use_case_for_drug' are object columns, for which correlation
# is undefined; they are excluded explicitly because recent pandas versions raise
# on non-numeric data instead of silently dropping it.
dummy_df=x_train[['name_of_drug','number_of_times_prescribed','use_case_for_drug','days_of_approval_from_today','year_of_approval']]
dummy_df.select_dtypes(include='number').corr()

Unnamed: 0,number_of_times_prescribed,days_of_approval_from_today,year_of_approval
number_of_times_prescribed,1.0,0.276403,-0.271538
days_of_approval_from_today,0.276403,1.0,-0.994403
year_of_approval,-0.271538,-0.994403,1.0


In [None]:
# 'year_of_approval' is almost perfectly (negatively) correlated with
# 'days_of_approval_from_today' in the table above, so only one of them is kept.
x_train = x_train.drop(columns='year_of_approval')
x_train.head()

Unnamed: 0,patient_id,effectiveness_rating,name_of_drug,number_of_times_prescribed,review_by_patient,use_case_for_drug,days_of_approval_from_today
0,206461,9,Valsartan,27,"""It has no side effect, I take it in combinati...",Left Ventricular Dysfunction,3805
1,95260,8,Guanfacine,192,"""My son is halfway through his fourth week of ...",ADHD,4559
2,92703,5,Lybrel,17,"""I used to take another oral contraceptive, wh...",Birth Control,4693
3,35696,9,Buprenorphine / naloxone,37,"""Suboxone has completely turned my life around...",Opiate Dependence,2153
4,155963,2,Cialis,43,"""2nd day on 5mg started to work with rock hard...",Benign Prostatic Hyperplasia,2518


In [None]:
# Drop the same redundant column from the test features
x_test = x_test.drop(columns='year_of_approval')
x_test.head()

Unnamed: 0,patient_id,effectiveness_rating,name_of_drug,number_of_times_prescribed,review_by_patient,use_case_for_drug,days_of_approval_from_today
0,163740,10,Mirtazapine,22,"""I&#039;ve tried a few antidepressants over th...",Depression,3887
1,39293,9,Contrave,35,"""Contrave combines drugs that were used for al...",Weight Loss,2055
2,208087,4,Zyclara,13,"""4 days in on first 2 weeks. Using on arms an...",Keratosis,3031
3,23295,7,Methadone,21,"""Ive been on Methadone for over ten years and ...",Opiate Withdrawal,2193
4,97013,2,Ambien,44,"""Ditto on rebound sleepless when discontinued....",Insomnia,2837


**Let's deal with 'name_of_drug' column**

In [None]:
# Number of distinct values per training column (cardinality check)
x_train.nunique()

patient_id                     32165
effectiveness_rating              10
name_of_drug                    2220
number_of_times_prescribed       303
review_by_patient              30121
use_case_for_drug                636
days_of_approval_from_today     3537
year_of_approval                  10
dtype: int64

In [None]:
# Number of distinct values per test column (cardinality check)
x_test.nunique()

patient_id                    10760
effectiveness_rating             10
name_of_drug                   1478
number_of_times_prescribed      233
review_by_patient             10530
use_case_for_drug               461
dtype: int64

In [None]:
# Distinct drug names in each split, wrapped in Series for the membership test below
x_train_nu = pd.Series(x_train['name_of_drug'].unique())
x_test_nu = pd.Series(x_test['name_of_drug'].unique())
print('number of unique values in x_train are {0} and x_test are {1} '.format(x_train_nu.shape[0],x_test_nu.shape[0]))

number of unique values in x_train are 2220 and x_test are 1478 


In [None]:
# Drug names that occur in the test split but never in the training split
unseen_mask = ~x_test_nu.isin(x_train_nu)
drugs_intest_notintrain = x_test_nu[unseen_mask].reset_index(drop=True)
print("count of drugs which are in test set but not in train set: ",drugs_intest_notintrain.count())
print("drugs which are in test set but not in train set", drugs_intest_notintrain)

count of drugs which are in test set but not in train set:  161
drugs which are in test set but not in train set 0               Vitamin D2
1                  Uptravi
2             Trimethoprim
3                 Bronkaid
4                     Kava
              ...         
156                Zioptan
157    Iron polysaccharide
158               Calamine
159                  Lidex
160               Glycerin
Length: 161, dtype: object


In [None]:
def calc_smooth_mean(df1,df2, cat_name, target, weight):
    """Smoothed target (mean) encoding of a categorical column.

    For every category in ``df1[cat_name]`` the encoded value is
    ``(count * category_mean + weight * global_mean) / (count + weight)``,
    where all statistics are computed from ``df1`` only.

    Parameters
    ----------
    df1 : DataFrame holding both ``cat_name`` and ``target``; used to fit the encoding.
    df2 : DataFrame holding ``cat_name`` to encode with the statistics of ``df1``,
        or ``None`` to encode ``df1`` only.
    cat_name : name of the categorical column.
    target : name of the numeric target column in ``df1``.
    weight : smoothing strength; larger values pull categories towards the global mean.

    Returns
    -------
    The encoded Series for ``df1`` when ``df2`` is ``None``; otherwise a tuple
    ``(encoded_df1, encoded_df2)``.

    Categories without a fitted value (for example values of ``df2`` that never
    occur in ``df1``) are encoded with the global mean instead of NaN, so the
    result can be fed to estimators that reject missing values.
    """
    # Global mean of the target: smoothing prior and fallback for unseen categories
    global_mean = df1[target].mean()

    # Per-category sample count and target mean
    agg = df1.groupby(cat_name)[target].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Shrink each category mean towards the global mean
    smooth = (counts * means + weight * global_mean) / (counts + weight)

    encoded_df1 = df1[cat_name].map(smooth).fillna(global_mean)
    if df2 is None:
        return encoded_df1
    encoded_df2 = df2[cat_name].map(smooth).fillna(global_mean)
    return encoded_df1, encoded_df2

In [None]:
# Target-encode the drug name; statistics are fitted on df_train and applied to both frames
WEIGHT = 0.5
encoded_train, encoded_test = calc_smooth_mean(df_train, df_test, 'name_of_drug', 'base_score', weight=WEIGHT)
df_train['cat_0'] = encoded_train
df_test['cat_0'] = encoded_test


In [None]:
# Preview the training frame with the new 'cat_0' encoding column
df_train.head()

Unnamed: 0,patient_id,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score,cat_0
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,8.022969,5.786954
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,7.858458,6.659846
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,6.341969,6.89666
3,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,6.590176,7.085175
4,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,28-Nov-15,43,6.144782,7.177641


In [None]:
# Preview the test frame with the new 'cat_0' encoding column
df_test.head()

Unnamed: 0,patient_id,name_of_drug,review_by_patient,drug_approved_by_UIC,number_of_times_prescribed,use_case_for_drug,effectiveness_rating,cat_0
0,163740,Mirtazapine,"""I&#039;ve tried a few antidepressants over th...",28-Feb-12,22,Depression,10,6.673801
1,39293,Contrave,"""Contrave combines drugs that were used for al...",5-Mar-17,35,Weight Loss,9,6.555581
2,208087,Zyclara,"""4 days in on first 2 weeks. Using on arms an...",3-Jul-14,13,Keratosis,4,7.09168
3,23295,Methadone,"""Ive been on Methadone for over ten years and ...",18-Oct-16,21,Opiate Withdrawal,7,7.056544
4,97013,Ambien,"""Ditto on rebound sleepless when discontinued....",13-Jan-15,44,Insomnia,2,7.007461


**Let's deal with the "review_by_patient" column**
- First, clean the review text
- Second, train a word2vec model on the cleaned training reviews
- Then use the word2vec model to build a numerical vector for each review by averaging its word vectors (average word2vec)
- Finally, apply the same trained model to the reviews of the test set


In [None]:
import re
from nltk.corpus import stopwords


def change_lower(text):
    """Return ``text`` converted to lower case."""
    return text.lower()

def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    HTML character references are decoded first. The raw reviews contain escaped
    apostrophes such as ``I&#039;ve``; without decoding, none of the apostrophe
    rules below can match, and the token is later discarded as a whole by the
    special-character filter, which loses negations like "don't".

    The rules are applied in order: the two irregular forms first, then the
    general suffix rules.
    """
    import html  # standard library; imported locally so the cell is self-contained

    phrase = html.unescape(phrase)

    rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

# Patterns are compiled once at definition time instead of on every call.
# Raw strings are used so that the backslash escapes reach the regex engine
# unchanged and do not trigger Python's invalid-escape-sequence warning.
_SPECIAL_OR_DIGIT = re.compile(r'[@_!#$%^&*()<>?/\|}{~:,;0123456789]')
_NOT_ALLOWED = re.compile(r'[^ \nA-Za-z0-9À-ÖØ-öø-ÿ/]+')
_LEFTOVER_SYMBOLS = re.compile(r'[\\/×\^\]\[÷]')


def clean_data(text):
    """Strip noisy tokens and characters from a review.

    Steps, in order:
      1. split on whitespace and drop every token that is a single character or
         contains a digit or one of the listed special characters;
      2. remove every remaining character that is not a space, newline, ASCII
         letter/digit, accented Latin letter or '/';
      3. remove slashes, brackets and a few arithmetic symbols;
      4. strip leading/trailing whitespace.

    Returns the cleaned string (possibly empty).
    """
    kept = [
        token for token in text.split()
        if len(token) > 1 and _SPECIAL_OR_DIGIT.search(token) is None
    ]
    text = " ".join(kept)
    text = _NOT_ALLOWED.sub('', text)
    text = _LEFTOVER_SYMBOLS.sub('', text)
    return text.strip()


# Stop-word list, built once instead of on every call. It is kept under a private
# name so it does not shadow the `stopwords` module imported from nltk.corpus.
# Negations such as 'no' / 'not' are not part of this list, so they are kept.
_STOPWORDS = frozenset([
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
    "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
    "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
])


def remove_stopwords(sentence):
    """Lower-case ``sentence`` and drop every whitespace-separated stop word.

    Returns the remaining tokens joined by single spaces (possibly an empty string).
    """
    lowered = (token.lower() for token in sentence.split())
    return ' '.join(token for token in lowered if token not in _STOPWORDS)


In [None]:
#cleaning x_train
# Normalise the training reviews: force str dtype, then run the cleaning steps in order
x_train["review_by_patient"] = x_train["review_by_patient"].astype(str)
for cleaning_step in (change_lower, decontracted, clean_data, remove_stopwords):
    x_train["review_by_patient"] = x_train["review_by_patient"].apply(cleaning_step)

In [None]:
#cleaning x_test
# Apply the identical cleaning pipeline to the test reviews
x_test["review_by_patient"] = x_test["review_by_patient"].astype(str)
for cleaning_step in (change_lower, decontracted, clean_data, remove_stopwords):
    x_test["review_by_patient"] = x_test["review_by_patient"].apply(cleaning_step)

In [None]:
from gensim.models import Word2Vec
import multiprocessing
def get_w2vdf(df):
    """Tokenise the 'review_by_patient' column of ``df``.

    Each review string is split on single spaces; the result is a list with one
    token list per row, in row order.
    """
    return [review.split(" ") for review in df["review_by_patient"].tolist()]

def train_w2v(w2v_df):
    """Train a skip-gram Word2Vec model on tokenised reviews.

    Parameters
    ----------
    w2v_df : list of token lists, one per review (as produced by ``get_w2vdf``).

    Returns
    -------
    The trained ``Word2Vec`` model (300-dimensional vectors, window 4,
    words seen fewer than 4 times ignored, 100 training epochs).
    """
    cores = multiprocessing.cpu_count()
    # Leave one core free, but never ask for zero workers on a single-core machine
    n_workers = max(1, cores - 1)
    # NOTE(review): `size=` looks like the gensim 3.x keyword - confirm the pinned
    # gensim version before upgrading.
    w2v_model = Word2Vec(min_count=4,
                         window=4,
                         size=300,
                         alpha=0.03,
                         min_alpha=0.0007,
                         sg = 1,
                         workers=n_workers)

    # The vocabulary has to be built before training can start
    w2v_model.build_vocab(w2v_df, progress_per=10000)
    w2v_model.train(w2v_df, total_examples=w2v_model.corpus_count, epochs=100, report_delay=1)
    return w2v_model

# Tokenise the cleaned training reviews and fit the embedding model on them only
# (test reviews are not used for fitting here).
w2v_xtrain = get_w2vdf(x_train)
w2v_model = train_w2v(w2v_xtrain)

In [None]:
# Keep the trained model's vocabulary under a notebook-level name and persist the
# whole model to disk so it can be reloaded without retraining.
# NOTE(review): `wv.vocab` looks like the gensim 3.x API - confirm the installed version.
w2v_word_dict=w2v_model.wv.vocab
filename='vocab.wv'
w2v_model.save(filename)
# import gensim
# reload_model=gensim.models.Word2Vec.load(filename)
# w2v_model=reload_model
# w2v_word_dict=w2v_model.wv.vocab
# filename='vocab.wv'

In [None]:
# average Word2Vec
# compute average word2vec for each review.
# from tqdm import tqdm
def avgw2v(df, model=None):
    """Compute the average word2vec vector of every review.

    Parameters
    ----------
    df : iterable of reviews. Each review is either a string (split on
        whitespace into words) or an already tokenised sequence of words.
    model : trained model exposing ``.wv``; defaults to the notebook-level
        ``w2v_model``.

    Returns
    -------
    list of 1-D numpy arrays, one per review, each of length
    ``model.wv.vector_size``. A review without any in-vocabulary word gets the
    zero vector.

    Strings are tokenised before the lookup: iterating over a string yields
    single characters, which would average letter embeddings instead of word
    embeddings.
    """
    if model is None:
        model = w2v_model
    word_vectors = model.wv
    dim = word_vectors.vector_size

    sent_vectors = []  # the avg-w2v for each sentence/review is stored in this list
    for sent in df:
        words = sent.split() if isinstance(sent, str) else sent
        sent_vec = np.zeros(dim)
        cnt_words = 0  # number of words of this review that have a vector
        for word in words:
            if word in word_vectors:
                sent_vec += word_vectors[word]
                cnt_words += 1
        if cnt_words != 0:
            sent_vec /= cnt_words
        sent_vectors.append(sent_vec)
    return sent_vectors
# print(len(sent_vectors))
# print(len(sent_vectors[0]))

In [None]:
# Build one averaged embedding per training review and append the components as
# extra columns (named 0..N-1 by the DataFrame constructor).
df=x_train["review_by_patient"]
sent_vectors=avgw2v(df)
# n_vectors gets a default 0..n-1 index; the axis=1 concat below aligns on index
# labels, so this relies on x_train carrying the same default index.
n_vectors=pd.DataFrame(sent_vectors)
# n_vectors.shape
x_train=pd.concat([x_train,n_vectors],axis=1)
# x_train.drop(columns="review_by_patient",inplace=True)
x_train.head()

Unnamed: 0,patient_id,effectiveness_rating,name_of_drug,number_of_times_prescribed,review_by_patient,use_case_for_drug,days_of_approval_from_today,0,1,2,...,290,291,292,293,294,295,296,297,298,299
0,206461,9,Valsartan,27,no side take combination bystolic mg fish oil,Left Ventricular Dysfunction,3805,-0.110255,-0.02782,-0.033642,...,0.236169,0.06315,-0.116938,-0.080355,-0.074519,-0.138958,-0.090235,-0.095254,-0.153199,0.17145
1,95260,8,Guanfacine,192,son halfway fourth week intuniv became concern...,ADHD,4559,0.107336,0.096128,-0.033932,...,0.280583,-0.037494,0.008661,0.063055,0.056798,-0.077682,-0.103767,-0.082094,-0.146444,0.132826
2,92703,5,Lybrel,17,used take another oral pill happy light max no...,Birth Control,4693,0.160619,0.07651,-0.09595,...,0.26155,-0.043766,-0.017332,0.063745,0.097543,-0.129441,-0.068116,-0.105746,-0.10593,0.060586
3,35696,9,Buprenorphine / naloxone,37,suboxone completely turned life around feel ex...,Opiate Dependence,2153,0.122876,-0.023978,-0.135219,...,0.338241,0.000775,-0.113678,-0.009436,0.095907,-0.13424,-0.080595,-0.029544,-0.138908,0.06429
4,155963,2,Cialis,43,day started work rock hard erections however e...,Benign Prostatic Hyperplasia,2518,0.203464,0.196559,-0.087479,...,0.352854,-0.115493,0.047744,0.046628,0.161948,-0.056169,-0.116944,-0.147703,-0.136951,-0.014105


In [None]:
# Same embedding step for the test reviews, reusing the model fitted on the training reviews
df=x_test["review_by_patient"]
sent_vectors=avgw2v(df)
# Default 0..n-1 index; the axis=1 concat below aligns on index labels
n_vectors=pd.DataFrame(sent_vectors)
# n_vectors.shape
x_test=pd.concat([x_test,n_vectors],axis=1)
# x_test.drop(columns="review_by_patient",inplace=True)
x_test.head()

Unnamed: 0,patient_id,effectiveness_rating,name_of_drug,number_of_times_prescribed,review_by_patient,use_case_for_drug,days_of_approval_from_today,0,1,2,...,290,291,292,293,294,295,296,297,298,299
0,163740,10,Mirtazapine,22,tried antidepressants years none helped insomn...,Depression,3887,0.152556,0.068108,-0.10561,...,0.287971,-0.043609,-0.017173,0.067901,0.090842,-0.125863,-0.091459,-0.057227,-0.145126,0.054164
1,39293,9,Contrave,35,contrave combines drugs used opioid cessation ...,Weight Loss,2055,0.12615,0.066826,-0.068454,...,0.258906,-0.029203,-0.022157,0.0583,0.063792,-0.104099,-0.103351,-0.059566,-0.134272,0.12874
2,208087,4,Zyclara,13,days first weeks using arms face put vaseline ...,Keratosis,3031,0.133781,0.074749,-0.112215,...,0.331295,-0.037726,-0.039954,0.051468,0.08116,-0.111452,-0.102607,-0.038269,-0.167126,0.036489
3,23295,7,Methadone,21,ive methadone ten years trying get drug ive de...,Opiate Withdrawal,2193,0.197203,0.079182,-0.018894,...,0.300397,-0.051476,0.038491,0.145819,0.108014,-0.04869,-0.086066,-0.061796,-0.128536,0.190173
4,97013,2,Ambien,44,ditto rebound sleepless discontinued done stra...,Insomnia,2837,0.187322,0.052254,-0.084732,...,0.328488,-0.06077,-0.012716,0.09533,0.116993,-0.071195,-0.115258,-0.083591,-0.119004,0.144871


**Let's deal with "use_case_for_drug" column**

In [None]:
# Distinct use cases in each split, wrapped in Series for the membership test below
x_train_nu = pd.Series(x_train['use_case_for_drug'].unique())
x_test_nu = pd.Series(x_test['use_case_for_drug'].unique())
print('number of unique values in x_train are {0} and x_test are {1} '.format(x_train_nu.shape[0],x_test_nu.shape[0]))

number of unique values in x_train are 636 and x_test are 461 


In [None]:
# Use cases that occur in the test split but never in the training split
unseen_mask = ~x_test_nu.isin(x_train_nu)
usecase_intest_notintrain = x_test_nu[unseen_mask].reset_index(drop=True)
print("count of usecases which are in test set but not in train set: ",usecase_intest_notintrain.count())
# print("usecases which are in test set but not in train set", usecase_intest_notintrain)

In [None]:
# Spot check: training rows whose use case is 'Schnitzler Syndrome'
x_train[x_train['use_case_for_drug']=='Schnitzler Syndrome']

Unnamed: 0,patient_id,effectiveness_rating,name_of_drug,number_of_times_prescribed,review_by_patient,use_case_for_drug,days_of_approval_from_today,year_of_approval


In [None]:
def clear_usecase(text):
  """Normalise a 'use_case_for_drug' value.

  Scraping residue of the form '</span> users found this comment helpful.' is
  replaced by ' NA', all digits are removed and the result is stripped. Anything
  shorter than two characters after that is returned as "NA".
  """
  without_residue = re.sub("</span> users found this comment helpful.", " NA", text)
  cleaned = re.sub(r'[0-9]', "", without_residue).strip()
  return cleaned if len(cleaned) >= 2 else "NA"

In [None]:
# Clean the use-case column of both raw frames before it is target-encoded
for frame in (df_train, df_test):
    frame['use_case_for_drug'] = frame['use_case_for_drug'].apply(clear_usecase)

In [None]:
# dumm=df_train.groupby(["name_of_drug"]).agg(Mode=('use_case_for_drug',pd.Series.mode))
# dumm.shape

In [None]:
# def replace_text(lst):
#     print(type(lst))
#     # print(lst[0])
#     # print(lst[1])
#     lst=list(lst)
#     if lst[1]=="NA":
#       #  print("True")
#        key=lst[0]
#        value=dumm.loc[key,"Mode"]
#        d_list=[key,str(value)]
#        return pd.Series(d_list)
#     else:
#       return pd.Series(lst)

In [None]:
# df_train[['name_of_drug','use_case_for_drug']]=df_train[['name_of_drug','use_case_for_drug']].apply(replace_text)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [None]:
def calc_smooth_mean(df1,df2, cat_name, target, weight):
    """Smoothed target (mean) encoding of a categorical column.

    For every category in ``df1[cat_name]`` the encoded value is
    ``(count * category_mean + weight * global_mean) / (count + weight)``,
    where all statistics are computed from ``df1`` only.

    Parameters
    ----------
    df1 : DataFrame holding both ``cat_name`` and ``target``; used to fit the encoding.
    df2 : DataFrame holding ``cat_name`` to encode with the statistics of ``df1``,
        or ``None`` to encode ``df1`` only.
    cat_name : name of the categorical column.
    target : name of the numeric target column in ``df1``.
    weight : smoothing strength; larger values pull categories towards the global mean.

    Returns
    -------
    The encoded Series for ``df1`` when ``df2`` is ``None``; otherwise a tuple
    ``(encoded_df1, encoded_df2)``.

    Categories without a fitted value (for example values of ``df2`` that never
    occur in ``df1``) are encoded with the global mean instead of NaN, so the
    result can be fed to estimators that reject missing values.
    """
    # Global mean of the target: smoothing prior and fallback for unseen categories
    global_mean = df1[target].mean()

    # Per-category sample count and target mean
    agg = df1.groupby(cat_name)[target].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Shrink each category mean towards the global mean
    smooth = (counts * means + weight * global_mean) / (counts + weight)

    encoded_df1 = df1[cat_name].map(smooth).fillna(global_mean)
    if df2 is None:
        return encoded_df1
    encoded_df2 = df2[cat_name].map(smooth).fillna(global_mean)
    return encoded_df1, encoded_df2

In [None]:
# Target-encode the cleaned use case; statistics are fitted on df_train and applied to both frames
WEIGHT = 0.5
encoded_train, encoded_test = calc_smooth_mean(df_train, df_test, 'use_case_for_drug', 'base_score', weight=WEIGHT)
df_train['cat_1'] = encoded_train
df_test['cat_1'] = encoded_test


**Now let's drop the columns that are no longer needed after feature engineering and attach the newly encoded columns**

In [None]:
# Identifier and raw text/categorical columns are not fed to the model
raw_columns = ["patient_id","name_of_drug","review_by_patient","use_case_for_drug"]
x_train = x_train.drop(columns=raw_columns)
x_test = x_test.drop(columns=raw_columns)
x_train.head()

Unnamed: 0,effectiveness_rating,number_of_times_prescribed,days_of_approval_from_today,0,1,2,3,4,5,6,...,290,291,292,293,294,295,296,297,298,299
0,9,27,3805,-0.110255,-0.02782,-0.033642,0.112888,-0.025334,0.131241,0.018702,...,0.236169,0.06315,-0.116938,-0.080355,-0.074519,-0.138958,-0.090235,-0.095254,-0.153199,0.17145
1,8,192,4559,0.107336,0.096128,-0.033932,0.075664,0.033244,0.104873,0.182147,...,0.280583,-0.037494,0.008661,0.063055,0.056798,-0.077682,-0.103767,-0.082094,-0.146444,0.132826
2,5,17,4693,0.160619,0.07651,-0.09595,0.098858,0.009656,0.046,0.214038,...,0.26155,-0.043766,-0.017332,0.063745,0.097543,-0.129441,-0.068116,-0.105746,-0.10593,0.060586
3,9,37,2153,0.122876,-0.023978,-0.135219,0.118685,-0.029919,0.096626,0.19269,...,0.338241,0.000775,-0.113678,-0.009436,0.095907,-0.13424,-0.080595,-0.029544,-0.138908,0.06429
4,2,43,2518,0.203464,0.196559,-0.087479,0.078965,0.079035,0.052499,0.288632,...,0.352854,-0.115493,0.047744,0.046628,0.161948,-0.056169,-0.116944,-0.147703,-0.136951,-0.014105


In [None]:
# Append the two target-encoded columns computed on the raw frames.
# The axis=1 concat aligns rows on index labels.
encoded_columns = ['cat_0','cat_1']
dummy_train = df_train[encoded_columns]
dummy_test = df_test[encoded_columns]
x_train = pd.concat([x_train, dummy_train], axis=1)
x_test = pd.concat([x_test, dummy_test], axis=1)


In [None]:
# Preview the final training feature matrix
x_train.head()

Unnamed: 0,effectiveness_rating,number_of_times_prescribed,days_of_approval_from_today,0,1,2,3,4,5,6,...,292,293,294,295,296,297,298,299,cat_0,cat_1
0,9,27,3805,-0.110255,-0.02782,-0.033642,0.112888,-0.025334,0.131241,0.018702,...,-0.116938,-0.080355,-0.074519,-0.138958,-0.090235,-0.095254,-0.153199,0.17145,5.786954,7.185018
1,8,192,4559,0.107336,0.096128,-0.033932,0.075664,0.033244,0.104873,0.182147,...,0.008661,0.063055,0.056798,-0.077682,-0.103767,-0.082094,-0.146444,0.132826,6.659846,6.736605
2,5,17,4693,0.160619,0.07651,-0.09595,0.098858,0.009656,0.046,0.214038,...,-0.017332,0.063745,0.097543,-0.129441,-0.068116,-0.105746,-0.10593,0.060586,6.89666,6.509858
3,9,37,2153,0.122876,-0.023978,-0.135219,0.118685,-0.029919,0.096626,0.19269,...,-0.113678,-0.009436,0.095907,-0.13424,-0.080595,-0.029544,-0.138908,0.06429,7.085175,7.181485
4,2,43,2518,0.203464,0.196559,-0.087479,0.078965,0.079035,0.052499,0.288632,...,0.047744,0.046628,0.161948,-0.056169,-0.116944,-0.147703,-0.136951,-0.014105,7.177641,6.682677


**Let's perform the standard scaling of the feature for better learning of model**



In [None]:
from sklearn.preprocessing import StandardScaler

SC= StandardScaler()
# standardization
# The embedding columns are named with integers while the other columns have
# string names; recent scikit-learn versions reject frames with mixed-type
# column names, so every name is converted to str first.
# The scaler is fitted on the training features only and then reused for the test
# features. After this cell both names hold numpy arrays, not DataFrames.
x_train = SC.fit_transform(x_train.rename(columns=str))
x_test = SC.transform(x_test.rename(columns=str))



**Let's apply the svm regression model for training and predicting the base_score**

In [None]:
# Tuning of parameters for regression by cross-validation
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error,make_scorer

K = 3               # Number of cross validations

Parameters for tuning
parameters = [{'kernel': ['rbf'], 'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5, 0.6, 0.9],'C': [1, 10, 100, 1000, 10000]}]
print("Tuning hyper-parameters")
scorer=make_scorer(mean_squared_error,greater_is_better=False)
model = GridSearchCV(SVR(epsilon = 0.01), parameters, cv = K,refit = True, scoring=scorer)
model.fit(x_train, y_train)
# model = SVR(C=50,epsilon = 0.01,gamma=0.01)
# model.fit(x_train, y_train)

In [None]:
# Predict the target for the test rows and wrap it in a one-column frame
base_score = pd.DataFrame({"base_score": model.predict(x_test)})

In [None]:
# Put the ids and the predictions side by side, one row per test patient.
# Without axis=1 pandas stacks the two objects vertically, producing twice as
# many rows with NaN padding instead of (patient_id, base_score) pairs.
final=pd.concat([df_test["patient_id"],base_score],axis=1)

In [None]:
# Write the predictions to disk. With the default arguments the DataFrame index
# is written as an extra first column.
# NOTE(review): pass index=False if the expected submission format has no index column.
final.to_csv("predictionfile.csv")