In [7]:
# Regression
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error as mse

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

In [8]:
import os
print(os.listdir('../input'))

['std-drug']


In [9]:
train = pd.read_csv("../input/std-drug/train.csv")
test = pd.read_csv("../input/std-drug/test.csv")

In [10]:
min_rating = train.effectiveness_rating.min()
max_rating = train.effectiveness_rating.max()

def scale_rating(rating):
    # Sacling from (1,10) to (0,5) and then replacing 0,1,2 in ratings with 0 (poor) and 3,4,5 with 1 (good).
    rating -= min_rating
    rating = rating/(max_rating - 1)
    rating *= 3
    rating = int(round(rating,0))
    return rating

train['new_effect_score'] = train.effectiveness_rating.apply(scale_rating)
test['new_effect_score'] = test.effectiveness_rating.apply(scale_rating)

In [12]:
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
train["review_by_patient"] = train["review_by_patient"].apply(lambda x: clean_text(x))


In [40]:
test["review_by_patient"] = test["review_by_patient"].apply(lambda x: clean_text(x))

In [42]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
train["sentiments"] = train["review_by_patient"].apply(lambda x: sid.polarity_scores(x))
test["sentiments"] = test["review_by_patient"].apply(lambda x: sid.polarity_scores(x))

# reviews_df = pd.concat([reviews_df.drop(['sentiments'], axis=1), reviews_df['sentiments'].apply(pd.Series)], axis=1)


In [43]:
train.sentiments[0]

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [59]:
# train['neg_sentiment'] = train.sentiments.apply(lambda x:round(x['neg']))
train['pos_sentiment'] = train.sentiments.apply(lambda x:round(x['pos']))
# train['neu_sentimant'] = train.sentiments.apply(lambda x:round(x['neu']))

test['neg_sentiment'] = test.sentiments.apply(lambda x:round(x['neg']))
test['pos_sentiment'] = test.sentiments.apply(lambda x:round(x['pos']))
test['neu_sentimant'] = test.sentiments.apply(lambda x:round(x['neu']))

In [60]:
train.head()

Unnamed: 0,patient_id,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score,new_effect_score,sentiments,neg_sentiment,pos_sentiment,neu_sentimant
0,206461,Valsartan,Left Ventricular Dysfunction,side effect take combination bystolic mg fish oil,9,20-May-12,27,8.022969,3,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0,0,1
1,95260,Guanfacine,ADHD,son halfway fourth week intuniv become concern...,8,27-Apr-10,192,7.858458,2,"{'neg': 0.035, 'neu': 0.767, 'pos': 0.198, 'co...",0,0,1
2,92703,Lybrel,Birth Control,use take another oral contraceptive pill cycle...,5,14-Dec-09,17,6.341969,1,"{'neg': 0.044, 'neu': 0.822, 'pos': 0.134, 'co...",0,0,1
3,35696,Buprenorphine / naloxone,Opiate Dependence,suboxone completely turn life around feel heal...,9,27-Nov-16,37,6.590176,3,"{'neg': 0.119, 'neu': 0.582, 'pos': 0.299, 'co...",0,0,1
4,155963,Cialis,Benign Prostatic Hyperplasia,day start work rock hard erection however expe...,2,28-Nov-15,43,6.144782,0,"{'neg': 0.265, 'neu': 0.593, 'pos': 0.143, 'co...",0,0,1


In [61]:
feat = train.columns.to_list()
target='base_score'
delete = ['patient_id','name_of_drug','use_case_for_drug','review_by_patient','drug_approved_by_UIC',target,'sentiments','neg_sentiment','neu_sentimant']
for i in delete:
    feat.remove(i)

In [64]:
train.corr()

Unnamed: 0,patient_id,effectiveness_rating,number_of_times_prescribed,base_score,new_effect_score,neg_sentiment,pos_sentiment,neu_sentimant
patient_id,1.0,0.018803,0.017499,0.010924,0.017318,-0.000675,0.001931,-0.006971
effectiveness_rating,0.018803,1.0,0.242224,0.412301,0.98098,-0.090975,0.087138,-0.002984
number_of_times_prescribed,0.017499,0.242224,1.0,0.134295,0.240236,-0.006126,-0.016375,-0.038959
base_score,0.010924,0.412301,0.134295,1.0,0.422065,-0.047388,0.045599,-0.003828
new_effect_score,0.017318,0.98098,0.240236,0.422065,1.0,-0.088844,0.085392,-0.003546
neg_sentiment,-0.000675,-0.090975,-0.006126,-0.047388,-0.088844,1.0,-0.018395,-0.277303
pos_sentiment,0.001931,0.087138,-0.016375,0.045599,0.085392,-0.018395,1.0,-0.386478
neu_sentimant,-0.006971,-0.002984,-0.038959,-0.003828,-0.003546,-0.277303,-0.386478,1.0


In [65]:
# from sklearn.preprocessing import MinMaxScaler
# scalar = MinMaxScaler()
# X=scalar.fit_transform(train[feat])
# Y = train[target].values.reshape(-1,1)

In [66]:
def baseliner(X,Y):
    print("Baseliner Models(All)")
    eval_dict = {}
    models = [
#         KNeighborsRegressor(), GaussianNB(), 
        lgb.LGBMRegressor(), ExtraTreesRegressor(), xgb.XGBRegressor(objective='reg:squarederror'), 
        cat.CatBoostRegressor(verbose=0), GradientBoostingRegressor(), RandomForestRegressor(), 
        LinearRegression(), DecisionTreeRegressor(), ExtraTreeRegressor(), AdaBoostRegressor(), 
#         BaggingRegressor(), ElasticNet(), Lasso(), Ridge(),SVR(),
        ]  

    print("sklearn Model Name  \t  rmse")
    print("--" * 50)
    x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.3, random_state=42)
    for i in models:
        model = i
        model.fit(x_train,y_train)
        y_pred = model.predict(x_test)
        model_name =str(i).split("(")[0]
        result=np.sqrt(mse(y_test,y_pred))
        print(f"{model_name} \t {result}")
baseliner(train[feat],train[target])

Baseliner Models(All)
sklearn Model Name  	  rmse
----------------------------------------------------------------------------------------------------
LGBMRegressor 	 0.20095215267981803
ExtraTreesRegressor 	 0.19108259884969508
XGBRegressor 	 0.1036262445166938
<catboost.core.CatBoostRegressor object at 0x7f3334ca8c18> 	 0.1450782188846404
GradientBoostingRegressor 	 0.5546314616331539
RandomForestRegressor 	 0.17014818973376164
LinearRegression 	 1.443646954203997
DecisionTreeRegressor 	 0.17394073455557982
ExtraTreeRegressor 	 0.23479032671114147
AdaBoostRegressor 	 1.1280279294272682


## Hyperopt for Better Parameters

In [67]:
from hyperopt import hp
import numpy as np
from sklearn.metrics import mean_squared_error


# XGB parameters
xgb_reg_params = {
    'learning_rate':    hp.choice('learning_rate',    np.arange(0.05, 0.31, 0.05)),
    'max_depth':        hp.choice('max_depth',        np.arange(5, 16, 1, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
    'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
    'subsample':        hp.uniform('subsample', 0.8, 1),
    'n_estimators':     500,
}
xgb_fit_params = {
    'eval_metric': 'rmse',
    'early_stopping_rounds': 10,
    'verbose': False
}
xgb_para = dict()
xgb_para['reg_params'] = xgb_reg_params
xgb_para['fit_params'] = xgb_fit_params
xgb_para['loss_func' ] = lambda y, pred: np.sqrt(mean_squared_error(y, pred))

# LightGBM parameters
lgb_reg_params = {
    'learning_rate':    hp.choice('learning_rate',    np.arange(0.05, 0.31, 0.05)),
    'max_depth':        hp.choice('max_depth',        np.arange(5, 16, 1, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
    'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
    'subsample':        hp.uniform('subsample', 0.8, 1),
    'n_estimators':     100,
}
lgb_fit_params = {
    'eval_metric': 'l2',
    'early_stopping_rounds': 10,
    'verbose': False
}
lgb_para = dict()
lgb_para['reg_params'] = lgb_reg_params
lgb_para['fit_params'] = lgb_fit_params
lgb_para['loss_func' ] = lambda y, pred: np.sqrt(mean_squared_error(y, pred))


# CatBoost parameters
ctb_reg_params = {
    'learning_rate':     hp.choice('learning_rate',     np.arange(0.05, 0.31, 0.05)),
    'max_depth':         hp.choice('max_depth',         np.arange(5, 16, 1, dtype=int)),
    'colsample_bylevel': hp.choice('colsample_bylevel', np.arange(0.3, 0.8, 0.1)),
    'n_estimators':      100,
    'eval_metric':       'RMSE',
}
ctb_fit_params = {
    'early_stopping_rounds': 10,
    'verbose': False
}
ctb_para = dict()
ctb_para['reg_params'] = ctb_reg_params
ctb_para['fit_params'] = ctb_fit_params
ctb_para['loss_func' ] = lambda y, pred: np.sqrt(mean_squared_error(y, pred))


In [68]:
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials


class HPOpt(object):

    def __init__(self, x_train, x_test, y_train, y_test):
        self.x_train = x_train
        self.x_test  = x_test
        self.y_train = y_train
        self.y_test  = y_test

    def process(self, fn_name, space, trials, algo, max_evals):
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}
        return result, trials

    def xgb_reg(self, para):
        reg = xgb.XGBRegressor(**para['reg_params'])
        return self.train_reg(reg, para)

    def lgb_reg(self, para):
        reg = lgb.LGBMRegressor(**para['reg_params'])
        return self.train_reg(reg, para)

    def ctb_reg(self, para):
        reg = ctb.CatBoostRegressor(**para['reg_params'])
        return self.train_reg(reg, para)

    def train_reg(self, reg, para):
        reg.fit(self.x_train, self.y_train,
                eval_set=[(self.x_train, self.y_train), (self.x_test, self.y_test)],
                **para['fit_params'])
        pred = reg.predict(self.x_test)
        loss = para['loss_func'](self.y_test, pred)
        return {'loss': loss, 'status': STATUS_OK}


In [None]:
x_train, x_test, y_train, y_test = train_test_split(train[feat],train[target], test_size=0.3, random_state=42)
obj = HPOpt(x_train, x_test, y_train, y_test)
xgb_opt = obj.process(fn_name='xgb_reg', space=xgb_para, trials=Trials(), algo=tpe.suggest, max_evals=200)
lgb_opt = obj.process(fn_name='lgb_reg', space=lgb_para, trials=Trials(), algo=tpe.suggest, max_evals=200)
ctb_opt = obj.process(fn_name='ctb_reg', space=ctb_para, trials=Trials(), algo=tpe.suggest, max_evals=200)

 94%|█████████▍| 189/200 [09:38<00:41,  3.79s/trial, best loss: 0.06063439703523072]

## Stacking

In [None]:
from sklearn.ensemble import StackingRegressor
# Creating First Layer
base_learners = [
    ('lgbm',lgb.LGBMRegressor(boosting_type='gbdt',
                        num_leaves=300,
                        max_depth=16,
                        learning_rate = 0.5,
                        n_estimators=400,
                         subsample_for_bin=200000,
#                          class_weight=,
#                          min_split_gain=0.01,
#                          min_child_weight=0.1,
                         objective=None,
                         min_child_samples=20,
                         subsample=1.0,
                         subsample_freq=0,
                         colsample_bytree=1.0,
                         reg_alpha=0.0,
                         reg_lambda=0.0,
                         random_state=None,
                         n_jobs=-1,
                         silent=True
                        )),
]
# Initializating Stacking Regressor with the meta lerner
reg = StackingRegressor(estimators=base_learners,final_estimator=xgb.XGBRegressor(objective='reg:squarederror',n_estimators=500,max_depth=8,learning_rate=0.25,colsample_bytree=0.7,min_child_weight=1,subsample=0.8862306230110487))
train_X, test_X, train_y, test_y = train_test_split(train[feat],train[target], test_size=0.3, random_state=42)
reg.fit(train_X,train_y)
y_pred = reg.predict(test_X)
print(np.sqrt(mse(test_y,y_pred)))


In [None]:
lgbm = lgb.LGBMRegressor(boosting_type='gbdt',
                        num_leaves=300,
                        max_depth=16,
                        learning_rate = 0.5,
                        n_estimators=400,
                         subsample_for_bin=200000,
#                          class_weight=,
#                          min_split_gain=0.01,
#                          min_child_weight=0.1,
                         objective=None,
                         min_child_samples=20,
                         subsample=1.0,
                         subsample_freq=0,
                         colsample_bytree=1.0,
                         reg_alpha=0.0,
                         reg_lambda=0.0,
                         random_state=None,
                         n_jobs=-1,
                         silent=True
                        )
lgbm.fit(train[feat],train[target])

In [None]:
train_X, test_X, train_y, test_y = train_test_split(train[feat],train[target], test_size=0.3, random_state=42)
y_pred = lgbm.predict(test_X)
print(np.sqrt(mse(test_y,y_pred)))

In [50]:
train_X, test_X, train_y, test_y = train_test_split(train[feat],train[target], test_size=0.3, random_state=42)

my_model = xgb.XGBRegressor(objective='reg:squarederror',n_estimators=500,max_depth=8,learning_rate=0.25,colsample_bytree=0.7,min_child_weight=1,subsample=0.8862306230110487)
my_model.fit(train[feat],train[target],early_stopping_rounds=10, eval_set=[(test_X, test_y)], verbose=1)
y_pred = my_model.predict(test_X)
print(np.sqrt(mse(test_y,y_pred)))

[0]	validation_0-rmse:4.81132
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:3.68830
[2]	validation_0-rmse:2.84086
[3]	validation_0-rmse:2.21006
[4]	validation_0-rmse:1.75853
[5]	validation_0-rmse:1.42552
[6]	validation_0-rmse:1.19061
[7]	validation_0-rmse:1.03607
[8]	validation_0-rmse:0.86784
[9]	validation_0-rmse:0.77743
[10]	validation_0-rmse:0.71657
[11]	validation_0-rmse:0.67998
[12]	validation_0-rmse:0.63143
[13]	validation_0-rmse:0.57519
[14]	validation_0-rmse:0.53912
[15]	validation_0-rmse:0.53299
[16]	validation_0-rmse:0.48636
[17]	validation_0-rmse:0.48416
[18]	validation_0-rmse:0.45582
[19]	validation_0-rmse:0.45482
[20]	validation_0-rmse:0.40984
[21]	validation_0-rmse:0.40297
[22]	validation_0-rmse:0.38836
[23]	validation_0-rmse:0.38827
[24]	validation_0-rmse:0.38778
[25]	validation_0-rmse:0.37122
[26]	validation_0-rmse:0.36517
[27]	validation_0-rmse:0.35717
[28]	validation_0-rmse:0.35687
[29]	validation_0-rmse:0.35685
[30]	validation

[258]	validation_0-rmse:0.03048
[259]	validation_0-rmse:0.03028
[260]	validation_0-rmse:0.03025
[261]	validation_0-rmse:0.03025
[262]	validation_0-rmse:0.03015
[263]	validation_0-rmse:0.03015
[264]	validation_0-rmse:0.03015
[265]	validation_0-rmse:0.03009
[266]	validation_0-rmse:0.03004
[267]	validation_0-rmse:0.03004
[268]	validation_0-rmse:0.03004
[269]	validation_0-rmse:0.02861
[270]	validation_0-rmse:0.02859
[271]	validation_0-rmse:0.02857
[272]	validation_0-rmse:0.02857
[273]	validation_0-rmse:0.02857
[274]	validation_0-rmse:0.02857
[275]	validation_0-rmse:0.02854
[276]	validation_0-rmse:0.02727
[277]	validation_0-rmse:0.02725
[278]	validation_0-rmse:0.02726
[279]	validation_0-rmse:0.02726
[280]	validation_0-rmse:0.02726
[281]	validation_0-rmse:0.02726
[282]	validation_0-rmse:0.02724
[283]	validation_0-rmse:0.02724
[284]	validation_0-rmse:0.02723
[285]	validation_0-rmse:0.02646
[286]	validation_0-rmse:0.02646
[287]	validation_0-rmse:0.02646
[288]	validation_0-rmse:0.02645
[289]	va

In [51]:
y_pred = my_model.predict(test[feat])
ans=[]
for i in range(len(y_pred)):
    ans.append(round(y_pred[i],2))
sub = test['patient_id']
sub = pd.DataFrame(sub)
sub['base_score']=ans

In [52]:
#Creating Link in Kaggle
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

def create_download_link(df, title = "Download CSV file", filename = "std_20.csv"):  
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

create_download_link(sub)