In [66]:
#Best Library for Regression

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os

%matplotlib inline

# Plot defaults: large figures first, then the FiveThirtyEight theme (order kept as-is).
plt.rc('figure', figsize=[15, 8])
plt.style.use("fivethirtyeight")

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Let pandas render up to 500 rows / 500 columns before truncating the display.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.model_selection import StratifiedKFold

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split, TimeSeriesSplit
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss

# Regression
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

In [49]:
import keras
import nltk
import pandas as pd
import numpy as np
import re
import codecs

In [50]:
def metric(y,y0):
    """Score two equal-length sequences as 100 minus their RMS log error.

    Both inputs go through ``np.log1p`` before being compared, so the score
    is exactly 100 when ``y`` and ``y0`` are identical and drops as they
    diverge.

    Args:
        y: Sequence of values.
        y0: Sequence of values, same length as ``y``.

    Returns:
        ``100 - sqrt(mean((log1p(y) - log1p(y0)) ** 2))``.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(y)==len(y0)
    log_gap = np.log1p(y) - np.log1p(y0)
    rms_log_error = np.sqrt(np.mean(np.power(log_gap, 2)))
    return 100 - rms_log_error

def metric_lgb(y_pred,data):
    """Evaluate predictions against the labels stored on ``data`` using ``metric``.

    Args:
        y_pred: Predicted values.
        data: Object exposing ``get_label()`` that returns the true values
            (presumably a LightGBM ``Dataset`` - TODO confirm against the caller).

    Returns:
        A 3-tuple ``('100-rmse', score, True)``. The trailing ``True`` is
        presumably the "higher is better" flag of a LightGBM custom eval
        function - verify where this is passed as ``feval``.
    """
    score = metric(data.get_label(), y_pred)
    return '100-rmse', score, True

In [51]:
# Read the raw train and test CSVs from the current working directory.
train_1, test_1 = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# print("Train Shape : {}\nTest Shape: {}".format(train_1.shape, test_1.shape))

In [52]:
# Work on copies so the frames loaded from disk (train_1 / test_1) stay untouched.
train, test = train_1.copy(), test_1.copy()

In [53]:
train.nunique()

patient_id                    32165
name_of_drug                   2220
use_case_for_drug               636
review_by_patient             30121
effectiveness_rating             10
drug_approved_by_UIC           3537
number_of_times_prescribed      303
base_score                     1383
dtype: int64

In [54]:
# Remove the per-patient identifier column from the training frame.
# `drop(..., errors='ignore')` is used instead of `del` so that re-running this
# cell is a no-op rather than a KeyError once the column is already gone.
train = train.drop(columns=['patient_id'], errors='ignore')

In [55]:
train.head(3)

Unnamed: 0,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,8.022969
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,7.858458
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,6.341969


In [56]:
# count=0
# li=[]
# for i in range(0,len(train['base_score'])+1):
#     if train['number_of_times_prescribed'][i]>50:
# #         print(f"{train['base_score'][i]}:::{train['review_by_patient'][i]}:::{train['number_of_times_prescribed'][i]}")
#         print(f"{train['base_score'][i]}")
#         print("===========================================")
#         if(train['effectiveness_rating'][i]<=8):
#               count+=1

In [57]:
# feartures = train.columns.to_list()
# feartures.remove('review_by_patient')
# for i in feartures:
#     plt.scatter(train[i],train['base_score'])
#     plt.xlabel(xlabel=i)
#     plt.show()

## Standardize

In [58]:
def standardize_test(df,text_field):
    """Clean a free-text column in place and return the cleaned column.

    The following steps are applied in order:
      1. remove anything starting with ``http`` up to the next whitespace,
      2. remove any remaining bare ``http``,
      3. remove ``@`` followed by one or more non-space characters,
      4. replace every character outside ``A-Za-z0-9(),!?@'`"_`` and newline
         with a single space,
      5. replace any remaining ``@`` with ``at``,
      6. lower-case the text.

    Args:
        df: DataFrame holding the text column; the column is overwritten.
        text_field: Name of the string column to clean.

    Returns:
        The cleaned column, i.e. ``df[text_field]`` after the update.
    """
    substitutions = (
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
    )
    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        # regex=True must be explicit: since pandas 2.0 `str.replace` treats the
        # pattern as a literal string by default, which would silently turn
        # every regex step above into a no-op.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    df[text_field] = cleaned.str.lower()
    return df[text_field]
train['review_by_patient'] = standardize_test(train,'review_by_patient')
# train.head()

In [59]:
train.columns

Index(['name_of_drug', 'use_case_for_drug', 'review_by_patient',
       'effectiveness_rating', 'drug_approved_by_UIC',
       'number_of_times_prescribed', 'base_score'],
      dtype='object')

## TFIDF Bag of Words

In [60]:
# Build word-level TF-IDF features from the cleaned review text.
# The variable keeps the name `count_vectorizer` even though it holds a
# TfidfVectorizer, so that any other cell referring to it keeps working.
count_vectorizer = TfidfVectorizer(
    analyzer="word", tokenizer=nltk.word_tokenize,
    preprocessor=None, stop_words='english', max_features=None)

tfidf = count_vectorizer.fit_transform(train['review_by_patient'])

# Vocabulary size. `vocabulary_` is read instead of calling `get_feature_names()`,
# which was removed in scikit-learn 1.2; both report the same number of features.
len(count_vectorizer.vocabulary_)

27178

In [62]:
# NOTE(review): consider moving this import into the top import cell with the
# other scikit-learn imports.
from sklearn.decomposition import TruncatedSVD
# Reduce the TF-IDF matrix to 25 components; random_state fixes the seed of the solver.
svd = TruncatedSVD(n_components=25, n_iter=25, random_state=12)
# One row per review, 25 columns (one per SVD component).
truncated_tfidf = svd.fit_transform(tfidf)

In [63]:
truncated_tfidf

array([[ 0.10143059, -0.07417381,  0.03708342, ..., -0.00696463,
         0.02249146,  0.02989441],
       [ 0.24178098, -0.1367936 , -0.02117877, ..., -0.0293798 ,
        -0.00274419, -0.00902404],
       [ 0.32208506, -0.05752693, -0.09693579, ...,  0.04390738,
        -0.0616126 , -0.03096008],
       ...,
       [ 0.3008517 , -0.10247384,  0.00103718, ..., -0.00585039,
         0.01671101,  0.00881773],
       [ 0.16648196, -0.07651745, -0.00513402, ...,  0.04045764,
         0.00814793,  0.01151589],
       [ 0.12433429, -0.07350523,  0.00203432, ..., -0.05505755,
        -0.05716556,  0.03068133]])

In [64]:
def evaluate_features(X, y, clf=None):
    """General helper for evaluating how useful the passed features are to a classifier.

    Prints log loss and accuracy and draws a confusion-matrix heatmap, all
    computed from out-of-fold predictions of a 3-fold stratified
    cross-validation.

    Args:
        X (array-like): Features array. Shape (n_samples, n_features)

        y (array-like): Labels array. Shape (n_samples,). Must hold discrete
            class labels: the stratified splitter and the default classifier
            do not accept a continuous regression target.

        clf: Classifier to use; it must implement ``predict_proba``.
            If None, a default LogisticRegression is used.

    Returns:
        None. Results are printed and plotted.
    """
    if clf is None:
        clf = LogisticRegression()

    # n_splits is pinned to the documented 3 folds (the library default is no
    # longer 3), and shuffle=True is required for random_state to be accepted:
    # recent scikit-learn raises ValueError when a seed is given without shuffling.
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=8)
    probas = cross_val_predict(clf, X, y, cv=cv,
                               n_jobs=-1, method='predict_proba', verbose=2)

    # Probability columns follow the sorted unique labels, so argmax indexes `classes`.
    classes = np.unique(y)
    preds = classes[np.argmax(probas, axis=1)]

    # Passing labels explicitly keeps the probability columns aligned with the classes.
    print('Log loss: {}'.format(log_loss(y, probas, labels=classes)))
    print('Accuracy: {}'.format(accuracy_score(y, preds)))

    # Confusion matrix drawn with libraries this notebook already imports
    # (scikit-learn + seaborn) rather than the never-imported `skplt`.
    matrix = confusion_matrix(y, preds, labels=classes)
    fig, ax = plt.subplots()
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes, ax=ax)
    ax.set(title='Confusion Matrix', xlabel='Predicted label', ylabel='True label')

In [67]:
evaluate_features(truncated_tfidf, train['base_score'].values.ravel())


NameError: name 'cross_val_predict' is not defined

In [18]:
# NOTE(review): `X_test_counts` is not defined in any cell visible here - confirm
# which (possibly deleted) cell creates it before relying on this line.
# Assigning the single item `X_test_counts[1]` stores the same value in every row
# of `new`, as the frame displayed below shows; presumably a per-row feature was
# intended - TODO confirm.
train['new'] = X_test_counts[1]

In [19]:
train

Unnamed: 0,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score,new
0,Valsartan,Left Ventricular Dysfunction,"""it has no side effect, i take it in combinati...",9,20-May-12,27,8.022969,"(0, 10)\t2\n (0, 369)\t1\n (0, 2095)\t1\n ..."
1,Guanfacine,ADHD,"""my son is halfway through his fourth week of ...",8,27-Apr-10,192,7.858458,"(0, 10)\t2\n (0, 369)\t1\n (0, 2095)\t1\n ..."
2,Lybrel,Birth Control,"""i used to take another oral contraceptive, wh...",5,14-Dec-09,17,6.341969,"(0, 10)\t2\n (0, 369)\t1\n (0, 2095)\t1\n ..."
3,Buprenorphine / naloxone,Opiate Dependence,"""suboxone has completely turned my life around...",9,27-Nov-16,37,6.590176,"(0, 10)\t2\n (0, 369)\t1\n (0, 2095)\t1\n ..."
4,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,28-Nov-15,43,6.144782,"(0, 10)\t2\n (0, 369)\t1\n (0, 2095)\t1\n ..."
...,...,...,...,...,...,...,...,...
32160,Cymbalta,Anxiety,"""i have been taking cymbalta for 15 months now...",9,10-Jun-13,89,6.963020,"(0, 10)\t2\n (0, 369)\t1\n (0, 2095)\t1\n ..."
32161,Nexplanon,Birth Control,"""i have had the nexplanon since dec 27, 2016 ...",6,6-Apr-17,0,0.899076,"(0, 10)\t2\n (0, 369)\t1\n (0, 2095)\t1\n ..."
32162,Venlafaxine,Panic Disorde,"""had panic attacks and social anxiety starting...",9,10-Nov-16,25,6.241812,"(0, 10)\t2\n (0, 369)\t1\n (0, 2095)\t1\n ..."
32163,Fluoxetine,Obsessive Compulsive Disorde,"""i have been off prozac for about 4 weeks now ...",8,21-Jan-15,22,7.940428,"(0, 10)\t2\n (0, 369)\t1\n (0, 2095)\t1\n ..."


In [None]:
# Fit a 500-tree random forest and predict on the test features.
# NOTE(review): X_train_counts, y_train and X_test_counts are not defined in any
# visible cell - confirm they exist on a fresh kernel.
# random_state makes the fitted forest (and therefore the predictions) reproducible
# across runs; n_jobs=-1 only parallelises tree building and does not change results.
modelxg = RandomForestRegressor(n_estimators=500, random_state=12, n_jobs=-1)
modelxg.fit(X_train_counts, y_train)

y_predicted_counts = modelxg.predict(X_test_counts)