In [1]:
# 在SVM(在regression的案例中，使用SVR)
# 在SVM中，要求每個feature最好有同樣的scale，所以需要進行標準化，常用的方式是將值標準化為在0到1之間(MinMaxScaler)
# 在SVM中，要提高features的維度，主要有兩種方法：polynomial kernel，或是使用radial basis function(RBF) kernel
# 可以fine tune的參數則是C與gamma
# C就像是ridge function或是lasso function中討論的L2與L1的regularization
# gamma指的是每一個資料點，所影響的範圍，如果gamma越小，則每個點影響範圍越廣，比較gerneralization，趨向underfitting
# C則是越大的話，則模型會趨向複雜，overfitting
# 在SVM裡面，也可以使用dummy variables

# 在ridge function中使用的是alpha參數，當alpha越大時，coefficient會更趨近於0，相反的越小的alpha，則模型更複雜，趨向overfitting
# 而在lasso function中，則需要注意到，當我們將alpha調小時，也需要同時增加max_iter(the maximum number of iterations)的參數才行, 
# 並且在lasso模型中，將alpha調小，是往overfitting的方向趨近
# 一般來說，實務上第一個會嘗試的是ridge function，除非是features數量很多，需要削減，才會使用lasso function

* [step0](#step0): import necessary packages
* [step1](#step1): import `dataset X_remaining50.pickle` as `X_remaining50`

In [1]:
# import necessary packages
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno # module for missing value visualization
from scipy import stats # implement box-cox transformation
from math import ceil
from sklearn.utils import shuffle # shuffling the dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB # for sentiment analysis benchmark model
from sklearn.model_selection import cross_val_score # cross validation score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR

from scipy.stats import uniform
from numpy import flatnonzero # return the index for nonzero value

# Pretty display for notebooks
%matplotlib inline
pd.options.display.max_columns = None # show up all column values in display

# suppress warning
import warnings
warnings.simplefilter("ignore")

# suppress scientific notation
np.set_printoptions(suppress=True)

<a id="step1"></a>
## step1: import dataset `X_remaining50.pickle` as `X_remaining50`

In [2]:
X_remaining50 = pd.read_pickle("X_remaining50.pickle")

<a id="step2"></a>
## step2: select the relevant predictors
1. the target variable is `transformed_score`.
2. based on the correlation matrix showed up in part3 and part4, `Review_Total_Negative_Word_Counts`, `Review_Total_Positive_Word_Counts`, `quarter_transformed_score`, `quarter_previous_transformed_score` have better correlation with `transformed_score`.
3. in order to apply pd.get_dummies(), I first replace the value 0/1 in `bayes_predict_review_sentiment` to string.
4. drop out the NA in the rows, so that I won't face bugs when applying MinMaxScaler()
5. set up the dummies for the dataset.

In [22]:
# select the relevant predictors
cols = ["transformed_score", "Average_Score",
        "Review_Total_Negative_Word_Counts","Review_Total_Positive_Word_Counts",
        "quarter_transformed_score","quarter_previous_transformed_score",
        "quarter_change_rate","bayes_predict_review_sentiment"]

X_remaining_sub = X_remaining50[cols]

In [23]:
# map 0/1 in bayes_predict_review_sentiment to negative/positive sentiment
value_replace = {0:"negative",
                 1:"positive"}

X_remaining_sub["bayes_predict_review_sentiment"] = X_remaining_sub["bayes_predict_review_sentiment"].map(value_replace)


In [24]:
# drop out row contains NA
X_remaining_sub.dropna(inplace=True)

In [25]:
# convert categorical feature into dummies
X_remaining_sub = pd.get_dummies(X_remaining_sub)

<a id="step3"></a>
## step3: shuffle and sampling  the remaining dataset
Use the remaining 2.5% of dataset as training and validation dataset for rating prediction model.

In [26]:
# separate target variable out - transformed_score
target_variable = X_remaining_sub.transformed_score

# drop out the target variable in X dataset
X_train = X_remaining_sub.drop(["transformed_score"], axis=1)

# use the remaining 50% of the whole dataset - use train_test_split() to achieve same result
X_first, X_remaining, y_first, y_remaining = train_test_split(X_train, target_variable,
                                                              test_size = 0.95, random_state=20)

<a id="step4"></a>
## step4: create lasso model as benchmark model

In [27]:
# create a Lasso model
lasso = Lasso()

# set up the parameter range for grid-search
param_grid = {"alpha":[80,50,20,15,10,5,3,2,1,0.5,0.1,0.001],
              "max_iter":[10000,5000,1000,500,100]}

scorer = make_scorer(r2_score, greater_is_better=True)

grid = GridSearchCV(lasso, param_grid=param_grid, scoring=scorer, cv=5)

grid.fit(X_first,y_first)

print(grid.best_score_)


0.39510146385041484


In [28]:
# the argument setting for best estimator
grid.best_estimator_

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [29]:
# the estimator's performance on the remaining 40% dataset
grid.score(X_remaining40, y_remaining40)

0.3920149477267595

In [30]:
# filter out the nonzero coefficients
nonzero_index = flatnonzero(grid.best_estimator_.coef_) # return the index for nonzero coefficients
nonzero_feature_name = X_first10.columns[nonzero_index] # index out the feature name
nonzero_coef = grid.best_estimator_.coef_[nonzero_index] # index out the coefficient's value

In [31]:
# display the coefficients as sorted dataframe
value = nonzero_coef[nonzero_coef.argsort()[::-1]]
feature = nonzero_feature_name[nonzero_coef.argsort()[::-1]]
lasso_coefficient = pd.DataFrame({"feature":feature, "value":value})
display(lasso_coefficient)

Unnamed: 0,feature,value
0,Review_Total_Positive_Word_Counts,1.406859
1,quarter_transformed_score,0.7435925
2,bayes_predict_review_sentiment_positive,9.515553e-13
3,quarter_previous_transformed_score,-0.02598417
4,Review_Total_Negative_Word_Counts,-1.475442
5,quarter_change_rate,-1.595652
6,bayes_predict_review_sentiment_negative,-143.0481


<a id="step5"></a>
## step5: create support vector regression model
1. later I will use Support Vector Machine, it requires to have smimilar scale on all features. For I have dummy variables(0/1), I will use MinMaxScaler() to scale all numeric features into range 0/1 as well. 
2. in the following model evaluation, for the proper use of train and validation dataset in corss-validation, it's better to create a pipeline for it.
3. for each fold of cross-validation, pipeline enables to use the training data within current fold only to create scaler. It avoids data information leakage to validation dataset.
4. for support vector machine, the observations of dataset highly recommended not to go beyond 100,000 rows, the time it takes for training the model goes exponentially.

In [32]:
# create a randomized pipeline
svr_pipe = Pipeline([("scaler", MinMaxScaler()),("svr", SVR())])


# set up the parameter range for grid-search
param_grid = {"svr__C":uniform(0,10), # use distributions insted (only applicable in randomized grid search)
              "svr__gamma":uniform(0,10)}

scorer = make_scorer(r2_score, greater_is_better=True)

random_grid = RandomizedSearchCV(svr_pipe, param_distributions=param_grid, # use param_distributions
                                 scoring=scorer, cv=3, n_iter=5, random_state=20)
random_grid.fit(X_first,y_first)

print(random_grid.best_score_)

0.4163486677776579


In [33]:
random_grid.best_estimator_

Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svr', SVR(C=8.91530729474708, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma=8.15837477307684, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False))])

In [34]:
random_grid.score(X_remaining, y_remaining)

0.41633340843065847

## another version for normalizing features

In [104]:
# scale all variables including target variable
scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

num_cols = ["Review_Total_Negative_Word_Counts","Review_Total_Positive_Word_Counts",
            "quarter_transformed_score","quarter_previous_transformed_score",
            "quarter_change_rate"]

X_first30[num_cols] = scaler.fit_transform(X_first30[num_cols])

y_first30 = target_scaler.fit_transform(y_first30.reshape(-1,1))

y_first30 = y_first30.ravel()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


<a id="step6"></a>
## step6: convert the transformed_score back to original scale
1. have a look at the coefficient on each feature, to grip a sense of how the features influence the response.
2. convert the transfomred_socre back to original scale and see have a look at it.