In [4]:
import pandas as pd
import numpy as np

In [5]:
data = pd.read_csv("../data/ReviewData100k.csv")

In [6]:
filterd_data = data[1:50000]

In [7]:
filterd_data = filterd_data[["text","stars"]]

In [8]:
filterd_data = filterd_data[filterd_data.text.isnull() != True]

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

transformer = TfidfVectorizer()

In [10]:
train_data = transformer.fit_transform(filterd_data["text"])
y = filterd_data["stars"]

## Evaluation metric - rmse

In [11]:
def rmse(pred,labels):
    return np.sqrt(np.mean((pred - labels) ** 2))

## Split train and test data

In [12]:
RANDOM_STATE = 2016

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=0.33, random_state=RANDOM_STATE)

## Baseline model

In [63]:
y_pred_base = [np.mean(y_train)] * len(y_test)

In [65]:
print("Baseline model accuracy: ", rmse(y_pred_base,y_test))


Baseline model accuracy:  1.36607872514


## Linear regression

In [66]:
from sklearn.linear_model import LinearRegression

In [67]:
lr_model = LinearRegression()
lr_model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [68]:
y_pred_lr = lr_model.predict(X_test)

In [None]:
print("Ridge model accuracy: ", rmse(y_pred_lr,y_test))

## Ridge and Lasso regression

In [70]:
from sklearn.linear_model import RidgeCV,LassoCV

In [None]:
ridge_model = RidgeCV(alphas=[0.01,0.05,0.10,0.20,0.50,1])
lasso_model = LassoCV(alphas=[0.01,0.05,0.10,0.20,0.50,1])

In [None]:
ridge_model.fit(X_train,y_train)
lasso_model.fit(X_train,y_train)

In [None]:
y_pred_rm = ridge_model.predict(X_test)
y_pred_lm = lasso_model.predict(X_test)

In [None]:
print("Ridge model accuracy: ", rmse(y_pred_rm,y_test))
print("Lasso model accuracy: ",rmse(y_pred_lm,y_test))

## K nearest neighbour

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
knn_model5 = KNeighborsRegressor(n_neighbors=5)
knn_model10 = KNeighborsRegressor(n_neighbors=10)
knn_model50 = KNeighborsRegressor(n_neighbors=50)
#knn_model100 = KNeighborsRegressor(n_neighbors=100)

In [None]:
knn_model5.fit(X_train,y_train)
knn_model10.fit(X_train,y_train)
knn_model50.fit(X_train,y_train)
#knn_model100.fit(X_train,y_train)

In [None]:
y_pred_knn5 = knn_model5.predict(X_test)
y_pred_knn10 = knn_model10.predict(X_test)
y_pred_knn50 = knn_model50.predict(X_test)
#y_pred_knn100 = knn_model100.predict(X_test)

In [None]:
print("knn model with 5 neighbours accuracy: ", rmse(y_pred_knn5,y_test))
print("knn model with 10 neighbours accuracy: ",rmse(y_pred_knn10,y_test))
print("knn model with 50 neighbours accuracy: ",rmse(y_pred_knn50,y_test))

### Decision trees regressor

In [16]:
from sklearn.tree import DecisionTreeRegressor

In [17]:
dt_model = DecisionTreeRegressor(random_state=RANDOM_STATE)

In [18]:
dt_model.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=2016,
           splitter='best')

In [21]:
y_pred_dt = dt_model.predict(X_test)

In [30]:
print("Decision tree accuracy: ", rmse(y_pred_dt,y_test))

Decision tree accuracy:  1.38044788516


### Random forest regressor

In [23]:
from sklearn.ensemble import RandomForestRegressor

In [34]:
rf_model = RandomForestRegressor(max_depth=4,n_estimators=100,max_features='sqrt',verbose=1,random_state=RANDOM_STATE)

In [35]:
rf_model.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.5s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=2016,
           verbose=1, warm_start=False)

In [36]:
y_pred_rf = rf_model.predict(X_test)

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.5s finished


In [41]:
print("Randomforest with 100 estimators accuracy: ", rmse(y_pred_rf,y_test))

Randomforest with 100 estimators accuracy:  1.32411053188


## Adaboost

In [57]:
from sklearn.ensemble import AdaBoostRegressor

In [58]:
adb_model = AdaBoostRegressor(n_estimators=100,learning_rate=0.01,random_state=RANDOM_STATE,loss="square")

In [59]:
adb_model.fit(X_train,y_train)

AdaBoostRegressor(base_estimator=None, learning_rate=0.01, loss='square',
         n_estimators=100, random_state=2016)

In [60]:
y_pred_adb = adb_model.predict(X_test)

In [61]:
print("Adaboost with 100 estimators accuracy: ", rmse(y_pred_adb,y_test))

Adaboost with 100 estimators accuracy:  1.2483494767


### Gradient boosting

In [50]:
from sklearn.ensemble import GradientBoostingRegressor

In [51]:
gbm_model = GradientBoostingRegressor(n_estimators=100,learning_rate=0.01,
                                      random_state=RANDOM_STATE,max_depth=4,max_features="sqrt")

In [52]:
gbm_model.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=4,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=2016,
             subsample=1.0, verbose=0, warm_start=False)

In [55]:
y_pred_gbm = gbm_model.predict(X_test.todense())

In [56]:
print("GBM with 100 estimators accuracy: ", rmse(y_pred_gbm,y_test))

GBM with 100 estimators accuracy:  1.32058420481
