In [1]:
import sys
import pandas as pd
sys.path.append('../scripts')

In [2]:
# Load the review frame from the HDF store; later cells read its `text` and
# `target` columns.
data = pd.read_hdf('../data/test_train.hdf')

In [3]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [6]:
from string import punctuation

def get_text_and_labels(data):
    """Build the modelling frame: stripped review text plus a sentiment target.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``text`` column of strings and a ``stars`` column.

    Returns
    -------
    pandas.DataFrame
        A new two-column frame (``text``, ``target``) on the same index as
        ``data``. ``target`` is -1 for 1-2 stars, 0 for 3 stars and 1 for
        4-5 stars; any other ``stars`` value yields NaN. ``data`` itself is
        not modified.
    """
    star_to_target = {1: -1, 2: -1, 3: 0, 4: 1, 5: 1}

    # A single translation table deletes every unwanted character in one
    # pass and treats each of them literally (no regex interpretation).
    # NOTE(review): the space is stripped as well, which fuses each review
    # into one unbroken run of characters; the text displayed later in the
    # notebook still contains spaces -- confirm whether ' ' belongs here.
    strip_table = str.maketrans('', '', punctuation + ' ')

    # Work on an explicit copy so the caller's frame is left untouched and
    # no assignment goes through a slice of it.
    labelled = data[['text']].copy()
    labelled['text'] = labelled['text'].str.translate(strip_table)
    # Cast to float so the column dtype is the same whether or not some
    # star values fall outside the mapping (those become NaN).
    labelled['target'] = data['stars'].map(star_to_target).astype(float)
    return labelled

In [9]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
# Persist `data` under key 'df'. The key is passed by keyword because newer
# pandas releases no longer accept it positionally.
# NOTE(review): this writes to the same file the notebook loads `data` from,
# so the stored input is replaced by whatever `data` holds when this cell runs.
data.to_hdf('../data/test_train.hdf', key='df')

# Cleaning the Data

In [13]:
# Peek at the first two review texts.
# NOTE(review): the saved output lists rows 0-16, which `[:2]` cannot produce;
# the output looks stale relative to this cell's code -- re-run to refresh it.
data.text[:2]

0         this place was delicious  my parents saw a rec...
1         cant miss stop for the best fish sandwich in p...
2         this place should have a lot more reviews  but...
3         this place was very good i found out about emi...
4         old schooltraditional mom n pop quality and pe...
5         seen this restaurant on 25 best places in pitt...
6         wonderful reuben  map shown on yelp page is in...
7                                        good fish sandwich
8         after a morning of thrift store hunting a frie...
9         this is a hidden gem no really it took us fore...
10        this would be my local haunt if i were ever to...
11        a great townie bar with tasty food and an inte...
12        alexions has been around forever  the first ti...
13        good for cheap drinks and wings they offer dai...
14        what a cool barrestaurant i will no doubt be v...
15        awesome drink specials during happy hour fanta...
16        good beer selection understaff

In [None]:
# Normalise case so the same word in different capitalisation compares equal.
data['text'] = data['text'].str.lower()

## Transforming the predictor

In [None]:
# Learn the TF-IDF vocabulary from every review and encode them in one step.
vect = TfidfVectorizer()
X = vect.fit_transform(data['text'])

## Finding the best parameterization of SVC

In [None]:
# Candidate settings: unigram-only vs. unigram+bigram features, crossed with
# five values of the SVM regularisation constant C.
params = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    svc__C=[0.01, 0.1, 1, 10, 100],
)

# Pipeline under search: sublinear-TF TF-IDF features feeding a linear SVM.
clf = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(sublinear_tf=True)),
    ("svc", LinearSVC()),
])

# Only configured here; the search itself runs when `gs.fit` is called.
gs = GridSearchCV(estimator=clf, param_grid=params, verbose=2, n_jobs=-1)

In [None]:
# Search the grid on the raw text (the pipeline does its own vectorizing),
# then report the winning pipeline and its score.
gs.fit(data['text'], data['target'])
print(gs.best_estimator_)
print(gs.best_score_)

In [None]:
# First 10,000 rows of the TF-IDF matrix.
# NOTE(review): no cell visible here reads sample_X afterwards -- presumably it
# fed a training cell that was removed; confirm or delete.
sample_X = X[:10000]

In [None]:
# Targets for the same first 10,000 rows as sample_X.
# NOTE(review): not read by any cell visible here -- confirm it is still needed.
sample_y = data.target[:10000]

In [None]:
# Predict on rows 10000-19999, the slice right after the rows held in sample_X.
# NOTE(review): `model` is not defined by any earlier cell; the only visible
# definitions are the random-forest cells near the end of the notebook, so this
# cell fails on Restart & Run All. Presumably a classifier was fit on
# sample_X / sample_y in a cell that has since been deleted -- TODO restore it.
pred_y = model.predict(X[10000:20000])

In [None]:
len(pred_y), len(data.target[10000:20000])

In [None]:
# Score `model` on the same rows 10000-19999 that pred_y was computed for.
# NOTE(review): depends on a `model` that no earlier visible cell defines.
model.score(X[10000:20000], data.target[10000:20000])

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(data.target[10000:20000],pred_y)

In [None]:
# Confusion matrix for rows 10000-19999: true targets first, predictions second,
# so rows index the true label and columns the predicted label.
cm = confusion_matrix(data.target[10000:20000],pred_y)

## Making the Confusion Matrix

In [None]:
%pylab inline
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues,
                          labels=(-1, 0, 1)):
    """Draw a confusion matrix as a colour-coded image on the current figure.

    Parameters
    ----------
    cm : array-like
        Matrix to display; rows are labelled as true classes and columns as
        predicted classes.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colour map used for the cells.
    labels : sequence
        Class labels written on both axes, in matrix order. Defaults to the
        three sentiment targets used in this notebook.

    Returns
    -------
    None
        The plot is drawn through the pyplot state machine.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # Take the tick positions from the labels themselves, so the two always
    # agree and the function no longer reads the notebook-global `data`.
    labels = list(labels)
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
plot_confusion_matrix(cm)

In [None]:
# Divide each row by its total so every row (true class) sums to 1.
cm_normalized = cm.astype(float) / cm.sum(axis=1, keepdims=True)

In [None]:
plot_confusion_matrix(cm_normalized)

# Making the Model work regularly

1. Train the tf-idf vector on the entire dataset
2. Make an imputer that is fit to the entire dataset
3. vectorize the train and the test sets
4. transform the test and the train sets with the `imp.transform` function
5. Build the model
6. Predict the thing

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split

This was unnecessary. It should have been just `fit_transform` on the entire dataset.

In [16]:
# Fit the TF-IDF vocabulary on the entire dataset, before any train/test split,
# and encode every review.
vect = TfidfVectorizer()
X = vect.fit_transform(data['text'])

In [None]:
# Construct the imputer first, then fit it: `imp` must exist before `.fit`
# can be called on it.
# NOTE(review): no later cell visible here calls imp.transform -- confirm the
# imputer is still needed.
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
imp.fit(X)

In [19]:
# Hold out 10% of the reviews; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['target'], test_size=0.1, random_state=123)

In [26]:
# Encode both splits with the vocabulary already fitted in `vect`.
# NOTE(review): X_train / X_test are rebound from raw text to TF-IDF matrices,
# so re-running this cell without re-running the split would hand matrices,
# not text, to the vectorizer.
X_train = vect.transform(X_train)
X_test = vect.transform(X_test)

Should have made sure that the thing worked above and that the function was working alright

In [34]:
# Sanity check: both splits must have the same number of feature columns.
assert X_train.shape[1] ==  X_test.shape[1]

In [35]:
X_train

<891564x540472 sparse matrix of type '<class 'numpy.float64'>'
	with 69215180 stored elements in Compressed Sparse Row format>

# Trying Random Forests

In [36]:
def train_rf_reg(X, y, random_state=123):
    """Fit a random-forest regressor on ``X`` / ``y`` and return it.

    Parameters
    ----------
    X : array-like or sparse matrix
        Feature matrix, one row per sample.
    y : array-like
        Target value for each row of ``X``.
    random_state : int or None
        Seed for the forest's randomness. Defaults to the same seed the
        notebook uses for its train/test split, so repeated runs give the
        same model; pass ``None`` for an unseeded fit.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator.
    """
    rf = RandomForestRegressor(n_estimators=100, max_features='sqrt',
                               n_jobs=-1, min_samples_leaf=4,
                               random_state=random_state)
    return rf.fit(X, y)

In [37]:
from sklearn.externals import joblib

In [38]:
# Fit the forest on the vectorized training split and persist it to
# ../model/rf.pkl; the trailing ';' is there to suppress the cell's output.
# NOTE(review): the saved output ('finished vectorizing' plus a file list) does
# not correspond to this code -- it looks stale; re-run to refresh it.
model = train_rf_reg(X_train, y_train)
joblib.dump(model, '../model/rf.pkl');

finished vectorizing


['../model/rf.pkl',
 '../model/rf.pkl_01.npy',
 '../model/rf.pkl_02.npy',
 '../model/rf.pkl_03.npy',
 '../model/rf.pkl_04.npy',
 '../model/rf.pkl_05.npy',
 '../model/rf.pkl_06.npy',
 '../model/rf.pkl_07.npy',
 '../model/rf.pkl_08.npy',
 '../model/rf.pkl_09.npy',
 '../model/rf.pkl_10.npy',
 '../model/rf.pkl_11.npy',
 '../model/rf.pkl_12.npy',
 '../model/rf.pkl_13.npy',
 '../model/rf.pkl_14.npy',
 '../model/rf.pkl_15.npy',
 '../model/rf.pkl_16.npy',
 '../model/rf.pkl_17.npy',
 '../model/rf.pkl_18.npy',
 '../model/rf.pkl_19.npy',
 '../model/rf.pkl_20.npy',
 '../model/rf.pkl_21.npy',
 '../model/rf.pkl_22.npy',
 '../model/rf.pkl_23.npy',
 '../model/rf.pkl_24.npy',
 '../model/rf.pkl_25.npy',
 '../model/rf.pkl_26.npy',
 '../model/rf.pkl_27.npy',
 '../model/rf.pkl_28.npy',
 '../model/rf.pkl_29.npy',
 '../model/rf.pkl_30.npy',
 '../model/rf.pkl_31.npy',
 '../model/rf.pkl_32.npy',
 '../model/rf.pkl_33.npy',
 '../model/rf.pkl_34.npy',
 '../model/rf.pkl_35.npy',
 '../model/rf.pkl_36.npy',
 '../mod

In [24]:
# Reload the persisted forest rather than retraining it.
# NOTE(review): joblib.load unpickles the file, so only load artifacts this
# notebook wrote itself.
model = joblib.load('../model/rf.pkl')

In [39]:
# Predict targets for the held-out, already-vectorized test rows.
preds = model.predict(X_test)