# Advanced Machine Learning Algorithms

## Nonlinear algorithms

#### SVM for classification

In [1]:
from sklearn.datasets import load_svmlight_file
# Read the ijcnn1 data (svmlight format) and keep only the first `first_rows` samples.
first_rows = 2500
X_train, y_train = load_svmlight_file('ijcnn1.bz2')
X_train = X_train[:first_rows, :]
y_train = y_train[:first_rows]

In [2]:
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVC
# 5-fold cross-validated accuracy of an RBF-kernel SVC on the sub-sample.
hypothesis = SVC(kernel='rbf', random_state=101)
scores = cross_val_score(hypothesis, X_train, y_train, cv=5, scoring='accuracy')
mean_score, std_score = np.mean(scores), np.std(scores)
print ("SVC with rbf kernel -> cross validation accuracy: mean = %0.3f std = %0.3f" %
       (mean_score, std_score))



SVC with rbf kernel -> cross validation accuracy: mean = 0.910 std = 0.001


In [3]:
import pickle
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle that a bare open() call would leak.
with open("covertype_dataset.pickle", "rb") as dataset_file:
    covertype_dataset = pickle.load(dataset_file)
# Keep the first 25,000 samples and shift the labels down by one
# (the later np.bincount call then starts counting at label 0).
covertype_X = covertype_dataset.data[:25000,:]
covertype_y = covertype_dataset.target[:25000] -1

In [4]:
import numpy as np
covertypes = ['Spruce/Fir', 'Lodgepole Pine', 'Ponderosa Pine',
              'Cottonwood/Willow', 'Aspen', 'Douglas-fir', 'Krummholz']
# Report the data set sizes and how often each cover type occurs in the sub-sample.
class_counts = np.bincount(covertype_y)
print ('original dataset:', covertype_dataset.data.shape)
print ('sub-sample:', covertype_X.shape)
print('target freq:', list(zip(covertypes, class_counts)))

original dataset: (581012, 54)
sub-sample: (25000, 54)
target freq: [('Spruce/Fir', 9107), ('Lodgepole Pine', 12122), ('Ponderosa Pine', 1583), ('Cottonwood/Willow', 120), ('Aspen', 412), ('Douglas-fir', 779), ('Krummholz', 877)]


In [5]:
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.svm import LinearSVC
# Shuffled, stratified 3-fold split over the cover-type labels.
cv_strata = StratifiedKFold(covertype_y, n_folds=3, shuffle=True, random_state=101)
# Linear SVM solved in the primal form, with class weights set to 'balanced'.
hypothesis = LinearSVC(dual=False, class_weight='balanced')
scores = cross_val_score(hypothesis, covertype_X, covertype_y, cv=cv_strata, scoring='accuracy')
mean_score, std_score = np.mean(scores), np.std(scores)
print ("LinearSVC -> cross validation accuracy: mean = %0.3f std = %0.3f" %
       (mean_score, std_score))

LinearSVC -> cross validation accuracy: mean = 0.646 std = 0.018


#### SVM for regression

In [6]:
import pickle
from sklearn.preprocessing import scale
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle that a bare open() call would leak.
with open("cadata.pickle", "rb") as cadata_file:
    X_train, y_train = pickle.load(cadata_file)
first_rows = 2000
# Densify the first `first_rows` rows and standardize them with scale().
X_train = scale(X_train[:first_rows,:].toarray())
# Divide the target by 10**4.
y_train = y_train[:first_rows]/10**4.0

In [7]:
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVR
# 3-fold cross-validation of a default SVR, scored with 'mean_absolute_error'.
# NOTE(review): the printed label says "accuracy", but the values are the scorer's
# (negated) mean absolute error.
hypothesis = SVR()
scores = cross_val_score(hypothesis, X_train, y_train, cv=3,scoring='mean_absolute_error')
mean_score, std_score = np.mean(scores), np.std(scores)
print ("SVR -> cross validation accuracy: mean = %0.3f std = %0.3f" %
       (mean_score, std_score))

SVR -> cross validation accuracy: mean = -4.618 std = 0.347


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


#### Tuning SVM

In [8]:
from sklearn.svm import SVC
from sklearn.grid_search import RandomizedSearchCV
# Reload the ijcnn1 data and keep the first `first_rows` samples.
first_rows = 2500
X_train, y_train = load_svmlight_file('ijcnn1.bz2')
X_train = X_train[:first_rows, :]
y_train = y_train[:first_rows]
# Candidate values for the two RBF-SVC hyper-parameters.
search_dict = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.1, 0.01, 0.001, 0.0001],
}
hypothesis = SVC(kernel='rbf', random_state=101)
# Sample 10 parameter combinations and score each with 5-fold CV accuracy.
search_func = RandomizedSearchCV(
    estimator=hypothesis,
    param_distributions=search_dict,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    iid=True,
    refit=True,
    cv=5,
    random_state=101,
)
search_func.fit(X_train, y_train)
print ('Best parameters %s' % search_func.best_params_)
print ('Cross validation accuracy: mean = %0.3f' % search_func.best_score_)



Best parameters {'gamma': 0.1, 'C': 100}
Cross validation accuracy: mean = 0.998


## Ensemble strategies

In [9]:
import pickle
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle that a bare open() call would leak.
with open("covertype_dataset.pickle", "rb") as dataset_file:
    covertype_dataset = pickle.load(dataset_file)
print (covertype_dataset.DESCR)
# Keep the first 15,000 samples; the labels are left as stored (no shift here).
covertype_X = covertype_dataset.data[:15000,:]
covertype_y = covertype_dataset.target[:15000]
covertypes = ['Spruce/Fir', 'Lodgepole Pine', 'Ponderosa Pine',
              'Cottonwood/Willow', 'Aspen', 'Douglas-fir', 'Krummholz']

Forest covertype dataset.

A classic dataset for classification benchmarks, featuring categorical and
real-valued features.

The dataset page is available from UCI Machine Learning Repository

    http://archive.ics.uci.edu/ml/datasets/Covertype

Courtesy of Jock A. Blackard and Colorado State University.



####  Bagging with weak classifiers

In [10]:
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
# Bag 100 one-nearest-neighbour classifiers, each trained on 70% of the
# samples and 70% of the features.
base_learner = KNeighborsClassifier(n_neighbors=1)
hypothesis = BaggingClassifier(base_learner, max_samples=0.7,
                               max_features=0.7, n_estimators=100)
scores = cross_val_score(hypothesis, covertype_X, covertype_y, cv=3,
                         scoring='accuracy', n_jobs=-1)
mean_score, std_score = np.mean(scores), np.std(scores)
print ("BaggingClassifier -> cross validation accuracy: mean = %0.3f std = %0.3f" % 
       (mean_score, std_score))

BaggingClassifier -> cross validation accuracy: mean = 0.795 std = 0.002


####  Random Forests and Extra-Trees

In [11]:
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [12]:
%%time
# 3-fold cross-validated accuracy of a 100-tree random forest.
hypothesis = RandomForestClassifier(n_estimators=100, random_state=101)
scores = cross_val_score(hypothesis, covertype_X, covertype_y,
                         cv=3, scoring='accuracy', n_jobs=-1)
mean_score, std_score = np.mean(scores), np.std(scores)
print ("RandomForestClassifier -> cross validation accuracy: mean = %0.3f std = %0.3f" % 
       (mean_score, std_score))

RandomForestClassifier -> cross validation accuracy: mean = 0.809 std = 0.009
Wall time: 4.75 s


In [13]:
%%time
# 3-fold cross-validated accuracy of a 100-tree extra-trees ensemble.
hypothesis = ExtraTreesClassifier(n_estimators=100, random_state=101)
scores = cross_val_score(hypothesis, covertype_X, covertype_y, cv=3,
                         scoring='accuracy', n_jobs=-1)
mean_score, std_score = np.mean(scores), np.std(scores)
print ("ExtraTreesClassifier -> cross validation accuracy: mean = %0.3f std = %0.3f" %
       (mean_score, std_score))

ExtraTreesClassifier -> cross validation accuracy: mean = 0.821 std = 0.009
Wall time: 4.69 s


In [14]:
import numpy as np
import pickle
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestRegressor
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle that a bare open() call would leak.
with open("cadata.pickle", "rb") as cadata_file:
    X_train, y_train = pickle.load(cadata_file)
first_rows = 2000

# Densify and standardize the first `first_rows` rows; divide the target by 10**4.
X_train = scale(X_train[:first_rows,:].toarray())
y_train = y_train[:first_rows]/10**4.
hypothesis = RandomForestRegressor(n_estimators=300, random_state=101)
scores = cross_val_score(hypothesis, X_train, y_train, cv=3, 
                         scoring='mean_absolute_error', n_jobs=-1)
# NOTE(review): the label below names a classifier and "accuracy", but the model is a
# RandomForestRegressor scored with 'mean_absolute_error'. The string is left unchanged
# so it keeps matching the recorded output; correct it when the notebook is re-run.
print ("RandomForestClassifier -> cross validation accuracy: mean = %0.3f std = %0.3f" %
       (np.mean(scores), np.std(scores)))

RandomForestClassifier -> cross validation accuracy: mean = -4.642 std = 0.514


#### Estimating probabilities from an ensemble

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
# First 15,000 rows for fitting, the following 10,000 rows held out for testing.
covertype_X, covertype_y = (covertype_dataset.data[:15000,:],
                            covertype_dataset.target[:15000])
covertype_test_X, covertype_test_y = (covertype_dataset.data[15000:25000,:],
                                      covertype_dataset.target[15000:25000])
# A plain random forest and a sigmoid-calibrated wrapper around it (5-fold).
hypothesis = RandomForestClassifier(n_estimators=100, random_state=101)
calibration = CalibratedClassifierCV(hypothesis, method='sigmoid', cv=5)

In [16]:
# Fit both models (raw forest first, then the calibrated wrapper), then collect
# their class-probability estimates on the held-out rows.
for fitted_model in (hypothesis, calibration):
    fitted_model.fit(covertype_X,covertype_y)
prob_raw, prob_cal = (fitted_model.predict_proba(covertype_test_X)
                      for fitted_model in (hypothesis, calibration))

In [17]:
# Compare raw and calibrated probabilities for a single cover type.
tree_kind = covertypes.index('Ponderosa Pine')
# Column name fixed from the misspelled 'calibrted'; it is what labels the y axis.
probs = pd.DataFrame(list(zip(prob_raw[:,tree_kind],prob_cal[:,tree_kind])), 
        columns=['raw','calibrated'])
# Columns are addressed by name instead of by position, so the axes are explicit.
plot = probs.plot(kind='scatter', x='raw', y='calibrated', s=64, c='blue', edgecolors='white')

#### Sequences of models: AdaBoost

In [18]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
# 3-fold cross-validated accuracy of a 300-round AdaBoost ensemble.
hypothesis = AdaBoostClassifier(n_estimators=300, random_state=101)
scores = cross_val_score(hypothesis, covertype_X, covertype_y, cv=3,
                         scoring='accuracy', n_jobs=-1)
mean_score, std_score = np.mean(scores), np.std(scores)
print ("Adaboost -> cross validation accuracy: mean = %0.3f std = %0.3f" %
       (mean_score, std_score))

Adaboost -> cross validation accuracy: mean = 0.622 std = 0.006


#### Gradient tree boosting (GTB)

In [19]:
import pickle
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle that a bare open() call would leak.
with open("covertype_dataset.pickle", "rb") as dataset_file:
    covertype_dataset = pickle.load(dataset_file)
# Rows 0-14999: training, 15000-19999: validation, 20000-24999: test.
# Every label vector is shifted down by one.
covertype_X = covertype_dataset.data[:15000,:]
covertype_y = covertype_dataset.target[:15000] -1
covertype_val_X = covertype_dataset.data[15000:20000,:]
covertype_val_y = covertype_dataset.target[15000:20000] -1
covertype_test_X = covertype_dataset.data[20000:25000,:]
covertype_test_y = covertype_dataset.target[20000:25000] -1

In [20]:
import numpy as np
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
# Gradient tree boosting: 50 trees of depth 5, fitted on the training rows.
hypothesis = GradientBoostingClassifier(
    max_depth=5,
    n_estimators=50,
    random_state=101,
)
hypothesis.fit(covertype_X, covertype_y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=101, subsample=1.0, verbose=0,
              warm_start=False)

In [21]:
from sklearn.metrics import accuracy_score
# Score the fitted model on the held-out test rows.
test_predictions = hypothesis.predict(covertype_test_X)
print ('GradientBoostingClassifier -> test accuracy:', 
       accuracy_score(covertype_test_y, test_predictions))

GradientBoostingClassifier -> test accuracy: 0.782


#### XGBoost

In [22]:
from sklearn.datasets import load_svmlight_file
from sklearn.cross_validation import cross_val_score, StratifiedKFold
import pickle
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle that a bare open() call would leak.
with open("covertype_dataset.pickle", "rb") as dataset_file:
    covertype_dataset = pickle.load(dataset_file)
# Cast the labels to int before they are shifted down by one below.
covertype_dataset.target = covertype_dataset.target.astype(int)
# Rows 0-14999: training, 15000-19999: validation, 20000-24999: test.
covertype_X = covertype_dataset.data[:15000,:]
covertype_y = covertype_dataset.target[:15000] -1
covertype_val_X = covertype_dataset.data[15000:20000,:]
covertype_val_y = covertype_dataset.target[15000:20000] -1
covertype_test_X = covertype_dataset.data[20000:25000,:]
covertype_test_y = covertype_dataset.target[20000:25000] -1

In [23]:
import xgboost as xgb
# Multi-class XGBoost model: up to 500 rounds of deep trees with a small learning rate.
hypothesis = xgb.XGBClassifier(
    objective="multi:softprob",
    max_depth=24,
    gamma=0.1,
    subsample=0.90,
    learning_rate=0.01,
    n_estimators=500,
    nthread=-1,
)

# Monitor 'merror' on the validation rows, with early stopping set to 25 rounds.
validation_set = [(covertype_val_X, covertype_val_y)]
hypothesis.fit(covertype_X, covertype_y, eval_set=validation_set,
               eval_metric='merror', early_stopping_rounds=25, verbose=False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.1, learning_rate=0.01, max_delta_step=0,
       max_depth=24, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=-1, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.9)

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Predict once and reuse the result for both metrics; the original ran predict()
# twice on the same rows.
test_predictions = hypothesis.predict(covertype_test_X)
print ('accuracy:', accuracy_score(covertype_test_y, test_predictions))
print (confusion_matrix(covertype_test_y, test_predictions))

  if diff:


accuracy: 0.848
[[1512  288    0    0    0    2   18]
 [ 215 2197   18    0    7   11    0]
 [   0   17  261    4    0   19    0]
 [   0    0    4   20    0    3    0]
 [   1   54    3    0   19    0    0]
 [   0   16   42    0    0   86    0]
 [  37    1    0    0    0    0  145]]


  if diff:


## Dealing with big data

### Four points of view:
* #### *Volume*
* #### *Velocity*
* #### *Variety*
* #### *Veracity*

#### Creating some big datasets as examples

In [25]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the 20 newsgroups posts, shuffled, with headers, footers and quotes removed.
newsgroups_dataset = fetch_20newsgroups(shuffle=True,
                                        remove=('headers','footers','quotes'),random_state=6)
# Word counts use a plain split on single spaces.
words_per_post = [len(post.split(' ')) for post in newsgroups_dataset.data]
print ('Posts inside the data: %s' % np.shape(newsgroups_dataset.data))
print ('Average number of words for post: %0.0f' % np.mean(words_per_post))

Posts inside the data: 11314
Average number of words for post: 206


In [34]:
from sklearn.datasets import make_classification
# Write three synthetic classification data sets (10**5, 10**6 and 10**7 rows) to CSV,
# with the target in the first column followed by the 5 features.
# Expected file sizes: about 14.6 MB, 146 MB and 1.46 GB respectively.
# One loop replaces three copy-pasted stanzas that differed only in the exponent.
for exponent in (5, 6, 7):
    X,y = make_classification(n_samples=10**exponent, n_features=5,
                              n_informative=3, random_state=101)
    D = np.c_[y,X]
    np.savetxt('large_dataset_10__%d.csv' % exponent, D, delimiter=",")
    # Release the arrays before building the next, larger data set.
    del(D, X, y)

In [27]:
import os
# Delete the generated CSV files.
# NOTE: the cells that follow stream 'large_dataset_10__7.csv' and
# 'large_dataset_10__6.csv' from disk, so run this cleanup only after them.
# Missing files are skipped so that re-running the cell does not raise.
for csv_name in ('large_dataset_10__5.csv',
                 'large_dataset_10__6.csv',
                 'large_dataset_10__7.csv'):
    if os.path.exists(csv_name):
        os.remove(csv_name)

#### Scalability with volume

In [35]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
# Out-of-core learning: stream the CSV in chunks of 10,000 rows and update the
# model incrementally with partial_fit.
streaming = pd.read_csv('large_dataset_10__7.csv', header=None, chunksize=10000)
learner = SGDClassifier(loss='log')
minmax_scaler = MinMaxScaler(feature_range=(0, 1))
cumulative_accuracy = list()
for n,chunk in enumerate(streaming):
    # .iloc replaces the deprecated .ix indexer; the file has no header, so column
    # positions and labels coincide (column 0 = target, columns 1+ = features).
    features = chunk.iloc[:,1:].values
    if n == 0:
        # The scaler only ever sees the first chunk.
        minmax_scaler.fit(features)
    X = minmax_scaler.transform(features)
    # Clip values that fall outside the range learned from the first chunk.
    X[X>1] = 1
    X[X<0] = 0
    y = chunk.iloc[:,0]
    # Progressive validation: score each chunk before learning from it,
    # skipping the first 9 chunks.
    if n > 8 :
        cumulative_accuracy.append(learner.score(X,y))
    learner.partial_fit(X,y,classes=np.unique(y))
print ('Progressive validation mean accuracy %0.3f' % np.mean(cumulative_accuracy))

Progressive validation mean accuracy 0.708


#### Keeping up with velocity

#### For Classification

- sklearn.naive_bayes.MultinomialNB
- sklearn.naive_bayes.BernoulliNB
- sklearn.linear_model.Perceptron
- sklearn.linear_model.SGDClassifier
- sklearn.linear_model.PassiveAggressiveClassifier

#### For Regression

- sklearn.linear_model.SGDRegressor
- sklearn.linear_model.PassiveAggressiveRegressor

In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
import pandas as pd
from datetime import datetime
# Incremental learners to compare, keyed by the label used in the report.
classifiers  = {
'SGDClassifier hinge loss' : SGDClassifier(loss='hinge', random_state=101),
'SGDClassifier log loss' : SGDClassifier(loss='log', random_state=101),
'Perceptron' : Perceptron(random_state=101),
'BernoulliNB' : BernoulliNB(),
'PassiveAggressiveClassifier' : PassiveAggressiveClassifier(random_state=101)
}
large_dataset = 'large_dataset_10__6.csv'
for algorithm in classifiers:
    start = datetime.now()
    # Re-open the stream for every learner; chunks of 100 rows.
    # (The original also built a MinMaxScaler here that was never used.)
    streaming = pd.read_csv(large_dataset, header=None, chunksize=100)
    learner = classifiers[algorithm]
    cumulative_accuracy = list()
    for n,chunk in enumerate(streaming):
        # .iloc replaces the deprecated .ix indexer; the file has no header, so
        # column positions and labels coincide.
        y = chunk.iloc[:,0]
        X = chunk.iloc[:,1:]
        # Progressive validation: score each chunk before learning from it,
        # skipping the first 51 chunks.
        if n > 50 :
            cumulative_accuracy.append(learner.score(X,y))
        learner.partial_fit(X,y,classes=np.unique(y))
    elapsed_time = datetime.now() - start
    print (algorithm + ' : mean accuracy %0.3f in %s secs' % 
           (np.mean(cumulative_accuracy),elapsed_time.total_seconds()))

SGDClassifier hinge loss : mean accuracy 0.748 in 51.633 secs
SGDClassifier log loss : mean accuracy 0.740 in 49.498965 secs
Perceptron : mean accuracy 0.674 in 47.902658 secs
BernoulliNB : mean accuracy 0.650 in 52.629885 secs
PassiveAggressiveClassifier : mean accuracy 0.715 in 49.511889 secs


#### Dealing with variety

In [37]:
from sklearn.utils import murmurhash3_32
# Hash a string to a non-negative 32-bit integer with a fixed seed.
hashed_value = murmurhash3_32("something", seed=0, positive=True)
print (hashed_value)

4141842261


In [38]:
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
def streaming():
    """Yield (target, text) pairs from `newsgroups_dataset`, one post at a time."""
    yield from zip(newsgroups_dataset.target, newsgroups_dataset.data)
# Hash the texts into a fixed-size feature space, so no vocabulary has to be built.
hashing_trick = HashingVectorizer(stop_words='english', norm = 'l2', non_negative=True)
learner = SGDClassifier(random_state=101)
texts = list()
targets = list()
for n,(target, text) in enumerate(streaming()):
    texts.append(text)
    targets.append(target)
    # Every 1000 posts, vectorize the buffered texts and learn from them.
    if n % 1000 == 0 and n >0:
        learning_chunk = hashing_trick.transform(texts)
        # From the second chunk on, score the chunk before training on it.
        if n > 1000:
            # The original line ended with a stray comma, which silently turned the
            # score into a 1-tuple; it is now a plain float.
            last_validation_score = learner.score(learning_chunk, targets)
        learner.partial_fit(learning_chunk, targets, classes=[k for k in range(20)])
        texts, targets = list(), list()
print ('Last validation score: %0.3f' % last_validation_score)

Last validation score: 0.723


In [39]:
# Classify an unseen text with the learner trained on the hashed features.
New_text = ['A 2014 red Toyota Prius v Five with fewer than 14K miles. Powered by a reliable 1.8L \
            four cylinder hybrid engine that averages 44mpg in the city and 40mpg on the highway.']
text_vector = hashing_trick.transform(New_text)
print (np.shape(text_vector), type(text_vector))
predicted_label = learner.predict(text_vector)[0]
print ('Predicted newsgroup: %s' % newsgroups_dataset.target_names[predicted_label])

(1, 1048576) <class 'scipy.sparse.csr.csr_matrix'>
Predicted newsgroup: rec.autos


## Deep Learning

In [40]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils
import numpy as np

Using TensorFlow backend.


In [41]:
from keras.datasets import mnist
# load_data() returns a (train, test) pair, each of which is an (images, labels) pair.
train_split, test_split = mnist.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz


In [42]:
# Display the dimensions of the raw training images.
X_train.shape

(60000, 28, 28)

In [43]:
# Pixels per image: product of the second and third dimensions of X_train.
image_rows = X_train.shape[1]
image_cols = X_train.shape[2]
num_pixels = image_rows * image_cols
n_channels = 1
def preprocess(matrix):
    """Insert a channel axis after the first dimension and divide by 255.

    The input is reshaped to (matrix.shape[0], n_channels, matrix.shape[1],
    matrix.shape[2]), cast to float32 and divided by 255.
    """
    target_shape = (matrix.shape[0], n_channels, matrix.shape[1], matrix.shape[2])
    as_float = matrix.reshape(*target_shape).astype('float32')
    return as_float / 255.

In [44]:
# Reshape and rescale both image sets with preprocess().
# NOTE(review): X_train/X_test are overwritten in place, so re-running this cell
# passes already-reshaped arrays back into preprocess() — presumably that fails at
# the reshape; re-run the data-loading cell first.
X_train, X_test = preprocess(X_train), preprocess(X_test)

In [45]:
# Display the shape, dtype and maximum value of the preprocessed training images.
X_train.shape, X_train.dtype, np.max(X_train)

((60000, 1, 28, 28), dtype('float32'), 1.0)

In [46]:
# Convert both label vectors with to_categorical(); the column count of the
# result is used as the number of classes.
# NOTE(review): y_train/y_test are overwritten in place, so this cell is not
# idempotent — re-run the data-loading cell before running it again.
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
num_classes = y_train.shape[1]
y_train.shape

(60000, 10)

In [47]:
def baseline_model():
    """Build and compile a fully connected network with one hidden layer.

    The hidden layer has `num_pixels` ReLU units and the output layer has
    `num_classes` softmax units; both names are read from the notebook namespace.
    """
    # Layers are passed to the constructor in the order they are stacked.
    model = Sequential([
        Flatten(input_shape=(1, 28, 28)),
        Dense(num_pixels, init='normal', activation='relu'),
        Dense(num_classes, init='normal', activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [48]:
def convolution_small():
    """Build and compile a small CNN: one convolution/pooling stage plus a dense head.

    The output layer has `num_classes` softmax units (read from the notebook namespace).
    """
    layer_stack = [
        Convolution2D(32, 5, 5, border_mode='valid', input_shape=(1, 28, 28), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential()
    for layer in layer_stack:
        model.add(layer)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [56]:
def convolution_large():
    """Build and compile a larger CNN: two convolution/pooling stages plus a dense head.

    The output layer has `num_classes` softmax units (read from the notebook namespace).
    """
    # Convolution / pooling stages, followed by dropout.
    feature_layers = [
        Convolution2D(30, 5, 5, border_mode='valid', input_shape=(1, 28, 28), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Convolution2D(15, 3, 3, activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.2),
    ]
    # Flatten, then two hidden dense layers and the softmax output.
    classifier_layers = [
        Flatten(),
        Dense(128, activation='relu'),
        Dense(50, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(feature_layers + classifier_layers)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [57]:
from keras import backend as K
# Set the image dimension ordering to 'th' before any layer is built; the models
# above all declare input_shape=(1, 28, 28).
K.set_image_dim_ordering('th')

# build the models
np.random.seed(101)
models = [('baseline', baseline_model()), 
          ('small', convolution_small()), 
          ('large', convolution_large())]

for name, model in models:
    print("With model:", name)
    # Fit the model, validating on the test split after every epoch
    model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=10,
              batch_size=100, verbose=2)
    # Final evaluation of the model; scores[1] is the accuracy metric
    scores = model.evaluate(X_test, y_test, verbose=0)
    # Label the error with the current model's name; the original printed
    # "Baseline Error" for all three models.
    print("%s Error: %.2f%%" % (name.capitalize(), 100-scores[1]*100))
    print()

With model: baseline
Train on 60000 samples, validate on 10000 samples
Epoch 1/10
 - 13s - loss: 0.2294 - acc: 0.9340 - val_loss: 0.1132 - val_acc: 0.9678
Epoch 2/10
 - 12s - loss: 0.0904 - acc: 0.9734 - val_loss: 0.0829 - val_acc: 0.9736
Epoch 3/10
 - 12s - loss: 0.0567 - acc: 0.9830 - val_loss: 0.0648 - val_acc: 0.9782
Epoch 4/10
 - 12s - loss: 0.0378 - acc: 0.9889 - val_loss: 0.0699 - val_acc: 0.9784
Epoch 5/10
 - 13s - loss: 0.0258 - acc: 0.9927 - val_loss: 0.0775 - val_acc: 0.9770
Epoch 6/10
 - 13s - loss: 0.0202 - acc: 0.9942 - val_loss: 0.0649 - val_acc: 0.9806
Epoch 7/10
 - 13s - loss: 0.0157 - acc: 0.9957 - val_loss: 0.0653 - val_acc: 0.9799
Epoch 8/10
 - 13s - loss: 0.0108 - acc: 0.9971 - val_loss: 0.0638 - val_acc: 0.9821
Epoch 9/10
 - 13s - loss: 0.0097 - acc: 0.9973 - val_loss: 0.0680 - val_acc: 0.9802
Epoch 10/10
 - 13s - loss: 0.0082 - acc: 0.9974 - val_loss: 0.0982 - val_acc: 0.9740
Baseline Error: 2.60%

With model: small
Train on 60000 samples, validate on 10000 sampl

## A peek at Natural Language Processing (NLP)

#### Word tokenization

In [62]:
# Sample sentence, assembled from adjacent string literals.
my_text = ("The sexy job in the next 10 years will be statisticians. "
           "People think I'm joking, but who would've guessed that computer engineers "
           "would've been the sexy job of the 1990s?")
# Naive tokenization: cut on single spaces, so punctuation stays glued to the words.
simple_tokens = my_text.split(' ')
print (simple_tokens)

['The', 'sexy', 'job', 'in', 'the', 'next', '10', 'years', 'will', 'be', 'statisticians.', 'People', 'think', "I'm", 'joking,', 'but', 'who', "would've", 'guessed', 'that', 'computer', 'engineers', "would've", 'been', 'the', 'sexy', 'job', 'of', 'the', '1990s?']


In [63]:
import nltk
# Tokenize the same sentence with NLTK's word tokenizer and print the tokens.
nltk_tokens = nltk.word_tokenize(my_text)
print (nltk_tokens)

['The', 'sexy', 'job', 'in', 'the', 'next', '10', 'years', 'will', 'be', 'statisticians', '.', 'People', 'think', 'I', "'m", 'joking', ',', 'but', 'who', 'would', "'ve", 'guessed', 'that', 'computer', 'engineers', 'would', "'ve", 'been', 'the', 'sexy', 'job', 'of', 'the', '1990s', '?']


In [64]:
from nltk.tokenize import TweetTokenizer
# Tokenizer configured with strip_handles=True and reduce_len=True.
tt = TweetTokenizer(strip_handles=True, reduce_len=True)

tweet = '@mate: I loooooooove this city!!!!!!! #love #foreverhere'
# The last expression is displayed as the cell output.
tt.tokenize(tweet)

[':', 'I', 'looove', 'this', 'city', '!', '!', '!', '#love', '#foreverhere']

#### Stemming

In [65]:
# Explicit import instead of `from nltk.stem import *`, which pulled every public
# name of nltk.stem into the notebook namespace.
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
# Stem every NLTK token of the sample sentence.
print ([stemmer.stem(word) for word in nltk_tokens])

['the', 'sexy', 'job', 'in', 'the', 'next', '10', 'year', 'wil', 'be', 'stat', '.', 'peopl', 'think', 'i', "'m", 'jok', ',', 'but', 'who', 'would', "'ve", 'guess', 'that', 'comput', 'engin', 'would', "'ve", 'been', 'the', 'sexy', 'job', 'of', 'the', '1990s', '?']


#### Word Tagging

In [66]:
import nltk
# Print (token, part-of-speech tag) pairs for the NLTK tokens.
print (nltk.pos_tag(nltk_tokens))

[('The', 'DT'), ('sexy', 'JJ'), ('job', 'NN'), ('in', 'IN'), ('the', 'DT'), ('next', 'JJ'), ('10', 'CD'), ('years', 'NNS'), ('will', 'MD'), ('be', 'VB'), ('statisticians', 'NNS'), ('.', '.'), ('People', 'NNS'), ('think', 'VBP'), ('I', 'PRP'), ("'m", 'VBP'), ('joking', 'VBG'), (',', ','), ('but', 'CC'), ('who', 'WP'), ('would', 'MD'), ("'ve", 'VBP'), ('guessed', 'VBN'), ('that', 'IN'), ('computer', 'NN'), ('engineers', 'NNS'), ('would', 'MD'), ("'ve", 'VBP'), ('been', 'VBN'), ('the', 'DT'), ('sexy', 'JJ'), ('job', 'NN'), ('of', 'IN'), ('the', 'DT'), ('1990s', 'CD'), ('?', '.')]


- take: VB (verb, base form)
- took: VBD (verb, past tense)
- taking: VBG (verb, gerund)
- taken: VBN (verb, past participle)
- take: VBP (verb, non-third-person singular present tense)
- takes: VBZ (verb, third-person singular present tense)

#### Named Entity Recognition (NER)

In [67]:
text = "Elvis Aaron Presley was an American singer and actor. Born in Tupelo, Mississippi, \
when Presley was 13 years old he and his family relocated to Memphis, Tennessee."
# Pipeline: tokenize -> part-of-speech tag -> named-entity chunking.
ner_tokens = nltk.word_tokenize(text)
ner_tagged = nltk.pos_tag(ner_tokens)
chunks = nltk.ne_chunk(ner_tagged)
print (chunks)

(S
  (PERSON Elvis/NNP)
  (PERSON Aaron/NNP Presley/NNP)
  was/VBD
  an/DT
  (GPE American/JJ)
  singer/NN
  and/CC
  actor/NN
  ./.
  Born/VBN
  in/IN
  (GPE Tupelo/NNP)
  ,/,
  (GPE Mississippi/NNP)
  ,/,
  when/WRB
  (PERSON Presley/NNP)
  was/VBD
  13/CD
  years/NNS
  old/JJ
  he/PRP
  and/CC
  his/PRP$
  family/NN
  relocated/VBD
  to/TO
  (GPE Memphis/NNP)
  ,/,
  (GPE Tennessee/NNP)
  ./.)


#### Stopwords

In [68]:
# NOTE: this import rebinds the name `text`, which an earlier cell used for a string.
from sklearn.feature_extraction import text
# scikit-learn's built-in English stop-word collection.
stop_words = text.ENGLISH_STOP_WORDS
print (stop_words)

frozenset({'on', 'would', 'three', 'whereafter', 'here', 'back', 'ever', 'herself', 'sometime', 'therein', 'was', 'meanwhile', 'whatever', 'along', 'too', 'due', 'as', 'became', 'him', 'through', 'that', 'very', 'do', 'in', 'none', 'most', 'above', 'system', 'when', 'beyond', 'one', 'therefore', 'so', 'we', 'amongst', 'sometimes', 'many', 'seeming', 'towards', 'hereupon', 'be', 'someone', 'until', 'often', 'although', 'find', 'without', 'next', 'them', 'hence', 'within', 'a', 'been', 'onto', 'whether', 'alone', 'thence', 'beside', 'cry', 'fifty', 'five', 'formerly', 'this', 'cannot', 'front', 'inc', 'otherwise', 'six', 'sincere', 'or', 'hasnt', 'two', 'somehow', 'first', 'amount', 'keep', 'noone', 'moreover', 'twenty', 'few', 'thereby', 'nothing', 'except', 'indeed', 'any', 'itself', 'me', 'less', 'well', 'latter', 'beforehand', 'it', 'show', 'some', 'hundred', 'thereupon', 'third', 'throughout', 'seems', 'under', 'both', 'namely', 'please', 'while', 'everything', 'herein', 'an', 'at',

In [69]:
from nltk.corpus import stopwords
# Print NLTK's English stop-word list.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [70]:
# Print NLTK's German stop-word list.
print(stopwords.words('german'))

['aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also', 'am', 'an', 'ander', 'andere', 'anderem', 'anderen', 'anderer', 'anderes', 'anderm', 'andern', 'anderr', 'anders', 'auch', 'auf', 'aus', 'bei', 'bin', 'bis', 'bist', 'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die', 'das', 'daß', 'derselbe', 'derselben', 'denselben', 'desselben', 'demselben', 'dieselbe', 'dieselben', 'dasselbe', 'dazu', 'dein', 'deine', 'deinem', 'deinen', 'deiner', 'deines', 'denn', 'derer', 'dessen', 'dich', 'dir', 'du', 'dies', 'diese', 'diesem', 'diesen', 'dieser', 'dieses', 'doch', 'dort', 'durch', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'einig', 'einige', 'einigem', 'einigen', 'einiger', 'einiges', 'einmal', 'er', 'ihn', 'ihm', 'es', 'etwas', 'euer', 'eure', 'eurem', 'euren', 'eurer', 'eures', 'für', 'gegen', 'gewesen', 'hab', 'habe', 'haben', 'hat', 'hatte', 'hatten', 'hier', 'hin', 'hinter', 'ich', 'mich', 'mir', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres', 'euch', 'im'

####  A complete data science example: text classification

In [71]:
# fetch_20newsgroups was imported twice in the original cell; once is enough.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
# Two-class problem: medicine vs. space posts, with headers, footers and quotes removed.
categories = ['sci.med', 'sci.space']
to_remove = ('headers', 'footers', 'quotes')
twenty_sci_news_train = fetch_20newsgroups(subset='train', remove=to_remove, categories=categories)
twenty_sci_news_test = fetch_20newsgroups(subset='test', remove=to_remove, categories=categories)

In [72]:
# The vectorizer is fitted on the training posts only and then applied to the test posts.
tf_vect = TfidfVectorizer()
y_train, y_test = twenty_sci_news_train.target, twenty_sci_news_test.target
X_train = tf_vect.fit_transform(twenty_sci_news_train.data)
X_test = tf_vect.transform(twenty_sci_news_test.data)

In [73]:
# Seeded like the other SGD-based models in this notebook, so that the reported
# accuracy is reproducible from run to run (the original used no random_state).
clf = SGDClassifier(random_state=101)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print ("Accuracy=", accuracy_score(y_test, y_pred))

Accuracy= 0.8848101265822785


In [74]:
def clean_and_stem_text(text):
    """Lower-case and tokenize `text`, drop stop words, stem, and re-join with spaces.

    Uses `nltk.word_tokenize`, `stop_words` and `stemmer` from the notebook namespace.
    """
    stemmed_words = (
        stemmer.stem(word)
        for word in nltk.word_tokenize(text.lower())
        if word not in stop_words
    )
    return " ".join(stemmed_words)
# Apply the cleaning/stemming function to every training and test post.
cleaned_docs_train = list(map(clean_and_stem_text, twenty_sci_news_train.data))
cleaned_docs_test = list(map(clean_and_stem_text, twenty_sci_news_test.data))

In [77]:
# Re-fit the TF-IDF vectorizer and the classifier on the cleaned, stemmed posts.
X1_train = tf_vect.fit_transform(cleaned_docs_train)
clf.fit(X1_train, y_train)
X1_test = tf_vect.transform(cleaned_docs_test)
Y1_pred = clf.predict(X1_test)
cleaned_accuracy = accuracy_score(y_test, Y1_pred)
print ("Accuracy=", cleaned_accuracy)

Accuracy= 0.889873417721519
