### Import modules

In [19]:
# Third-party dependencies for the tweet sentiment classifier.
import pandas as pd
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

### Read the dataset

In [2]:
# Load only the first two columns of the tab-separated tweets file.
data = pd.read_csv(
    'tweets.tsv',
    sep='\t',
    usecols=[0, 1],
)

### Split the data into features and labels

In [3]:
# Features are the raw tweet text; labels are the sentiment column.
X = data['SentimentText']
y = data['Sentiment']

### Split the data into train and test sets

In [4]:
# Hold out part of the data for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

### Instantiate the `CountVectorizer`

In [5]:
# Create a CountVectorizer with its default settings (no arguments passed).
vectorizer = CountVectorizer()

### Fit the vectorizer to the training data

In [6]:
# Fit on the training split only (X_train); X_test is not passed here.
# Left as the cell's last expression so the fitted vectorizer's repr is shown.
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

### Create the document-term matrices

In [7]:
# Transform the training text into a document-term matrix using the
# already-fitted vectorizer.
X_train_dtm = vectorizer.transform(X_train)

In [8]:
# Transform the test text with the same fitted vectorizer; only transform()
# is called here, so the vectorizer is not refit on X_test.
X_test_dtm = vectorizer.transform(X_test)

### Build and train a model

In [9]:
# Create a multinomial Naive Bayes classifier with default parameters.
nb = MultinomialNB()

In [10]:
# Train the classifier on the training document-term matrix and its labels.
# Left as the cell's last expression so the fitted model's repr is shown.
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
# Predict class labels for the test document-term matrix.
y_pred_class = nb.predict(X_test_dtm)

In [12]:
# Accuracy of the predicted labels against the held-out test labels.
metrics.accuracy_score(
    y_true=y_test,
    y_pred=y_pred_class,
)

0.75975581926061164

### Tuning the Count Vectorizer

#### Default

In [16]:
# Display the current vectorizer's parameters; it was constructed with no
# arguments, so its repr shows the default settings.
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [20]:
# Rebind `vectorizer` to a new, unfitted CountVectorizer that uses the
# built-in English stop-word list.
# NOTE(review): the following cell rebinds `vectorizer` again, so this
# stop_words setting is not carried into that configuration.
vectorizer = CountVectorizer(stop_words='english')

In [21]:
# Rebind `vectorizer` to a new, unfitted CountVectorizer with
# ngram_range=(1, 2), i.e. unigrams and bigrams.
# NOTE(review): this replaces the stop_words='english' instance from the
# previous cell rather than combining both options.
vectorizer = CountVectorizer(ngram_range=(1, 2))