# Sentence Analysis

In [2]:
import string
from nltk.corpus import stopwords

In [4]:
# Peek at the first ten entries of NLTK's English stop-word list
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [9]:
# Sample sentence used to demonstrate punctuation and stop-word removal
test_sentence = (
    'This is my first test string. '
    'Wow!! we are doing just fine'
)

In [10]:
# Keep only the characters that are not punctuation. The result is a list of
# single-character strings (spaces included), which is then displayed.
no_punc = list(filter(lambda ch: ch not in string.punctuation, test_sentence))
no_punc

['T',
 'h',
 'i',
 's',
 ' ',
 'i',
 's',
 ' ',
 'm',
 'y',
 ' ',
 'f',
 'i',
 'r',
 's',
 't',
 ' ',
 't',
 'e',
 's',
 't',
 ' ',
 's',
 't',
 'r',
 'i',
 'n',
 'g',
 ' ',
 'W',
 'o',
 'w',
 ' ',
 'w',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 'd',
 'o',
 'i',
 'n',
 'g',
 ' ',
 'j',
 'u',
 's',
 't',
 ' ',
 'f',
 'i',
 'n',
 'e']

In [11]:
# Join the remaining characters back into a single sentence string.
# Note: this rebinds no_punc from a list of characters to a str.
no_punc = ''.join(no_punc)
no_punc

'This is my first test string Wow we are doing just fine'

In [12]:
# Split the punctuation-free sentence into a list of words on whitespace
no_punc.split()

['This',
 'is',
 'my',
 'first',
 'test',
 'string',
 'Wow',
 'we',
 'are',
 'doing',
 'just',
 'fine']

In [14]:
# Drop stop words, matching case-insensitively against NLTK's English list.
# The list is loaded once into a set so the corpus is not re-read for every
# word and each membership test is a hash lookup rather than a linear scan.
english_stopwords = set(stopwords.words('english'))
clean_sentence = [
    word for word in no_punc.split()
    if word.lower() not in english_stopwords
]

In [15]:
# Display the surviving words; original casing is kept because only the
# stop-word comparison is lower-cased
clean_sentence

['first', 'test', 'string', 'Wow', 'fine']

# Load Datasets Using scikit-learn

In [16]:
from sklearn.datasets import load_digits

In [17]:
# Load scikit-learn's bundled handwritten-digits dataset
digits_dataset = load_digits()

In [18]:
# Show the dataset description. As a bare expression this displays the string's
# repr (newlines appear as escaped "\n"); wrap it in print() for readable text.
digits_dataset.DESCR

".. _digits_dataset:\n\nOptical recognition of handwritten digits dataset\n--------------------------------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 1797\n    :Number of Attributes: 64\n    :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n    :Missing Attribute Values: None\n    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n    :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttps://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixel

In [19]:
# Inspect the container type returned by load_digits()
type(digits_dataset)

sklearn.utils.Bunch

In [20]:
# View the feature array (one row per sample; the display is truncated)
digits_dataset.data

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [21]:
# View the target array holding the label for each sample
digits_dataset.target

array([0, 1, 2, ..., 8, 9, 8])

# Bag of Words

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
# Instantiate the count vectorizer with its default settings
vectorizer = CountVectorizer()

In [26]:
# Three short documents forming the toy corpus
doc1, doc2, doc3 = (
    'Hi How are you?',
    'Today is very very very pleasant day we can have so much of fun fun fun',
    'This was an amazing experience',
)

In [27]:
# Collect the documents into a single list to pass to the vectorizer
list_of_docs = [doc1,doc2,doc3]

In [29]:
# Learn the vocabulary from the documents.
# fit() returns the fitted vectorizer itself, so at this point bag_of_words
# refers to the CountVectorizer, not to a matrix of counts.
bag_of_words = vectorizer.fit(list_of_docs)

In [30]:
# Display what fit() returned: the vectorizer's repr, not word counts
bag_of_words

CountVectorizer()

In [31]:
# Encode the documents as token counts using the learned vocabulary;
# this rebinds bag_of_words to the resulting matrix
bag_of_words = vectorizer.transform(list_of_docs)

In [33]:
# Print the stored entries: each line is (document index, vocabulary index) followed by the count
print(bag_of_words)

  (0, 2)	1
  (0, 8)	1
  (0, 9)	1
  (0, 20)	1
  (1, 3)	1
  (1, 4)	1
  (1, 6)	3
  (1, 7)	1
  (1, 10)	1
  (1, 11)	1
  (1, 12)	1
  (1, 13)	1
  (1, 14)	1
  (1, 16)	1
  (1, 17)	3
  (1, 19)	1
  (2, 0)	1
  (2, 1)	1
  (2, 5)	1
  (2, 15)	1
  (2, 18)	1


In [34]:
# Look up the vocabulary index assigned to each of the repeated words so it can
# be matched against the entries with a count of 3 in the printed matrix
for token in ('very', 'fun'):
    print(vectorizer.vocabulary_.get(token))

17
6


In [35]:
# Confirm the container type returned by transform()
type(bag_of_words)

scipy.sparse.csr.csr_matrix

# Pipeline and Grid Search

In [38]:
import pandas as pd
import string
from pprint import pprint
from time import time

In [39]:
# Read the tab-separated spam collection, supplying the two column names explicitly
df_spam_collection = pd.read_csv(
    'SpamCollection',
    sep='\t',
    names=['response', 'messages'],
)

In [40]:
# Preview the first rows of the loaded frame
df_spam_collection.head()

Unnamed: 0,response,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [42]:
#import text processing libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [43]:
#import SGD classifier
from sklearn.linear_model import SGDClassifier

In [45]:
#import gridsearch
from sklearn.model_selection import GridSearchCV

In [46]:
#import pipeline
from sklearn.pipeline import Pipeline

In [47]:
# Define the text-classification pipeline:
#   vect  - raw messages -> token counts
#   tfidf - token counts -> tf-idf weights
#   clf   - linear classifier trained with stochastic gradient descent
# SGD training is randomized, so the seed is fixed (arbitrary value) to make
# repeated runs of the notebook produce the same fitted model.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42)),
])

In [52]:
# Search grid: keys use the "<step name>__<parameter>" form to address a
# pipeline step; here the tf-idf step is tried with and without idf weighting
parameters = {
    'tfidf__use_idf': (True, False),
}

In [53]:
# Run a grid search over `parameters`, fitting the whole pipeline for every
# candidate, and report how long the search took.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1)
print('Performing Grid Search....')
# The label is plain text, so print() it (pprint would show its quoted repr);
# pprint is reserved for the parameter dict, which is the structured object.
print('Parameters :')
pprint(parameters)
t0 = time()
grid_search.fit(df_spam_collection['messages'], df_spam_collection['response'])
# Elapsed wall-clock time of the fit, in seconds
print('Done in %0.3f' % (time() - t0))
print('Done')

Performing Grid Search....
'Parameters :'
{'tfidf__use_idf': (True, False)}
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Done in 2.265
Done
