# Count Vectorization

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated SMS spam collection into a DataFrame.
df = pd.read_csv(
    './UPDATED_NLP_COURSE/TextFiles/smsspamcollection.tsv',
    sep='\t',
)

In [3]:
# Preview the first rows to check the columns: label, message, length, punct.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


# Count Vectorization

In [4]:
import numpy as np
import pandas as pd

In [5]:
# NOTE(review): this re-reads the same TSV that an earlier cell already loaded
# into `df`; the repeated load looks redundant -- confirm and drop if so.
df = pd.read_csv('./UPDATED_NLP_COURSE/TextFiles/smsspamcollection.tsv',sep='\t')

In [6]:
# Show the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [7]:
# Count missing values per column (all zeros means nothing to impute).
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [8]:
# Class balance: how many messages carry each label (ham vs. spam).
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [10]:
# Feature: the raw message text (vectorized later, not used as-is).
X = df['message']

In [11]:
# Target: the ham/spam label for each message.
y = df['label']

In [12]:
# Hold out 33% of the messages for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Train Count Vectorizer model (METHOD 1)
### Requires more steps

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Create a CountVectorizer with default settings (not yet fitted to any text).
count_vect = CountVectorizer()

Fit the vectorizer to the training data.
This builds the vocabulary and counts how many times each word occurs in every message.

In [36]:
# Two-step alternative, kept for reference (fit, then transform):
#   count_vect.fit(X_train)
#   X_train_counts = count_vect.transform(X_train)

# One step: fit the vectorizer on the training messages and return their
# count matrix in a single call.
X_train_counts = count_vect.fit_transform(X_train)

In [16]:
# Display the matrix summary (it is stored sparse, so only non-zero counts are kept).
X_train_counts

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [17]:
# The training split is one-dimensional: one raw message per entry.
X_train.shape

(3733,)

In [18]:
# Same number of rows as X_train; the columns are the learned vocabulary tokens.
X_train_counts.shape

(3733, 7082)

In [19]:
from sklearn.feature_extraction.text import TfidfTransformer

In [20]:
# Create a TfidfTransformer with default settings; it operates on count matrices.
tfidf_transformer = TfidfTransformer()

In [23]:
# Convert the raw term counts into TF-IDF weights.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [24]:
# TF-IDF re-weights values only, so the shape matches X_train_counts.
X_train_tfidf.shape

(3733, 7082)

## Train Count Vectorizer model (METHOD 2)
### Requires fewer steps (`TfidfVectorizer` combines `CountVectorizer` and `TfidfTransformer`)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Create a TfidfVectorizer with default settings; it takes raw text directly.
vectorizer = TfidfVectorizer()

In [17]:
# Go from raw training text to TF-IDF features in a single call.
# Note: this overwrites the X_train_tfidf produced by the Method 1 cells.
X_train_tfidf = vectorizer.fit_transform(X_train)

In [18]:
from sklearn.svm import LinearSVC

In [19]:
# Linear support vector classifier with default hyperparameters.
# NOTE(review): random_state is left at its default (None, per the repr shown
# after fitting); consider fixing a seed if exact run-to-run reproducibility
# matters -- confirm against the LinearSVC documentation.
clf = LinearSVC()

In [20]:
# Train the classifier on the TF-IDF features and their ham/spam labels.
clf.fit(X_train_tfidf,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [21]:
from sklearn.pipeline import Pipeline

In [22]:
# Pipeline takes a list of (name, estimator) tuples and runs them in order:
# vectorize the raw text first, then classify.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [23]:
# Fit on the raw messages; the pipeline vectorizes them before training the classifier.
text_clf.fit(X_train,y_train)

  if LooseVersion(joblib_version) < '0.12':


Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

## Test Count Vectorizer model

In [24]:
# Predict labels for the held-out raw messages (the pipeline handles vectorization).
predictions = text_clf.predict(X_test)

In [26]:
from sklearn.metrics import confusion_matrix,classification_report

In [27]:
# Rows are the true labels and columns the predicted labels (ham first, then spam).
print(confusion_matrix(y_test, predictions))

[[1586    7]
 [  12  234]]


In [28]:
# Per-class precision, recall and F1 on the held-out messages.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [29]:
from sklearn import metrics

In [30]:
# Overall fraction of test messages that were classified correctly.
metrics.accuracy_score(y_test, predictions)

0.989668297988037

## Test model on some text
Check how the model classifies spam versus non-spam (ham) text.

In [31]:
# An ordinary conversational message; predict() takes a list of strings.
text_clf.predict(["Hi how are you doing today?"])

array(['ham'], dtype=object)

In [32]:
# A promotional, prize-style message.
text_clf.predict([
    "Congratulations! You've been selected as a winner. TEXT WON to 44523 congratulations free winner!!!!"
])

array(['spam'], dtype=object)