In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the SMS spam collection; the file is tab-separated, hence sep='\t'.
df = pd.read_csv('../TextFiles/smsspamcollection.tsv', sep='\t')

In [3]:
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [5]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
len(df)

5572

In [8]:
df['label'].unique()

array(['ham', 'spam'], dtype=object)

In [9]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [11]:
# Features are the two numeric columns (presumably message length and
# punctuation count -- confirm against the dataset description);
# the target is the ham/spam label column.
X, y = df[['length', 'punct']], df['label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [13]:
X_train.shape

(3900, 2)

In [14]:
X_test.shape

(1672, 2)

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
# Logistic regression with the solver named explicitly; every other
# hyperparameter is left at the library default.
lr_model = LogisticRegression(solver='lbfgs')

In [17]:
lr_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
from sklearn import metrics

In [19]:
# Predict labels for the held-out rows; `predictions` is reused by the metric cells below.
predictions = lr_model.predict(X_test)

In [20]:
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [21]:
metrics.confusion_matrix(y_test, predictions)

array([[1404,   44],
       [ 219,    5]])

In [22]:
# Label the confusion matrix so rows (true class) and columns (predicted class)
# are readable. It is stored under its own name: rebinding `df` here would
# replace the loaded SMS dataset, which later cells still index by column
# (e.g. df['message']), and break a Restart & Run All.
confusion_df = pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions),
    index=['ham', 'spam'],
    columns=['ham', 'spam'],
)
confusion_df

Unnamed: 0,ham,spam
ham,1404,44
spam,219,5


In [24]:
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

   micro avg       0.84      0.84      0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [25]:
print(metrics.accuracy_score(y_test, predictions))

0.8427033492822966


In [26]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the current training split.
nb_model = MultinomialNB().fit(X_train, y_train)

# Overwrites the previous model's predictions with this model's.
predictions = nb_model.predict(X_test)

# Rows are true labels, columns are predicted labels.
print(metrics.confusion_matrix(y_test, predictions))

[[1438   10]
 [ 224    0]]


In [27]:
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.87      0.99      0.92      1448
        spam       0.00      0.00      0.00       224

   micro avg       0.86      0.86      0.86      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.86      0.80      1672



In [29]:
from sklearn.svm import SVC

In [30]:
# Support vector classifier with gamma='auto', fitted on the current training split.
svc_model = SVC(gamma='auto').fit(X_train, y_train)

# Overwrites the previous model's predictions with this model's.
predictions = svc_model.predict(X_test)

# Rows are true labels, columns are predicted labels.
print(metrics.confusion_matrix(y_test, predictions))

[[1373   75]
 [ 121  103]]


In [32]:
print(metrics.accuracy_score(y_test, predictions))

0.8827751196172249


In [5]:
# Switch from the numeric columns to the raw message text as the only feature;
# the target is still the ham/spam label.
X, y = df['message'], df['label']

# Re-split with a 33% hold-out; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
count_vect = CountVectorizer()

In [9]:
# Learn the vocabulary from the training messages only and convert them
# to a sparse document-term count matrix in one step.
X_train_counts = count_vect.fit_transform(X_train)

In [10]:
X_train_counts

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [20]:
# TfidfTransformer is required by the manual counts -> TF-IDF step in the
# next cells (it was previously never imported, causing a NameError on a
# fresh kernel); TfidfVectorizer is used by the Pipeline cell further down.
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

In [12]:
tfidf_transformer = TfidfTransformer()

In [13]:
# Fit the transformer on the training count matrix and convert it in one step.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [15]:
from sklearn.svm import LinearSVC

In [16]:
# NOTE(review): this standalone estimator is never fitted or used in the
# visible cells; the Pipeline cell below constructs its own LinearSVC().
clf = LinearSVC()

In [17]:
from sklearn.pipeline import Pipeline

In [21]:
# Chain vectorization and classification so raw message strings can be
# passed straight to fit/predict.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

In [22]:
text_clf.fit(X_train, y_train)

  if LooseVersion(joblib_version) < '0.12':


Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [23]:
# X_test holds raw message strings here; the pipeline vectorizes them
# before the classifier predicts.
predictions = text_clf.predict(X_test)

In [27]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [25]:
print(confusion_matrix(y_test, predictions))

[[1586    7]
 [  12  234]]


In [26]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

   micro avg       0.99      0.99      0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [28]:
print(accuracy_score(y_test, predictions))

0.989668297988037


In [29]:
# Spot-check the fitted pipeline on a new raw message; the input is wrapped
# in a list so a single string is passed as one document.
text_clf.predict(["Hey how are you doing?"])

array(['ham'], dtype=object)