In [5]:
import nltk

In [6]:
#nltk.download_gui()  # downloaded stop words

In [7]:
# Read the raw file as plain text, one list entry per line (trailing whitespace
# stripped), to eyeball its layout before parsing it properly.
# The `with` block guarantees the file handle is closed once the lines are read.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the notebook so the cell runs on other machines.
with open('/Users/sudhanshukumar/Documents/Development/Machine Learning/0 csv files/SMSSpamCollection.csv') as raw_file:
    messages = [line.rstrip() for line in raw_file]
print(len(messages))

FileNotFoundError: [Errno 2] No such file or directory: '/Users/sudhanshukumar/Documents/Development/Machine Learning/0 csv files/SMSSpamCollection.csv'

In [None]:
# Preview the first three raw lines together with their position in the list;
# the extra newlines leave visual space between entries.
for idx, raw_line in enumerate(messages[:3]):
    print(idx, raw_line, end='\n\n\n')

**Due to the spacing we can tell that this is a TSV ("tab separated values") file**

In [None]:
import pandas as pd

In [None]:
# Load the same file as a DataFrame: fields are split on tabs and the two
# column names are supplied explicitly. This rebinds `messages` from the list
# of raw lines to a DataFrame.
sms_path = "/Users/sudhanshukumar/Documents/Development/Machine Learning/0 csv files/SMSSpamCollection.csv"
messages = pd.read_csv(sms_path, sep='\t', names=["label", "message"])

In [None]:
# Peek at the first rows to check how the "label" and "message" columns were parsed.
messages.head()

In [None]:
messages.describe()    # summary statistics; author's observation: a lot of messages are repeated

In [None]:
# Same summary, computed separately for each value of "label".
messages.groupby("label").describe()

In [None]:
# Feature engineering: "length" holds the number of characters in each message
# (len() counts every character, not only letters).
message_lengths = messages["message"].map(len)
messages["length"] = message_lengths
messages.head()

## Data Visualisation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Distribution of message lengths over the whole dataset, as a 50-bin histogram.
messages['length'].plot.hist(bins=50, figsize=(12, 4))

In [None]:
messages.length.describe()   # check the "max" entry for the longest message length

In [None]:
# The longest message could be an outlier, so display its full text.
# The maximum length is computed from the data rather than hard-coded, so the
# cell keeps working if the dataset changes; .iloc[0] picks the first message
# with that length.
longest_length = messages['length'].max()
messages[messages['length'] == longest_length]['message'].iloc[0]

In [None]:
# Histogram of "length" on the x-axis, drawn separately for each value of "label"
messages.hist(column='length', by='label', bins=50,figsize=(12,4))

# Author's reading of the resulting plots: spam messages (roughly 130-170 characters)
# tend to be longer than ham messages (roughly 10-50 characters)

## Text Pre-processing

In [None]:
# Worked example of punctuation removal on a single hand-written sentence.

import string

mess = 'Sample message! Notice: it has punctuation.'

# Keep only the characters that are not punctuation marks and stitch them
# straight back into one string.
nopunc = ''.join(ch for ch in mess if ch not in string.punctuation)
nopunc

In [None]:
# Split the punctuation-free sample on whitespace into a list of words.
nopunc.split()

In [None]:
from nltk.corpus import stopwords
stopwords.words('english')[0:10] # Show the first ten entries of the English stop-word list

In [None]:
# Keep a word only if its lower-cased form is not an English stop word.
# The stop-word list is loaded once, outside the comprehension, and stored in a
# set so each membership test is a constant-time lookup instead of a fresh
# stopwords.words() call per word.
english_stopwords = set(stopwords.words("english"))

clean_mess = [word for word in nopunc.split() if word.lower() not in english_stopwords]
clean_mess

In [None]:
# Create a func to carry out this task

def text_process(mess, stop_words=None):
    """Strip punctuation and stop words from a message.

    Parameters
    ----------
    mess : str
        The raw message text.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, the NLTK English stop-word
        list (``stopwords.words("english")``) is used.

    Returns
    -------
    list of str
        The remaining whitespace-separated words, in their original order and
        casing.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # Materialise the stop words once per call: a set gives constant-time
    # lookups and avoids re-loading the list for every single word.
    stop_words = set(stop_words)

    # Drop every punctuation character, then rebuild the string.
    nopunc = ''.join(char for char in mess if char not in string.punctuation)

    # Case-insensitive stop-word filter; the kept words retain their casing.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [None]:
messages["message"].head(3).apply(text_process)  # each of the first three messages becomes a list of words

## Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 

# Fit a bag-of-words vectorizer on every message, using text_process as the
# analyzer that turns a raw message into its list of tokens.
bow_transformer=CountVectorizer(analyzer=text_process).fit(messages["message"])

In [None]:
# Number of entries in the fitted vocabulary.
print(len(bow_transformer.vocabulary_))

In [None]:
# Pull out the message stored under index label 3 (the fourth row) as a sample.
message4 = messages.loc[3, 'message']
print(message4)

In [None]:
# Vectorize the single sample message; it is wrapped in a list because
# transform() is given a collection of documents.
bow4 = bow_transformer.transform([message4])
print(bow4)
print(bow4.shape)

#### This means that there are seven unique words in message number 4 (after removing common stop words). Two of them appear twice, the rest only once. Let's go ahead and check and confirm which ones appear twice:



In [None]:
# Look up the vocabulary terms stored at feature indices 4068 and 9554.
# NOTE(review): these indices were presumably read off the printed bow4 output
# and depend on the fitted vocabulary — confirm them after any re-fit.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# get_feature_names_out() and only fall back on older versions that lack it.
# The name array is built once and indexed twice.
if hasattr(bow_transformer, "get_feature_names_out"):
    feature_names = bow_transformer.get_feature_names_out()
else:
    feature_names = bow_transformer.get_feature_names()
print(feature_names[4068])
print(feature_names[9554])

#### Now we can use .transform on our Bag-of-Words (bow) transformed object and transform the entire DataFrame of messages

In [None]:
# Bag-of-words counts for every message, produced by the already-fitted vectorizer.
messages_bow = bow_transformer.transform(messages['message'])

In [None]:
# Dimensions of the bag-of-words matrix.
messages_bow.shape

## Note:
**TfidfVectorizer is applied directly to raw text documents, while TfidfTransformer is applied to an existing count matrix, such as the one returned by CountVectorizer.**

**With TfidfTransformer you first compute word counts using CountVectorizer, then compute the IDF values, and only then compute the TF-IDF scores. With TfidfVectorizer all three steps are done at once.**

In [None]:
 #TF-IDF
    
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF transformer on the bag-of-words counts of all messages.
tfidf_transformer= TfidfTransformer()
tfidf_transformer.fit(messages_bow)

In [None]:
# Sample: apply the fitted TF-IDF transformer to the single-message count vector bow4
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

# tfidf4 now holds the TF-IDF weights for the terms counted in bow4

In [None]:
# Final pre-processing step: TF-IDF weights for the bag-of-words counts of all messages

messages_tfidf = tfidf_transformer.transform(messages_bow) 

In [None]:
# Features are the TF-IDF matrix; targets are the "label" column.
X, y = messages_tfidf, messages["label"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state=42 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train a linear support-vector classifier on the training split

from sklearn.svm import LinearSVC

# C=4 is the regularization setting chosen by the author
spam_detect_model= LinearSVC(C=4)
spam_detect_model.fit(X_train,y_train)

#### use train test split as a good practice

In [None]:
# Score the trained model on the held-out test split.
spam_detect_model.score(X_test,y_test)

In [None]:
# Predict a label for every message, then show the prediction at position 3.
all_predictions = spam_detect_model.predict(messages_tfidf)
all_predictions[3]

In [None]:
# True label of the message at index 3, for comparison with the prediction.
messages["label"][3]

## Using just CountVectorizer without analyzer

In [None]:
# Baseline: raw token counts from CountVectorizer's default analyzer (no custom
# text_process, no TF-IDF step) fed straight into a default LinearSVC.
# Note that this cell rebinds X, y and the train/test split variables.
cv = CountVectorizer()

y = messages["label"]
X = cv.fit_transform(messages["message"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so clf is the trained classifier.
clf = LinearSVC().fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Split the raw message text (not the vectorized features) so the pipeline can
# learn its vocabulary from the training portion only.
# random_state is fixed, matching the other splits in this notebook, so the
# split and every score derived from it are reproducible across runs.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'], messages['label'], test_size=0.2, random_state=42
)

#### We will use SciKit Learn's pipeline capabilities to store a pipeline of workflow. This will allow us to set up all the transformations that we will do to the data for future use. Let's see an example of how it works:

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Create the pipeline object (the important part): each step is a (name, estimator) pair
# applied in order — token counts, then TF-IDF weighting, then the classifier.
# TfidfVectorizer can also be used directly to reduce steps

pipeline= Pipeline([
    ("bow",CountVectorizer(analyzer=text_process)),
    ("tfidf",TfidfTransformer()),
    ("classifier",LinearSVC(C=6)),
])

In [None]:
# Fit every pipeline step on the raw training messages and their labels.
pipeline.fit(msg_train,label_train)

In [None]:
# Score the fitted pipeline on the held-out messages
# (author's note: improved performance with LinearSVC())

pipeline.score(msg_test,label_test)

In [None]:
from sklearn.metrics import classification_report

# Predicted labels for the held-out messages.
predictions=pipeline.predict(msg_test)
# classification_report expects (y_true, y_pred): the true labels go first,
# otherwise precision and recall are reported swapped for each class.
print(classification_report(label_test, predictions))

In [None]:
import joblib

In [None]:
# Persist the pipeline's test-set predictions to the file 'predictions_joblib'.
joblib.dump(predictions,'predictions_joblib')

In [None]:
# Persist the CountVectorizer fitted in the "just CountVectorizer" section.
# NOTE(review): this is the baseline vectorizer, not the TF-IDF pipeline — confirm that is intended.
joblib.dump(cv,'cv_joblib')

In [None]:
# Persist the LinearSVC trained on the plain CountVectorizer features.
# NOTE(review): this is the baseline classifier, not the TF-IDF pipeline — confirm that is intended.
joblib.dump(clf,'clf_joblib')