In [243]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem  import SnowballStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [244]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/venkatesh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Loading data

In [245]:
df = pd.read_csv("data/ham_spam.txt", sep="\t", names=["label","message"])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [246]:
df.shape

(5572, 2)

In [247]:
def preprocess(document, stem = True):
    document = document.lower()
    words = word_tokenize(document)
    ## Removing stop words
    words = [word for word in words if word not in stopwords.words("english")]
    
    if stem == True:
        words = [SnowballStemmer("english").stem(word) for word in words]
    else:
        words = [WordNetLemmatizer().lemmatize(word) for word in words]
    return " ".join(words)

In [248]:
doc = preprocess(df["message"][0], False)
doc

'go jurong point , crazy .. available bugis n great world la e buffet ... cine got amore wat ...'

In [249]:
df["message"] = df["message"].map(preprocess, True)

In [250]:
df.head()

Unnamed: 0,label,message
0,ham,"go jurong point , crazi .. avail bugi n great ..."
1,ham,ok lar ... joke wif u oni ...
2,spam,free entri 2 wkli comp win fa cup final tkts 2...
3,ham,u dun say earli hor ... u c alreadi say ...
4,ham,"nah n't think goe usf , live around though"


In [251]:
## Creating train test data

In [252]:
from sklearn.model_selection import train_test_split

In [253]:
y = df.pop("label")
X = df

In [254]:
X.head()

Unnamed: 0,message
0,"go jurong point , crazi .. avail bugi n great ..."
1,ok lar ... joke wif u oni ...
2,free entri 2 wkli comp win fa cup final tkts 2...
3,u dun say earli hor ... u c alreadi say ...
4,"nah n't think goe usf , live around though"


In [255]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=45)

In [256]:
X_train.shape

(3900, 1)

In [257]:
X_train["message"]

1201                    otherwis part time job na-tuit ..
77                           like peopl much : ) shi pa .
1346                  que pase un buen tiempo someth like
2753                           sat right ? okay thank ...
3097    walk mom . right stagwood pass right wintersto...
                              ...                        
4473                      3. receiv mobil content . enjoy
580     arngd marriag u r walkin unfortunt snake bite ...
163           'm love . 'm excit day spend . make happi .
4703                                           anytim ...
3616    enjoy watch play footbal basketbal . anyth out...
Name: message, Length: 3900, dtype: object

In [258]:
df["message"].dtype

dtype('O')

In [259]:
y_train.shape

(3900,)

In [260]:
vectorizer = TfidfVectorizer()
tf_idf_model = vectorizer.fit_transform(X_train["message"])

In [261]:
dataframe = pd.DataFrame(tf_idf_model.toarray(), columns = vectorizer.get_feature_names())

In [262]:
dataframe["label"] = y

In [263]:
dataframe.head()

Unnamed: 0,00,000,000pes,008704050406,0089,01223585236,01223585334,0125698789,02,0207,...,zindgi,zoe,zogtorius,zouk,zyada,ãº1,ã¼,ãœ,éˆ,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,spam
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham


In [264]:
from sklearn import svm


In [265]:
svm_model = svm.SVC(C=1000)
svm_model.fit(tf_idf_model, y_train)

SVC(C=1000)

In [266]:
from sklearn.metrics import confusion_matrix
X_test = vectorizer.transform(X_test["message"])
y_pred = svm_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[1446    2]
 [  30  194]]


## Hyperparameter tunning

In [267]:
# conduct (grid search) cross-validation to find the optimal values 
# of cost C and the choice of kernel

from sklearn.model_selection import GridSearchCV

parameters = {'C':[1, 10, 100, 1000], 
             'gamma': [1e-2, 1e-3, 1e-4],
             'kernel':["linear","rbf"]}

# instantiate a model 
svc_grid_search = svm.SVC()

# create a classifier to perform grid search
clf = GridSearchCV(svc_grid_search, param_grid=parameters, scoring='accuracy', n_jobs=-1, verbose=1)

# fit
clf.fit(tf_idf_model, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    6.1s finished


GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [1, 10, 100, 1000],
                         'gamma': [0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'rbf']},
             scoring='accuracy', verbose=1)

In [268]:
# results
cv_results = pd.DataFrame(clf.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.327062,0.013109,0.062805,0.004376,1,0.01,linear,"{'C': 1, 'gamma': 0.01, 'kernel': 'linear'}",0.988462,0.976923,0.973077,0.976923,0.980769,0.979231,0.005217,4
1,0.38235,0.004851,0.111073,0.003313,1,0.01,rbf,"{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}",0.866667,0.866667,0.865385,0.865385,0.865385,0.865897,0.000628,19
2,0.324802,0.012261,0.060432,0.001721,1,0.001,linear,"{'C': 1, 'gamma': 0.001, 'kernel': 'linear'}",0.988462,0.976923,0.973077,0.976923,0.980769,0.979231,0.005217,4
3,0.330709,0.003033,0.086803,0.003775,1,0.001,rbf,"{'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}",0.866667,0.866667,0.865385,0.865385,0.865385,0.865897,0.000628,19
4,0.307995,0.006324,0.060815,0.001358,1,0.0001,linear,"{'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}",0.988462,0.976923,0.973077,0.976923,0.980769,0.979231,0.005217,4
5,0.283454,0.002368,0.081022,0.001877,1,0.0001,rbf,"{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}",0.866667,0.866667,0.865385,0.865385,0.865385,0.865897,0.000628,19
6,0.290827,0.008649,0.057768,0.001727,10,0.01,linear,"{'C': 10, 'gamma': 0.01, 'kernel': 'linear'}",0.987179,0.976923,0.973077,0.971795,0.985897,0.978974,0.006415,7
7,0.368262,0.004678,0.100653,0.003193,10,0.01,rbf,"{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}",0.970513,0.946154,0.952564,0.948718,0.951282,0.953846,0.008619,18
8,0.305834,0.011066,0.058717,0.00388,10,0.001,linear,"{'C': 10, 'gamma': 0.001, 'kernel': 'linear'}",0.987179,0.976923,0.973077,0.971795,0.985897,0.978974,0.006415,7
9,0.390386,0.006926,0.107346,0.003336,10,0.001,rbf,"{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}",0.866667,0.866667,0.865385,0.865385,0.865385,0.865897,0.000628,19


In [270]:
clf.best_params_

{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [271]:
clf.best_score_

0.9802564102564103

## Best params model

In [272]:
best_svm_model = svm.SVC(C=100, gamma=0.01, kernel='rbf')
best_svm_model.fit(tf_idf_model, y_train)

SVC(C=100, gamma=0.01)

In [273]:
y_pred_best_svm = best_svm_model.predict(X_test)
print(confusion_matrix(y_test, y_pred_best_svm))

[[1443    5]
 [  25  199]]


In [280]:
def pred(msg):
    msg = vectorizer.transform([msg])
    prediction = svm_model.predict(msg)
    return prediction[0]

In [281]:
pred("Call me, urgent")

'ham'

In [284]:
import joblib
import pickle

In [278]:
# save the model to disk
filename = 'ham_spam_model.sav'
joblib.dump(model, filename)

['ham_spam_model.sav']

In [288]:
pickle.dump(vectorizer,open("vectorizer.pkl","wb"))

In [289]:
loaded_vectorizer = pickle.load(open("vectorizer.pkl", 'rb'))
loaded_model = joblib.load(filename)

In [296]:
def predict_msg_from_loaded(msg):
    msg = loaded_vectorizer.transform([msg])
    prediction = loaded_model.predict(msg.toarray())
    return prediction[0]

In [299]:
predict_msg_from_loaded("Call me urgent")

'ham'