In [11]:
import joblib

In [1]:
#Write my own transformers
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for older scikit-learn releases that only ship the legacy
    # cross_validation module. Catch ImportError specifically so unrelated
    # failures (e.g. KeyboardInterrupt) are not silently swallowed.
    from sklearn.cross_validation import train_test_split
from sklearn import metrics

from sklearn.pipeline import Pipeline, FeatureUnion, make_union

from sklearn.base import BaseEstimator, TransformerMixin


In [2]:
# Load the tab-separated SMS file; it has no header row, so the two
# columns are named explicitly here.
df = pd.read_csv(
    "data/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["target", "text"],
)

In [3]:
# Features are the raw message text; labels come from the "target" column.
X, y = df["text"], df["target"]

In [4]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import re

class CapitalDocTransformer(BaseEstimator, TransformerMixin):
    """
    Flag documents that are written entirely in capital letters.

    Each input document is mapped to 1 when it contains at least one cased
    character and every cased character is uppercase; otherwise it is
    mapped to 0. Documents with no letters at all (empty strings, digits,
    punctuation only) are therefore mapped to 0.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """
        Compute the all-caps flag for every document in X.

        Parameters
        ----------
        X : iterable of str
            The documents to inspect.
        y : ignored
            Accepted only for signature compatibility.

        Returns
        -------
        numpy.ndarray
            Integer array with one row per document and a single column
            holding 1 (all caps) or 0.
        """
        # str.isupper() is used instead of ``line == line.upper()`` because the
        # latter is also True for strings without any letters (e.g. "" or
        # "123"), which would wrongly mark them as all-capital documents.
        flags = np.array([line.isupper() for line in X], dtype=int)
        # Reshape to a column so the result can be stacked with other features.
        return flags.reshape(-1, 1)

In [6]:
# Try the transformer on the whole text column; the cell displays the
# resulting column of 0/1 flags.
cp_transformer = CapitalDocTransformer()
capital_flags = cp_transformer.fit_transform(X[:])
capital_flags

array([[0],
       [0],
       [0],
       ..., 
       [0],
       [0],
       [0]])

In [7]:
# Build the model: the all-caps flag and the token counts are combined into
# one feature matrix, which is then fed to a logistic regression classifier.
log_reg_model = Pipeline(
    steps=[
        ("features", make_union(CapitalDocTransformer(), CountVectorizer())),
        ("model", LogisticRegression()),
    ]
)

In [8]:
# Fit on the training split, then display the accuracy on the held-out split.
log_reg_model.fit(X_train, y_train).score(X_test, y_test)

0.98504784688995217

In [12]:
# Persist the fitted pipeline to disk.
# NOTE(review): the pipeline contains CapitalDocTransformer, which is defined
# in this notebook; loading the file from another process presumably needs
# that class to be importable there — confirm before reusing the artifact.
joblib.dump(value=log_reg_model, filename="models/spam_ham.pkl")

['models/spam_ham.pkl']

In [14]:
# Reload the saved pipeline to check that it round-trips through disk.
new_model = joblib.load(filename="models/spam_ham.pkl")

In [15]:
# Classify a single unseen message with the reloaded pipeline.
sample_messages = ["can't wait for this election to be over"]
new_model.predict(sample_messages)

array(['ham'], dtype=object)