In [None]:
import pandas as pd
import numpy as np
# NOTE(review): absolute, machine-specific path — the notebook cannot run on
# another machine without editing this line.
data_file=r"/Users/lalitsachan/Dropbox/PDS V3/Data/SMSSpamCollection.txt"

In [None]:
# The file is read as tab-separated with no header row; the two columns are
# named 'target' and 'message' here.
sd=pd.read_csv(data_file,delimiter='\t',header=None,names=['target','message'])

In [None]:
sd.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
# Module-level helpers read by split_into_lemmas.
lemma = WordNetLemmatizer()
# English stop-word list, held as a set for the membership test in split_into_lemmas.
stop = set(stopwords.words('english'))

In [None]:
def split_into_lemmas(message):
    """Turn one message into a list of lower-cased, stop-word-free lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    appear in the module-level ``stop`` set are discarded and each remaining
    token is passed through ``lemma.lemmatize``.

    Returns the lemmas as a list, in the order the tokens appear.
    """
    tokens = word_tokenize(message.lower())
    return [lemma.lemmatize(token) for token in tokens if token not in stop]

In [None]:
# Hold out 20% of the messages for testing; random_state fixes the shuffle.
sd_train,sd_test=train_test_split(sd,test_size=0.2,random_state=2)

In [None]:
# split_into_lemmas is used as the analyzer, so it produces the tokens directly.
# Integer min_df / max_df are absolute document counts (at least 20, at most 3000).
tfidf= TfidfVectorizer(analyzer=split_into_lemmas,min_df=20,max_df=3000)

In [None]:
# The vectorizer is fitted on the training messages only.
tfidf.fit(sd_train['message'])

In [None]:
train_data=tfidf.transform(sd_train['message'])

In [None]:
# Test messages are transformed with the vectorizer fitted on the training split.
test_data=tfidf.transform(sd_test['message'])

In [None]:
clf=MultinomialNB()

In [None]:
clf.fit(train_data,sd_train['target'])

In [None]:
# Class probabilities for the test message at position 6.
clf.predict_proba(test_data[6,:])

In [None]:
# Class labels, in the column order of the predict_proba output.
clf.classes_

In [None]:
# Raw text of the same test message (position 6 of the test split).
list(sd_test['message'])[6]

## With a scikit-learn Pipeline

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Vectorizer and classifier chained into one estimator, so raw message text
# can be passed straight to fit / predict_proba.
# NOTE(review): the step name is spelled 'classfier'; it is left as-is because
# the name is the key used to address the step.
pipe1=Pipeline([
    ('tfidf',TfidfVectorizer(analyzer=split_into_lemmas,min_df=20,max_df=3000)),
    ('classfier',MultinomialNB())
])

In [None]:
pipe1.fit(sd_train['message'],sd_train['target'])

In [None]:
# The pipeline takes the raw test messages; the tfidf step is applied internally.
pipe1.predict_proba(sd_test['message'])

# Pipeline with Feature Union

In [None]:
# NOTE(review): absolute, machine-specific path — the notebook cannot run on
# another machine without editing this line.
file=r'/Users/lalitsachan/Dropbox/Trainings/EY _ Nov _ 2017/Existing Base.csv'

bd=pd.read_csv(file)

In [None]:
bd.head()

In [None]:
# Number of distinct values per column.
bd.nunique()

In [None]:
# Column dtypes; the selectors defined below pick columns by dtype.
bd.dtypes

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class VarTypeSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps the columns of given dtypes, minus an ignore list.

    Parameters
    ----------
    vartype : dtype selection passed to ``DataFrame.select_dtypes``.
    ignore_var : column labels dropped from the dtype-selected frame.
    """

    def __init__(self,vartype,ignore_var):
        self.vartype=vartype
        self.ignore_var=ignore_var

    def fit(self,x,y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self,X):
        """Return the columns of ``X`` matching ``vartype`` without ``ignore_var``."""
        typed_columns=X.select_dtypes(include=self.vartype)
        return typed_columns.drop(columns=self.ignore_var)

In [None]:
class get_dummies_PipeLineFriendly(BaseEstimator, TransformerMixin):
    """Pipeline-compatible dummy (0/1 indicator) encoder for DataFrame columns.

    ``fit`` records, for every column of the frame it is given, the categories
    whose count is strictly greater than ``freq_cutoff``; of those, the last one
    in ``value_counts`` order is left out. ``transform`` replaces each fitted
    column with one integer indicator column per recorded category, named
    ``<column>_<category>``.

    Parameters
    ----------
    freq_cutoff : a category is kept only if its count exceeds this value.

    Attributes
    ----------
    var_cat_dict : dict mapping each fitted column to its retained categories.
    """

    def __init__(self,freq_cutoff=0):
        self.freq_cutoff=freq_cutoff
        # Exists before fit so the attribute can always be read; fit replaces it.
        self.var_cat_dict={}

    def fit(self,x,y=None):
        """Learn the retained categories of every column in ``x``; returns ``self``."""
        # Build the mapping from scratch: re-fitting on a frame with different
        # columns must not keep entries from an earlier fit, otherwise
        # transform would look for columns that are no longer present.
        var_cat_dict={}
        for col in x.columns:
            k=x[col].value_counts()
            # Keep categories above the cutoff, then leave out the last of them.
            var_cat_dict[col]=k.index[k>self.freq_cutoff][:-1]
        self.var_cat_dict=var_cat_dict
        return self

    def transform(self,x,y=None):
        """Return a copy of ``x`` with each fitted column replaced by its indicators."""
        dummy_data=x.copy()
        for col,cats in self.var_cat_dict.items():
            for cat in cats:
                # str() so that non-string categories or column labels do not
                # raise TypeError when the indicator name is built.
                name=str(col)+'_'+str(cat)
                dummy_data[name]=(dummy_data[col]==cat).astype(int)
            del dummy_data[col]
        return dummy_data

In [None]:
from sklearn.pipeline import Pipeline,FeatureUnion

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 split of the full table; random_state fixes the shuffle.
bd_train,bd_test=train_test_split(bd,test_size=0.2,random_state=2)

In [None]:
# 'Revenue Grid' is used as the target; every other column stays in the feature frames.
x_train=bd_train.drop('Revenue Grid',axis=1)
x_test=bd_test.drop('Revenue Grid',axis=1)
y_train=bd_train['Revenue Grid']
y_test=bd_test['Revenue Grid']

In [None]:
bd.dtypes

In [None]:
# Categorical branch: take the object-dtype columns except 'post_code' and
# 'post_area', then dummy-encode them with a frequency cutoff of 100.
cat_pipe=Pipeline([
    ('cat_var',VarTypeSelector(['object'],ignore_var=['post_code','post_area'])),
    ('dummies',get_dummies_PipeLineFriendly(100))
])

In [None]:
# FeatureUnion combines the output of the categorical branch with the
# int64/float64 columns (without 'REF_NO'); the result feeds LogisticRegression.
pipe2=Pipeline([
    ('features',FeatureUnion([
        ('cat_pipe',cat_pipe),
        ('num_var',VarTypeSelector(['int64','float64'],ignore_var=['REF_NO']))
    ])),
    ('clf',LogisticRegression())
])

In [None]:
pipe2.fit(x_train,y_train)

In [None]:
pipe2.predict_proba(x_test)

## Save Python objects to use later

In [2]:
from sklearn.externals import joblib

In [None]:
joblib.dump(pipe1,'my_model_pipeline.pkl')

## Loading models

In [3]:
import pandas as pd
# sklearn.externals.joblib is not available in newer scikit-learn releases;
# fall back to the standalone joblib package when the bundled copy is missing.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# English stop-word list, held as a set for the membership test in split_into_lemmas.
stop = set(stopwords.words('english'))
from nltk.stem.wordnet import WordNetLemmatizer
# Lemmatizer instance read by split_into_lemmas.
lemma = WordNetLemmatizer()

In [4]:
def split_into_lemmas(message):
    """Turn one message into a list of lower-cased, stop-word-free lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    appear in the module-level ``stop`` set are discarded and each remaining
    token is passed through ``lemma.lemmatize``.

    Returns the lemmas as a list, in the order the tokens appear.
    """
    tokens = word_tokenize(message.lower())
    return [lemma.lemmatize(token) for token in tokens if token not in stop]

In [5]:
mymodel=open('my_model_pipeline.pkl','rb')

In [6]:
pipe=joblib.load(mymodel)

In [7]:
# Two hand-written example messages to score with the loaded pipeline.
my_msg=['I‘m going to try for 2 months ha ha only joking',
        '''Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. 
        Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's''']
# Single-column frame named 'message', the same column name used for training.
my_df=pd.DataFrame({'message':my_msg})

In [8]:
my_df

Unnamed: 0,message
0,I‘m going to try for 2 months ha ha only joking
1,Free entry in 2 a wkly comp to win FA Cup fina...


In [9]:
# One row per message; columns follow the order of pipe.classes_.
pipe.predict_proba(my_df['message'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


array([[0.95931778, 0.04068222],
       [0.01745318, 0.98254682]])

In [10]:
# Class labels, in the column order of the predict_proba output.
pipe.classes_

array(['ham', 'spam'], dtype='<U4')