In [1]:
import pandas as pd
import numpy as np
# Location of the tab-separated SMS spam corpus read in the next cell.
# NOTE(review): absolute, machine-specific Windows path -- point this at your own copy before running.
data_file=r"C:\Users\Ranjith James\Desktop\Desktop 11-25-19\Python\Data\Data\SMSSpamCollection.txt"

In [2]:
# Load the tab-separated corpus; the file has no header row, so the two columns are named here.
sd=pd.read_csv(
    data_file,
    sep='\t',
    header=None,
    names=['target','message'],
)

In [3]:
# Preview the first rows to check that the label and text columns parsed as expected.
sd.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
# Module-level helpers read by split_into_lemmas: one lemmatizer instance and the
# English stop-word list converted to a set for fast membership tests.
lemma = WordNetLemmatizer()
stop = set(stopwords.words('english'))
# stop

In [6]:
def split_into_lemmas(message):
    """Turn a raw message into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased, tokenized with ``word_tokenize``, stripped of every
    token found in the module-level ``stop`` set, and each remaining token is
    passed through ``lemma.lemmatize``.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    list
        Lemmatized tokens in their original order.
    """
    tokens = word_tokenize(message.lower())
    return [lemma.lemmatize(token) for token in tokens if token not in stop]

In [7]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
sd_train,sd_test=train_test_split(sd,test_size=0.2,random_state=2)

In [8]:
# The callable analyzer (split_into_lemmas) does the tokenization; because min_df and max_df
# are integers they are absolute document counts, not proportions.
tfidf= TfidfVectorizer(analyzer=split_into_lemmas,min_df=20,max_df=3000)

In [9]:
# Fit the vectorizer on the training messages only, so the test split does not influence it.
tfidf.fit(sd_train['message'])

TfidfVectorizer(analyzer=<function split_into_lemmas at 0x000002219B109948>,
                binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=3000, max_features=None,
                min_df=20, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [10]:
# Vectorize the training messages with the fitted vectorizer.
train_data=tfidf.transform(sd_train['message'])

In [11]:
# Vectorize the held-out messages with the same fitted vectorizer (no refit).
test_data=tfidf.transform(sd_test['message'])

In [12]:
# Multinomial naive Bayes classifier with its default settings.
clf=MultinomialNB()

In [13]:
# Train on the vectorized training messages against the 'target' labels.
clf.fit(train_data,sd_train['target'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
# Class probabilities for the single test row at position 6 (column order follows clf.classes_).
clf.predict_proba(test_data[6,:])

array([[0.97932328, 0.02067672]])

In [15]:
# Label order of the probability columns shown above.
clf.classes_

array(['ham', 'spam'], dtype='<U4')

In [16]:
# Show the raw text of the test message at position 6 (the row scored above).
sd_test['message'].iloc[6]

'ELLO BABE U OK?'

## With a scikit-learn Pipeline

In [17]:
from sklearn.pipeline import Pipeline

In [18]:
# Same vectorizer + classifier as above, chained so that fit/predict run both steps in one call.
pipe1=Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(analyzer=split_into_lemmas,min_df=20,max_df=3000),
        ),
        (
            'classfier',
            MultinomialNB(),
        ),
    ]
)

In [19]:
# Fit both steps directly on the raw training text and labels.
pipe1.fit(sd_train['message'],sd_train['target'])

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer=<function split_into_lemmas at 0x000002219B109948>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=3000, max_features=None,
                                 min_df=20, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classfier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose

In [20]:
# Score the raw held-out messages; the pipeline applies the vectorizer before the classifier.
pipe1.predict_proba(sd_test['message'])

array([[0.95823681, 0.04176319],
       [0.99030404, 0.00969596],
       [0.99119974, 0.00880026],
       ...,
       [0.94020892, 0.05979108],
       [0.97748664, 0.02251336],
       [0.0141158 , 0.9858842 ]])

# Pipeline with Feature Union

In [None]:
# NOTE(review): absolute, machine-specific path -- point this at your own copy of the CSV before running.
file=r'/Users/lalitsachan/Dropbox/Trainings/EY _ Nov _ 2017/Existing Base.csv'

# Load the CSV used for the FeatureUnion example below.
bd=pd.read_csv(file)

In [None]:
# Preview the first rows of the loaded frame.
bd.head()

In [None]:
# Number of distinct values per column.
bd.nunique()

In [None]:
# Column dtypes; the VarTypeSelector defined below picks columns by dtype.
bd.dtypes

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
class VarTypeSelector(BaseEstimator, TransformerMixin):
    """Select the columns of a DataFrame that have the requested dtype(s).

    Parameters
    ----------
    vartype : str, type, or list of those
        Passed to ``DataFrame.select_dtypes`` as the dtypes to include.
    ignore_var : list of column labels, optional
        Columns to leave out of the selection. Labels that are not among the
        selected columns are skipped instead of raising ``KeyError``.
        ``None`` (the default) drops nothing.
    """

    def __init__(self,vartype,ignore_var=None):
        # Parameters are stored untouched so that get_params/clone round-trip them.
        self.vartype=vartype
        self.ignore_var=ignore_var

    def fit(self,x,y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self,X):
        """Return the columns of ``X`` with the configured dtypes, minus ``ignore_var``."""
        selected=X.select_dtypes(self.vartype)
        if self.ignore_var is None:
            return selected
        # errors='ignore': an excluded label may have a different dtype and so
        # already be absent from `selected`; that must not abort the pipeline.
        return selected.drop(self.ignore_var,axis=1,errors='ignore')

In [None]:
class get_dummies_PipeLineFriendly(BaseEstimator, TransformerMixin):
    """Dummy-encode every column of a DataFrame using categories learned in ``fit``.

    For each column, ``fit`` keeps the values whose count is strictly greater than
    ``freq_cutoff`` and then leaves out the last of them in ``value_counts`` order
    (the least frequent retained value), which acts as the baseline.
    ``transform`` creates one 0/1 column named ``<column>_<category>`` per retained
    category and removes the original column.

    Parameters
    ----------
    freq_cutoff : int, default 0
        A category is kept only if it occurs more than this many times.

    Attributes
    ----------
    var_cat_dict : dict
        Maps each fitted column name to the index of categories that get a dummy.
    """

    def __init__(self,freq_cutoff=0):
        self.freq_cutoff=freq_cutoff
        self.var_cat_dict={}

    def fit(self,x,y=None):
        """Learn, per column of ``x``, which categories get a dummy column."""
        # Build a fresh mapping on every fit so that refitting on a frame with
        # different columns does not leave stale entries behind.
        var_cat_dict={}
        for col in x.columns:
            counts=x[col].value_counts()
            frequent=counts[counts>self.freq_cutoff].index
            # Drop the last retained category to serve as the baseline level.
            var_cat_dict[col]=frequent[:-1]
        self.var_cat_dict=var_cat_dict
        return self

    def transform(self,x,y=None):
        """Return a copy of ``x`` with every fitted column replaced by its dummies."""
        dummy_data=x.copy()
        for col,cats in self.var_cat_dict.items():
            for cat in cats:
                # format() rather than '+' so non-string categories (e.g. ints) work too.
                name='{}_{}'.format(col,cat)
                dummy_data[name]=(dummy_data[col]==cat).astype(int)
            del dummy_data[col]
        return dummy_data

In [None]:
from sklearn.pipeline import Pipeline,FeatureUnion

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
bd_train,bd_test=train_test_split(bd,test_size=0.2,random_state=2)

In [None]:
# Separate the 'Revenue Grid' target from the predictors in both partitions.
y_train=bd_train['Revenue Grid']
y_test=bd_test['Revenue Grid']
x_train=bd_train.drop(columns='Revenue Grid')
x_test=bd_test.drop(columns='Revenue Grid')

In [None]:
# Re-check the column dtypes before choosing which ones each selector should pick up.
bd.dtypes

In [None]:
# Categorical branch: pick the object-dtype columns (minus the two post-code fields),
# then dummy-encode categories that occur more than 100 times.
cat_pipe=Pipeline(
    steps=[
        ('cat_var',VarTypeSelector(vartype=['object'],ignore_var=['post_code','post_area'])),
        ('dummies',get_dummies_PipeLineFriendly(freq_cutoff=100)),
    ]
)

In [None]:
# Join the dummy-encoded categorical branch with the numeric columns (minus REF_NO),
# then feed the combined features to a logistic regression.
pipe2=Pipeline(
    steps=[
        (
            'features',
            FeatureUnion(
                transformer_list=[
                    ('cat_pipe',cat_pipe),
                    ('num_var',VarTypeSelector(vartype=['int64','float64'],ignore_var=['REF_NO'])),
                ]
            ),
        ),
        ('clf',LogisticRegression()),
    ]
)

In [None]:
# Fit the feature branches and the classifier on the training partition.
pipe2.fit(x_train,y_train)

In [None]:
# Class probabilities for the held-out rows.
pipe2.predict_proba(x_test)

## Save Python objects to use later

In [2]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23;
# prefer the standalone joblib package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Persist the fitted text pipeline to disk. The pipeline references split_into_lemmas,
# so that function must be defined again in any session that loads this file.
joblib.dump(pipe1,'my_model_pipeline.pkl')

## Loading models

In [3]:
import pandas as pd
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23;
# prefer the standalone joblib package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Recreate the module-level helpers that split_into_lemmas reads.
stop = set(stopwords.words('english'))
lemma = WordNetLemmatizer()

In [4]:
def split_into_lemmas(message):
    """Turn a raw message into a list of lemmatized, stop-word-free tokens.

    Must be defined before the saved pipeline is loaded: the pipeline's vectorizer
    uses this function as its analyzer.

    The text is lower-cased, tokenized with ``word_tokenize``, stripped of every
    token found in the module-level ``stop`` set, and each remaining token is
    passed through ``lemma.lemmatize``.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    list
        Lemmatized tokens in their original order.
    """
    tokens = word_tokenize(message.lower())
    return [lemma.lemmatize(token) for token in tokens if token not in stop]

In [5]:
# Open the saved pipeline file in binary read mode; the handle is passed to joblib.load next.
mymodel=open('my_model_pipeline.pkl','rb')

In [6]:
# Load the saved pipeline, then always release the file handle opened in the previous cell
# (the original left it open for the lifetime of the kernel).
# NOTE: joblib.load unpickles arbitrary objects -- only load files you created or trust.
try:
    pipe=joblib.load(mymodel)
finally:
    mymodel.close()

In [7]:
# Two hand-written messages to score with the loaded pipeline, wrapped in a frame
# with a 'message' column.
my_msg=['I‘m going to try for 2 months ha ha only joking',
        '''Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. 
        Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's''']
my_df=pd.DataFrame({'message':my_msg})

In [8]:
# Display the two example messages.
my_df

Unnamed: 0,message
0,I‘m going to try for 2 months ha ha only joking
1,Free entry in 2 a wkly comp to win FA Cup fina...


In [9]:
# Score the example messages with the reloaded pipeline (column order follows pipe.classes_).
pipe.predict_proba(my_df['message'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


array([[0.95931778, 0.04068222],
       [0.01745318, 0.98254682]])

In [10]:
# Label order of the probability columns shown above.
pipe.classes_

array(['ham', 'spam'], dtype='<U4')