In [14]:
# Import required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer # help to convert text into numbers

from sklearn.model_selection import train_test_split # for splitting dataset

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report


In [4]:
# Load dataset and see first 5 row in dataset.
df=pd.read_csv('SMSSpamCollection',sep='\t',header=None,names=['label','text'],encoding='latin-1')
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Since Machine learning model does not read text directly, we need to convert text into numbers by using **TF-IDF,CountVectorizer and Bag of words**

In [6]:
# Instantiate converter
vectorizer=TfidfVectorizer(stop_words='english')

# Make feature matrix by signing it into X and fit vectorizer.
X=vectorizer.fit_transform(df['text'])


In [12]:
# Convert label into dummy value
y=df['label'].map({'ham':0,'spam':1})

y.shape

(5572,)

The most common models used  for spam are the following **Naive Bayes(best for text),Logistic regression,SVM,Random Forest** but for me I am going to apply **Naive Bayes and Logistic regression**

# Splitting feature matrix X and target vector y

In [15]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

# Train models

In [16]:
# Instantiate models
Naive_Bayes=MultinomialNB()
Logistic_Regression=LogisticRegression()


In [44]:
def model_Train(model,X_train,y_train,name='Naive bayes'):
  if name == 'Naive':
    model=model.fit(X_train,y_train)
    print('Fit Naive bayes is successful')
  
  elif name == 'Logistic':
    model=model.fit(X_train,y_train)
    
    print('fit Logistic is successful')
    

  else:
    print('There is no specific model defined')
  return model
  
  


# Evaluating the model

In [68]:
def evaluation_model(model,X_test,y_test,model_name='Logistic'):
  
  if model_name== 'Logistic':
    y_preds=model.predict(X_test)

    print('Training Accuracy for Logistic',round(model.score(X_train,y_train),3))

    print('The accuracy of Logistic:' ,round(accuracy_score(y_test,y_preds),3))

    print('The report of Logistic :',classification_report(y_test,y_preds))
    
  elif model_name == 'Naive':
    y_preds=model.predict(X_test)

    print('Training Accuracy for Naive',round(model.score(X_train,y_train),3))

    print('The accuracy of Naive Bayes:' ,round(accuracy_score(y_test,y_preds),3))
    print('The report of Naive Bayes :',classification_report(y_test,y_preds))
    

# Calling functions

### 1. For Naive Bayes both train and evaluation 

In [69]:
model1=model_Train(Naive_Bayes,X_train,y_train,name='Naive')

Fit Naive bayes is successful


In [70]:
evaluation_model(model1,X_test,y_test,model_name='Naive')



Training Accuracy for Naive 0.979
The accuracy of Naive Bayes: 0.979
The report of Naive Bayes :               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



### 2. For Logistic Regression both train and evaluation 

In [71]:
model2=model_Train(Logistic_Regression,X_train,y_train,name='Logistic')

fit Logistic is successful


In [72]:
evaluation_model(model2,X_test,y_test,model_name='Logistic')

Training Accuracy for Logistic 0.968
The accuracy of Logistic: 0.959
The report of Logistic :               precision    recall  f1-score   support

           0       0.95      1.00      0.98       966
           1       1.00      0.69      0.82       149

    accuracy                           0.96      1115
   macro avg       0.98      0.85      0.90      1115
weighted avg       0.96      0.96      0.96      1115



#### Let test the classifier with my own message

In [75]:
msg=['Congratulations! You have won 1,000,000 dollars!']
msg_vec=vectorizer.transform(msg)
print('Prediction for Naive bayes: ',model1.predict(msg_vec))

print('Prediction for Logistic: ',model2.predict(msg_vec))

Prediction for Naive bayes:  [1]
Prediction for Logistic:  [0]


In [76]:
email=['Dear Prof. Regis, I am thrilled to tell you that you have done thesis research well. So, you are invited to take your award this friday at college headquarter. Thank you. Your Sincerely. BIZUMUREMYI']

email_vec=vectorizer.transform(email)

print('Prediction for Naive bayes: ',model1.predict(email_vec))

print('Prediction for Logistic: ',model2.predict(email_vec))


Prediction for Naive bayes:  [0]
Prediction for Logistic:  [0]
