# SPAM-HAM classification of SMS and Emails<br>
<br>
Importing basic libraries to access and manage our Dataset<br>
Pandas used to import our data from tsv file into a dataframe

In [1]:
import numpy as np
import pandas as pd

## Importing Our Dataset

In [5]:
from sklearn.model_selection import train_test_split
dataset = pd.read_csv('smsspamcollection.tsv', sep='\t')

## Our dataset contains Label, Message , Length of String & Punctuations

In [6]:
dataset.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


## Dividing our data into Feature data and Label data
- We take 'length' and 'punct' as our feature data(X).
- Our label data(y) will be 'label'

In [9]:
X = dataset[['length','punct']]
y = dataset['label']

## Import train_test_split from sklearn.model_selection
- Dividing our data into training and test set.
- Taking 30% of our data into test set.

In [13]:
from sklearn.model_selection import train_test_split 

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state = 42)

## Training model using Logistic Regression 
- Importing LogisticRegression from sklearn.linear_model
- Fitting our training set to the model
- Using Lbfgs solver to handle multinomial problem
- Multinomial logistic regression is a classification method that generalizes logistic regression to multiclass problems, i.e. with two or more than two possible discrete outcomes.

In [15]:
from sklearn.linear_model import LogisticRegression

In [18]:
model = LogisticRegression(solver = "lbfgs")

In [19]:
model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

## Using our test data set to predict outcomes from our model trained on training dataset
- Using our X_test variable to check predict the values of 'label'.

In [20]:
y_pred = model.predict(X_test)

- Creating a confusion metrics to calculate how our model performed in classifying the test data.
- importing metrics from sklearn

In [24]:
from sklearn import metrics

In [29]:
pd.DataFrame(metrics.confusion_matrix(y_test,y_pred), index = ['ham','spam'], columns=['ham','spam'])

Unnamed: 0,ham,spam
ham,1404,44
spam,219,5


### Now we'll calculate the precision, recall, f1-score , support and accuracy of our model

In [31]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

   micro avg       0.84      0.84      0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [36]:
print(f"Acuracy Score : {metrics.accuracy_score(y_test,y_pred).round(4) * 100}%" )

Acuracy Score : 84.27%


## Now using Navie Bayes Multinomial Model to see how it performs

In [40]:
from sklearn.naive_bayes import MultinomialNB
model_nb = MultinomialNB()
model_nb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [42]:
y_pred = model_nb.predict(X_test)
pd.DataFrame(metrics.confusion_matrix(y_test,y_pred), index = ['ham','spam'], columns=['ham','spam'])

Unnamed: 0,ham,spam
ham,1438,10
spam,224,0


In [43]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.87      0.99      0.92      1448
        spam       0.00      0.00      0.00       224

   micro avg       0.86      0.86      0.86      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.86      0.80      1672



In [44]:
print(f"Acuracy Score : {metrics.accuracy_score(y_test,y_pred).round(4) * 100}%" )

Acuracy Score : 86.0%


## Now using SVM to see how it performs

In [49]:
from sklearn.svm import SVC

In [50]:
model_svc = SVC(gamma='auto')
model_svc.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [52]:
y_pred = model.predict(X_test)
pd.DataFrame(metrics.confusion_matrix(y_test,y_pred), index = ['ham','spam'], columns=['ham','spam'])

Unnamed: 0,ham,spam
ham,1404,44
spam,219,5


In [53]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

   micro avg       0.84      0.84      0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [54]:
print(f"Acuracy Score : {metrics.accuracy_score(y_test,y_pred).round(4) * 100}%" )

Acuracy Score : 84.27%


# Problem with this Classification

- Applying these models dosen't give us an accurate prediction
- It classifies the data on the basis of length and punctations in the text which can give us false positives because an message can be a HAM and still be formal as seen in the models above

## Solution
- In order to solve the problem we use text classification to extract features from the text in order to classify it better.

# Using Term Frequency Inverse Document Frequency (TD-IDF)

- Basically what it does covert our text to matrix form in order to extract features from it and input that matrix into out machine learning model
- This functions count the frequnecy of each word in the text and inverses it : The reseaon to inverse the term freqeuncy is to diminish the weights of the words that occur too freqeuntly such as 'a','the' in the document and increase the weight of the terms that occur rarely.
- TD-IDF helps us understand the significance of a word in an entire corpus rather than relative to a document.

## Creating new X,y for Text Classification from our previous dataset

In [55]:
X = dataset['message']
y = dataset ['label']

## Splitting our data into Test and Training sets

In [56]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state = 42)

## We'll create a pipeline
- This will covert our entire text to TF-IDF in Vectorised format
- Then it will perform Linear SVC on our trainig set(X_train)

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [62]:
text_clf = Pipeline([('text_idf',TfidfVectorizer()), ('clf',LinearSVC())])

In [64]:
text_clf.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('text_idf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [65]:
y_pred = text_clf.predict(X_test)

In [66]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

   micro avg       0.99      0.99      0.99      1672
   macro avg       0.99      0.98      0.98      1672
weighted avg       0.99      0.99      0.99      1672



In [67]:
print(f"Acuracy Score : {metrics.accuracy_score(y_test,y_pred).round(4) * 100}%" )

Acuracy Score : 99.22%


## Using this model helped us achive a much better accuracy using the power of text extraction with the help of TF-IDF

## Contributing
Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.

Please make sure to update tests as appropriate.