# SMS Spam Detection

## Import the libraries

In [12]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score

## Import the dataset

In [13]:
# Read the CSV as Latin-1 and keep only the `v1` and `v2` columns.
data = pd.read_csv("spam.csv", encoding='latin1')[['v1', 'v2']]

In [14]:
# Give the columns descriptive names. Rebinding `data` (instead of mutating it
# with inplace=True) keeps the cell a plain "input -> output" step.
data = data.rename(columns={'v1': 'Label', 'v2': 'Messages'})

## Preprocessing

In [15]:
# Count the missing values in each column.
data.isna().sum()

Label       0
Messages    0
dtype: int64

In [16]:
try:
    Stop = set(stopwords.words('english'))
except LookupError:
    # The stop-word corpus is not bundled with NLTK. On a fresh environment,
    # fetch it once and retry instead of failing the whole notebook run.
    nltk.download('stopwords', quiet=True)
    Stop = set(stopwords.words('english'))

# One or more characters that are not a digit or a lowercase ASCII letter.
# The text is lowercased before matching, so uppercase never needs listing.
_NON_ALNUM = re.compile(r'[^0-9a-z]+')


def Clean_Text(text):
    """Normalise a message for bag-of-words vectorisation.

    The text is lowercased, every run of characters other than ASCII letters
    and digits is turned into a single space, and words found in `Stop`
    (the English stop-word set) are dropped.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The remaining words joined by single spaces (possibly an empty string).
    """
    text = _NON_ALNUM.sub(' ', text.lower())
    # split() with no argument already collapses repeated whitespace and
    # ignores leading/trailing spaces, so no separate whitespace pass is needed.
    return " ".join(word for word in text.split() if word not in Stop)

In [17]:
# Run every message through Clean_Text and store the result in a new column.
data['Cleaned_Text'] = data['Messages'].map(Clean_Text)

In [18]:
# Reorder the columns: raw text, cleaned text, then the label.
data = data.loc[:, ['Messages', 'Cleaned_Text', 'Label']]

In [19]:
# Show the frame so the raw and cleaned messages can be compared next to
# their labels.
data

Unnamed: 0,Messages,Cleaned_Text,Label
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,ham
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,spam
3,U dun say so early hor... U c already then say...,u dun say early hor u c already say,ham
4,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,ham
...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u u 750 pound prize 2...,spam
5568,Will Ì_ b going to esplanade fr home?,b going esplanade fr home,ham
5569,"Pity, * was in mood for that. So...any other s...",pity mood suggestions,ham
5570,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...,ham


## Training the model

In [20]:
# Features are the cleaned messages; the target is the label column.
X, y = data['Cleaned_Text'], data['Label']

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def Classifier(model, X, y, test_size=0.25, random_state=None):
    """Fit a count-vector + TF-IDF text pipeline around `model` and print hold-out scores.

    The data is split once into stratified train and test parts, the pipeline
    is fitted on the training part, and the test accuracy (as a percentage)
    and the classification report are printed.

    Parameters
    ----------
    model : estimator
        Classifier used as the final step of the pipeline.
    X : sequence of str
        Text documents to vectorise.
    y : sequence
        Class label for each document; also used to stratify the split.
    test_size : float, default 0.25
        Fraction of the data held out for evaluation.
    random_state : int or None, default None
        Seed for the shuffled split. Pass an integer to get the same split on
        every run; None keeps the split random.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        shuffle=True,
        stratify=y,
        random_state=random_state,
    )
    Pipeline_model = Pipeline([('vect', CountVectorizer()),
                               ('tfidf', TfidfTransformer()),
                               ('clf', model)])
    Pipeline_model.fit(X_train, y_train)

    # Predict once and reuse the predictions for both metrics, rather than
    # running the test set through the pipeline a second time via score().
    y_pred = Pipeline_model.predict(X_test)
    print("Accuracy: ", accuracy_score(y_test, y_pred)*100)
    print(classification_report(y_test, y_pred))

In [22]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so that its bootstrap samples and feature choices are the
# same on every run.
model = RandomForestClassifier(random_state=42)
Classifier(model, X, y)

Accuracy:  97.12849964106246
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1206
        spam       1.00      0.79      0.88       187

    accuracy                           0.97      1393
   macro avg       0.98      0.89      0.93      1393
weighted avg       0.97      0.97      0.97      1393

