<a href="https://colab.research.google.com/github/SanghaviHarshPankajkumar/CODESOFT/blob/main/codesoft_task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spam Sms Classification

#### import SMS Spam collection dataset


In [1]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [2]:
!kaggle datasets download -d uciml/sms-spam-collection-dataset

Downloading sms-spam-collection-dataset.zip to /content
  0% 0.00/211k [00:00<?, ?B/s]
100% 211k/211k [00:00<00:00, 111MB/s]


In [3]:
from zipfile import ZipFile
dataset = '/content/sms-spam-collection-dataset.zip'
with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print('done...')

done...


#### importing necessary libraires

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

from sklearn.model_selection import train_test_split


In [5]:
spam_dataset = pd.read_csv('/content/spam.csv', encoding = "ISO-8859-1")
spam_dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Preprocessing

In [24]:
df   = spam_dataset.drop( ['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace= False, axis =1)

In [25]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
#rename Columns
df.rename( {
    'v1': 'Class',
    'v2': 'Sms'
}, axis =1 , inplace = True)

#encode classes
df.replace({
    'spam':1,
    'ham':0
},inplace= True)

In [28]:
df.duplicated().sum()

403

In [29]:
df.drop_duplicates()

Unnamed: 0,Class,Sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [30]:
df['Class'].value_counts()

0    4825
1     747
Name: Class, dtype: int64

In [32]:
df.isna().sum()

Class    0
Sms      0
dtype: int64

In [33]:
X = df['Sms']
Y = df['Class']

### Natural Language Processing

In [34]:
nltk.download('stopwords')
nltk.download('punkt')
sw_list = stopwords.words('english')
word_stemmer = PorterStemmer()

def remove_tags(text):
  return re.sub(re.compile('<.*?>'), '', text)

def to_lower(text):
  return text.lower()

def remove_stop_words(text):
  word_tokens = word_tokenize(text)
  final_text  =" ".join([w for w in word_tokens if not w in sw_list])

  return final_text

def stemming(text):
  word_tokens = word_tokenize(text)
  final_text  =" ".join([word_stemmer.stem(w) for w in word_tokens])
  return final_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [39]:
X = X.apply(remove_tags)

In [36]:
X = X.apply(to_lower)

In [37]:
X = X.apply(remove_stop_words)

In [38]:
X = X.apply(stemming)

In [40]:
X.head()

0    go jurong point , crazi .. avail bugi n great ...
1                        ok lar ... joke wif u oni ...
2    free entri 2 wkli comp win fa cup final tkt 21...
3          u dun say earli hor ... u c alreadi say ...
4           nah n't think goe usf , live around though
Name: Sms, dtype: object

#### split in train and test

In [42]:
X_train, X_test,y_train,y_test = train_test_split(X,Y, test_size=0.2, random_state=33)

## Feature Extration

#### vectorize text data

In [66]:
from sklearn.feature_extraction.text import CountVectorizer
cVec = CountVectorizer()

In [67]:
X_train_transform = cVec.fit_transform(X_train)
X_test_transform = cVec.transform(X_test)

In [68]:
X_train_transform.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

#### Apply tf-idf on vectorize data

In [50]:
from sklearn.feature_extraction.text import TfidfTransformer

In [69]:
Tf_idf = TfidfTransformer()

In [70]:
X_train_tfidf = Tf_idf.fit_transform(X_train_transform)
X_test_tfidf = Tf_idf.transform(X_test_transform)

In [71]:
X_train_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [72]:
X_train_tfidf.shape

(4457, 6541)

In [73]:
X_test_tfidf.shape

(1115, 6541)

## Model Creation and Evaluation

In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix

### Logistic Regression


In [59]:
lr  = LogisticRegression()

In [75]:
lr.fit(X_train_tfidf,y_train)

In [76]:
y_pred_lr = lr.predict(X_test_tfidf)

In [84]:
print('Accuracy Score:-', accuracy_score(y_pred = y_pred_lr, y_true = y_test))
print('Confusion Matrix:-')
print(confusion_matrix(y_pred=y_pred_lr, y_true= y_test))

Accuracy Score:- 0.9614349775784753
Confusion Matrix:-
[[969   1]
 [ 42 103]]


#### Naive Bayes

In [85]:
nb = MultinomialNB()

In [86]:
nb.fit(X_train_tfidf,y_train)

In [87]:
y_pred_nb = nb.predict(X_test_tfidf)

In [88]:
print('Accuracy Score:-', accuracy_score(y_pred = y_pred_nb, y_true = y_test))
print('Confusion Matrix:-')
print(confusion_matrix(y_pred=y_pred_nb, y_true= y_test))

Accuracy Score:- 0.9632286995515695
Confusion Matrix:-
[[970   0]
 [ 41 104]]
