# **Importing Modules**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset locations used by later cells are visible.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))
        
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes  import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

# **Loading the dataset**

In [None]:
# Read the SMS spam CSV into a DataFrame, decoding it as latin-1
# (presumably the file is not valid UTF-8 -- confirm if the encoding is changed).
data = pd.read_csv(
    r'/kaggle/input/sms-spam-collection-dataset/spam.csv',
    encoding='latin-1',
)
# Preview the first rows of the raw frame.
data.head()

## **Cleaning the dataset**

In [None]:
# Remove the three unused 'Unnamed' columns and give the remaining two
# descriptive names.
#
# The drop is done without inplace=True and with errors="ignore" so the cell is
# idempotent: re-running it after the columns are already gone no longer raises
# a KeyError. If the frame does not end up with exactly two columns, the
# assignment below still fails loudly with a length-mismatch error.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
data.columns = ['label', 'messages']

In [None]:
# Show the first five rows to check the frame after the column clean-up.
data.head(n=5)

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Summary statistics of the remaining columns, computed separately per label.
data.groupby(by='label').describe()

# **Using Tfidf Vectorizer**

TF-IDF, short for Term Frequency–Inverse Document Frequency, is a way to convert textual data into numeric form. The weight it assigns to each term in a document is the product of two factors: the term frequency (TF) and the inverse document frequency (IDF).

More about TF-IDF:
https://en.wikipedia.org/wiki/Tf%E2%80%93idf

Implementation :
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [None]:
# Fit a TF-IDF vectorizer on all messages and turn each one into a weighted
# term vector (fit_transform returns a sparse matrix).
vect = TfidfVectorizer()
x = vect.fit_transform(data['messages'].tolist())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fetch the vocabulary once and
# convert it to a plain list so the printed output keeps its original form.
feature_names = vect.get_feature_names_out().tolist()
print(len(feature_names))          # vocabulary size
print(feature_names[1000:1010])    # a small sample of the learned terms

# Densify a single time (the original converted the sparse matrix twice) and
# reuse the dense array for both the preview and the downstream cells.
x = x.toarray()
print(x[:5])

In [None]:
# Target vector: the 'label' column converted to a NumPy array
# (via a Python list, so NumPy infers the element dtype).
y = np.array(data['label'].tolist())

In [None]:
# Report the dimensions of the target vector and of the feature matrix.
for caption, values in (("Output Variable :", y), ("Input Variable :", x)):
    print(caption, values.shape)

# **Splitting data into training (70%) and testing (30%)**

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=10,
)

# **Using Naive Bayes Classifier**

Naive Bayes is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. In simple terms, a Naive Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature.

In [None]:
# Baseline: Bernoulli Naive Bayes with default hyper-parameters.
# fit() returns the estimator itself, so construction and training are chained.
model = BernoulliNB().fit(x_train, y_train)
prediction = model.predict(x_test)

# Overall accuracy, then the per-class precision / recall / F1 report.
print(accuracy_score(y_true=y_test, y_pred=prediction))
print(classification_report(y_true=y_test, y_pred=prediction))

## **Tuning parameters**

In [None]:
# Retrain with the smoothing parameter alpha set to 0.74 instead of the default.
# NOTE(review): how 0.74 was chosen is not shown in this notebook.
model = BernoulliNB(alpha=0.74).fit(x_train, y_train)
prediction = model.predict(x_test)

# Same evaluation as for the baseline: accuracy, then the per-class report.
print(accuracy_score(y_true=y_test, y_pred=prediction))
print(classification_report(y_true=y_test, y_pred=prediction))

In [None]:
# Cross-tabulate the true labels (rows) against the predicted labels (columns).
print(pd.crosstab(index=y_test, columns=prediction))

# **Checking for spam SMS**

In [None]:
# Classify one hand-written message with the already-fitted vectorizer and model.
sms = [
    "Congrats You have won a price in the xyz competion, click on the link to claim it  www.abc.advertise.com/ ",
]
# transform() (not fit_transform) reuses the vocabulary learned when vect was fitted.
spam = vect.transform(sms)
print(model.predict(spam))

### #beginnerscode &emsp; #codingisfun &emsp; #datasciencewithpy