In [2]:
import pandas as pd
import numpy as np
import re 
import nltk
import sklearn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [3]:
print(dir(sklearn.model_selection))

['BaseCrossValidator', 'BaseShuffleSplit', 'FixedThresholdClassifier', 'GridSearchCV', 'GroupKFold', 'GroupShuffleSplit', 'KFold', 'LearningCurveDisplay', 'LeaveOneGroupOut', 'LeaveOneOut', 'LeavePGroupsOut', 'LeavePOut', 'ParameterGrid', 'ParameterSampler', 'PredefinedSplit', 'RandomizedSearchCV', 'RepeatedKFold', 'RepeatedStratifiedKFold', 'ShuffleSplit', 'StratifiedGroupKFold', 'StratifiedKFold', 'StratifiedShuffleSplit', 'TimeSeriesSplit', 'TunedThresholdClassifierCV', 'ValidationCurveDisplay', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__getattr__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_classification_threshold', '_plot', '_search', '_split', '_validation', 'check_cv', 'cross_val_predict', 'cross_val_score', 'cross_validate', 'learning_curve', 'permutation_test_score', 'train_test_split', 'typing', 'validation_curve']


In [4]:
data=pd.read_csv("D:/dataset/Projects/spam.csv")
data.dtypes

Category    object
Message     object
dtype: object

In [5]:
data=pd.read_csv("D:/dataset/Projects/spam.csv")
data

# data=pd.read_csv("D:/dataset/Projects/spam.csv",sep='\t',names=['label','Msg'])

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
data["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [9]:
data["Message"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [10]:
corpus=[]
lm= WordNetLemmatizer()
for i in range (len(data)):
    review =re.sub('^a-zA-Z0-9',' ',data['Message'][i])
    review = review.lower()
    review = review.split()
    review = [x for x in review if x not in stopwords.words('english')]
    review = [lm.lemmatize(x) for x in review]
    review =" ".join(review)
    corpus.append(review)

In [11]:
print(type(data))  # Should return <class 'pandas.DataFrame'>


<class 'pandas.core.frame.DataFrame'>


In [12]:
len(data["Message"])

5572

In [13]:
len(corpus)

5572

In [14]:
data["Message"]=corpus

In [15]:
data.head()

Unnamed: 0,Category,Message
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think go usf, life around though"


# model building

### data spliting

In [18]:
x=data["Message"]
y=data["Category"]

In [19]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=10)

In [20]:
len(x_train), len(y_train)

(3900, 3900)

In [21]:
len(x_test), len(y_test)

(1672, 1672)

### Vectorization (convert text data into the vectors)

In [23]:
tf_obj= TfidfVectorizer()
x_train_tfidf = tf_obj.fit_transform(x_train).toarray()
x_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
x_train_tfidf.shape

(3900, 6925)

### pipeline 

In [26]:
text_mnb = Pipeline([('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [27]:
text_mnb.fit(x_train,y_train)

In [28]:
y_pred_test=text_mnb.predict(x_test)
print("Accuract of model =",accuracy_score(y_test,y_pred_test)*100)

Accuract of model = 95.8732057416268


In [29]:
y_pred_train=text_mnb.predict(x_train)
print("Accuracy Score :",accuracy_score(y_train,y_pred_train)*100)

Accuracy Score : 98.23076923076923


In [30]:
y_pred_test=text_mnb.predict(x_test)
print("confusion_matrix of model =\n",confusion_matrix(y_test,y_pred_test))

confusion_matrix of model =
 [[1457    0]
 [  69  146]]


In [31]:
y_pred_train=text_mnb.predict(x_train)
print("confusion_matrix :\n",confusion_matrix(y_train,y_pred_train))

confusion_matrix :
 [[3368    0]
 [  69  463]]


In [32]:
y_pred_test=text_mnb.predict(x_test)
print("Classificatiuon report of model =\n",classification_report(y_test,y_pred_test))

Classificatiuon report of model =
               precision    recall  f1-score   support

         ham       0.95      1.00      0.98      1457
        spam       1.00      0.68      0.81       215

    accuracy                           0.96      1672
   macro avg       0.98      0.84      0.89      1672
weighted avg       0.96      0.96      0.96      1672



In [33]:
y_pred_train=text_mnb.predict(x_train)
print("classification_report :",classification_report(y_train,y_pred_train))

classification_report :               precision    recall  f1-score   support

         ham       0.98      1.00      0.99      3368
        spam       1.00      0.87      0.93       532

    accuracy                           0.98      3900
   macro avg       0.99      0.94      0.96      3900
weighted avg       0.98      0.98      0.98      3900



### Prediction on User Data

In [35]:
def preprocessed_data(text):
    review =re.sub('^a-zA-Z0-9',' ',text)
    review = review.lower()
    review = review.split()
    review = [x for x in review if x not in stopwords.words('english')]
    review = [lm.lemmatize(x) for x in review]
    review =" ".join(review)
    return [review]
    

In [36]:
user_data=data["Message"][0]
print(user_data)
user_data=preprocessed_data(user_data)
user_data

go jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...


['go jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...']

In [37]:
text_mnb.predict(user_data)[0]

'ham'

In [76]:
# flow-> data_preprocessing ->  vector(tfidf) --> multinomialNB()
class prediction:
    def __init__(self,x):
        self.x=x
    def user_data_processing(self):
        lm=WordNetLemmatizer()
        review =re.sub('^a-zA-Z0-9',' ',self.x)
        review = review.lower()
        review = review.split()
        review = [x for x in review if x not in stopwords.words('english')]
        review = [lm.lemmatize(x) for x in review]
        review =" ".join(review)
        return [review]

    def user_data_prediction(self):
        preprocessed_data = self.user_data_processing()

        if text_mnb.predict(preprocessed_data)[0] == "spam":
            return "This Message is Spam"

        else : 
            return " This Message is Not Spam"
        

        

In [78]:
data.head(10)

Unnamed: 0,Category,Message
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think go usf, life around though"
5,spam,freemsg hey darling 3 week's word back! i'd li...
6,ham,even brother like speak me. treat like aid pat...
7,ham,per request 'melle melle (oru minnaminunginte ...
8,spam,winner!! valued network customer selected rece...
9,spam,mobile 11 month more? u r entitled update late...


In [86]:
user_data=data['Message'][90]
print(user_data)
prediction(user_data).user_data_prediction()

yeah do! don‘t stand close tho- you‘ll catch something!


' This Message is Not Spam'