In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
df= pd.read_csv('spam.tsv', delimiter='\t')
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
 2   length   5572 non-null   int64 
 3   punct    5572 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 174.2+ KB


We are determining what emails are spam or not spam(ham)

In [17]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [18]:
df.tail()

Unnamed: 0,label,message,length,punct
5567,spam,This is the 2nd time we have tried 2 contact u...,160,8
5568,ham,Will ü b going to esplanade fr home?,36,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,7
5570,ham,The guy did some bitching but I acted like i'd...,125,1
5571,ham,Rofl. Its true to its name,26,1


In [19]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [20]:
df['label'].value_counts()/len(df)
#percentage of emails , the counts of each email divided by total dataframe

ham     0.865937
spam    0.134063
Name: label, dtype: float64

In [23]:
#make the shape of spam and ham equal

In [24]:
ham =df[df['label']=='ham']
spam= df[df['label']=='spam']

In [29]:
ham.shape

(4825, 4)

In [30]:
spam.shape

(747, 4)

In [31]:
#this is cahning the shape of the data, now they both have the same number of reccords
ham = ham.sample(spam.shape[0])

In [32]:
ham.shape, spam.shape

((747, 4), (747, 4))

In [33]:
data= ham.append(spam , ignore_index= True)
#this combines the data together

  data= ham.append(spam , ignore_index= True)


In [34]:
data

Unnamed: 0,label,message,length,punct
0,ham,"Since when, which side, any fever, any vomitin.",47,4
1,ham,I think that tantrum's finished so yeah I'll b...,64,2
2,ham,U GOIN OUT 2NITE?,17,1
3,ham,is your hamster dead? Hey so tmr i meet you at...,64,2
4,ham,My slave! I want you to take 2 or 3 pictures o...,112,3
...,...,...,...,...
1489,spam,Want explicit SEX in 30 secs? Ring 02073162414...,90,3
1490,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...,158,5
1491,spam,Had your contract mobile 11 Mnths? Latest Moto...,160,8
1492,spam,REMINDER FROM O2: To get 2.50 pounds free call...,147,3


In [35]:
from sklearn.model_selection import train_test_split


In [36]:
from sklearn.utils import shuffle
X_train, X_test, y_train, y_test = train_test_split(data["message"], data["label"], test_size=0.30, random_state=0, shuffle= True)

#message is x and label is y
#This function will be used to shuffle (randomly reorder) your dataset. Shuffling the data helps ensure that the training and testing sets are representative of the overall dataset and reduces the risk of any inherent ordering bias in the dataset.

In [41]:
#applying random forrest

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

#TfidfVectorizer is a tool for converting a collection of raw text documents into a numerical format that can be used for machine learning.
#t stands for "Term Frequency-Inverse Document Frequency," and it is a technique to quantify the importance of words in a document relative to a collection of documents.
#Pipeline is a powerful feature in scikit-learn that allows you to define a sequence of data processing steps.

#  TfidfVectorizer will help convert text data into a numerical format, the RandomForestClassifier will be your machine learning model,
#  and the Pipeline will help you put everything together into a streamlined workflow.

# Creating object of class pipeline

In [44]:
classifier = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("classifier", RandomForestClassifier(n_estimators=100))
])
#This pipeline combines text feature extraction with a random forest classifier.
#By creating this pipeline, you can easily apply these two steps in sequence without having to manually preprocess the data and apply the classifier separately

# The first stage of the pipeline is named "tfidf," and it's associated with the TfidfVectorizer() object.
# This step is responsible for converting text data into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency).

# The second stage is named "classifier," and it's associated with the RandomForestClassifier object.
#  This is your machine learning model. It's a random forest classifier with 100 decision trees.

In [45]:
classifier.fit(X_train, y_train)
# the pipeline takes your email messages (in X_train), applies the TF-IDF transformation to convert them into numerical features,
# and then trains the random forest classifier using these features and the corresponding labels (in y_train).

#Once the model is trained, you can use it to make predictions on new, unseen email data to classify them as spam or ham.

In [46]:
y_pred= classifier.predict(X_test)
#contain the predicted labels for the test data

In [48]:
y_pred, y_test

(array(['spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham',
        'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'ham',
        'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam',
        'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam',
        'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham',
        'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham',
        'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham',
        'ham', 'spam', 'spam', 'ham', 'ham', 'ham', 'spam', 'spam', 'spam',
        'spam', 'spam', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham',
        'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'spam',
        'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'ham',
        'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham',
        'ham', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam',
        'spam', 'ham', 'spam'

In [49]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [50]:
accuracy_score(y_pred, y_test)
#this is a good score

0.9532293986636972

In [51]:
confusion_matrix(y_pred, y_test)

array([[224,  18],
       [  3, 204]])

In [52]:
print(classification_report(y_pred, y_test))
#precision: This metric measures the accuracy of positive predictions.
#recall: Recall, also known as sensitivity or true positive rate, measures how well the model identifies all positive instances. In this context, it indicates how many of the actual spam emails were correctly identified as spam, and how many regular emails were correctly identified as regular.
#f1-score: The F1-score is the harmonic mean of precision and recall.
#support: This shows the number of occurrences of each class in the testing dataset. In this case, there are 242 "ham" emails and 207 "spam" emails.


              precision    recall  f1-score   support

         ham       0.99      0.93      0.96       242
        spam       0.92      0.99      0.95       207

    accuracy                           0.95       449
   macro avg       0.95      0.96      0.95       449
weighted avg       0.96      0.95      0.95       449



In [58]:
test1 = ['Hello, You are learning natural Language Processing']
test2 = ['Hope you are doing good and learning new things !']
test3 = ['Congratulations, You won a lottery ticket worth $1 Million ! To claim call on 446677']
test4= ['an urgent question for you, did you receive an email from us last week about a special offer']
test5= ['You are great, keep up the good job!']

In [59]:
print(classifier.predict(test1))
print(classifier.predict(test2))
print(classifier.predict(test3))
print(classifier.predict(test4))
print(classifier.predict(test5))

['ham']
['ham']
['spam']
['spam']
['ham']
