# Text Classification 

 - Predict whether mail is Spam or Ham

### Importing Necessary Libraries

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

### Loading the Dataset

In [13]:
data = pd.read_csv(r"C:\Users\Admin\OneDrive\Desktop\Project for jobs\NLP-email(spam,ham)\spam.csv")
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
data.loc[2,'Message']

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [15]:
data.shape

(5572, 2)

In [16]:
data.loc[0,'Message']

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

### Data Preprocessing

In [18]:
# Download and prepare stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [23]:

# Tokenization and text cleaning
data['Message'] = data['Message'].apply(lambda x: ' '.join(word.lower() for word in word_tokenize(x) if word.isalpha()))



In [24]:
data.loc[0,'Message']

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [25]:
# Stop words removal
data['Message'] = data['Message'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

In [26]:
data.loc[0,'Message']

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [27]:
data.head()

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts may...
3,ham,u dun say early hor u c already say
4,ham,nah think goes usf lives around though


In [28]:
data.loc[0,'Message']

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

### Feature Extraction

In [29]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Message'])

In [30]:
myX = vectorizer.fit(data['Message'])

In [31]:
myX1 = vectorizer.transform(data['Message'])

In [32]:
myX1.shape

(5572, 7198)

In [33]:
print(X.shape)

(5572, 7198)


In [34]:
df = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names_out())

In [35]:
df.loc[0,:]

aa             0.0
aah            0.0
aaniye         0.0
aaooooright    0.0
aathi          0.0
              ... 
zoe            0.0
zogtorius      0.0
zoom           0.0
zouk           0.0
zyada          0.0
Name: 0, Length: 7198, dtype: float64

In [36]:
df.loc[0,:][['jurong','point']]

jurong    0.346503
point     0.271019
Name: 0, dtype: float64

### Split the Data into Training and Testing Sets

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, data['Category'], test_size=0.2, random_state=42)

### Build and Train the Model

In [38]:
model = MultinomialNB()
model.fit(X_train, y_train)

### Model Evaluation

In [39]:
y_pred = model.predict(X_test)

In [40]:
print(y_pred[:5], y_test[:5])

['ham' 'ham' 'ham' 'ham' 'ham'] 3245    ham
944     ham
1044    ham
2484    ham
812     ham
Name: Category, dtype: object


In [41]:
y_test.value_counts()

ham     966
spam    149
Name: Category, dtype: int64

### Print the Results

In [42]:
print("Accuracy:", round(accuracy_score(y_test, y_pred)*100,2),'%')
print(classification_report(y_test, y_pred))

Accuracy: 97.13 %
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [43]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[966   0]
 [ 32 117]]


In [44]:
new_data = pd.DataFrame([['Hi, today is a good day.  So, I will come and meet you 5 pm.'],\
                         ["A free entry exhibition attractive xyz kdjfkla lalkkj"],\
                         ['free entry wkly comp win fa cup final tkts may text fa receive entry question std txt rate c apply']], \
                        columns = ['Message'])

In [45]:
new_data

Unnamed: 0,Message
0,"Hi, today is a good day. So, I will come and ..."
1,A free entry exhibition attractive xyz kdjfkla...
2,free entry wkly comp win fa cup final tkts may...


In [46]:
new_data['Message'] = new_data['Message'].apply(lambda x: ' '.join(word.lower() for word in word_tokenize(x) if word.isalpha()))
new_data['Message'] = new_data['Message'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

In [47]:
new_data

Unnamed: 0,Message
0,hi today good day come meet pm
1,free entry exhibition attractive xyz kdjfkla l...
2,free entry wkly comp win fa cup final tkts may...


In [48]:
X1 = vectorizer.transform(new_data['Message'])

In [49]:
X1.shape

(3, 7198)

In [50]:
model.predict(X1)

array(['ham', 'spam', 'spam'], dtype='<U4')