In [1]:
## Importing Libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Read the spam dataset (ISO-8859-1 encoded) and retain only its first two columns
df = (
    pd.read_csv('spam.csv', encoding='ISO-8859-1')
    .iloc[:, :2]
)

## Preview the first rows
print("DataFrame Head:")
df.head()

DataFrame Head:


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
## Applying Count Vectorizer
# Bag-of-words counts with English stop words removed; the token pattern only
# matches purely alphabetic words
countdf_vectorizer = CountVectorizer(
    token_pattern=r'\b[a-zA-Z]+\b',
    stop_words='english',
)

# Learn the vocabulary from the message column (v2) and build the document-term count matrix
countdf_matrix = countdf_vectorizer.fit_transform(df['v2'])

# Densify the count matrix into a DataFrame with one column per vocabulary word
count_df = pd.DataFrame(
    data=countdf_matrix.toarray(),
    columns=countdf_vectorizer.get_feature_names_out(),
)

# Show the count DataFrame
print("\nCount Vector DataFrame:")
print(count_df)

## Show the learned vocabulary
print("\nFeature names (words):")
print(countdf_vectorizer.get_feature_names_out())


Count Vector DataFrame:
      aa  aah  aaniye  aaooooright  aathi  ab  abbey  abdomen  abeg  abel  \
0      0    0       0            0      0   0      0        0     0     0   
1      0    0       0            0      0   0      0        0     0     0   
2      0    0       0            0      0   0      0        0     0     0   
3      0    0       0            0      0   0      0        0     0     0   
4      0    0       0            0      0   0      0        0     0     0   
...   ..  ...     ...          ...    ...  ..    ...      ...   ...   ...   
5567   0    0       0            0      0   0      0        0     0     0   
5568   0    0       0            0      0   0      0        0     0     0   
5569   0    0       0            0      0   0      0        0     0     0   
5570   0    0       0            0      0   0      0        0     0     0   
5571   0    0       0            0      0   0      0        0     0     0   

      ...  zebra  zed  zeros  zhong  zindgi  zoe  

In [4]:
## Applying TF-IDF Vectorizer
# TF-IDF weighting with English stop words removed; the token pattern only
# matches purely alphabetic words, so numbers are excluded
tfidf_vectorizer = TfidfVectorizer(
    token_pattern=r'\b[a-zA-Z]+\b',
    stop_words='english',
)

# Learn the vocabulary from the message column (v2) and build the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df['v2'])

# Densify the TF-IDF matrix into a DataFrame with one column per vocabulary word
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Show the TF-IDF DataFrame
print("\nTF-IDF DataFrame:")
print(tfidf_df)

## Show the learned vocabulary
print("\nFeature names (words):")
print(tfidf_vectorizer.get_feature_names_out())


TF-IDF DataFrame:
       aa  aah  aaniye  aaooooright  aathi   ab  abbey  abdomen  abeg  abel  \
0     0.0  0.0     0.0          0.0    0.0  0.0    0.0      0.0   0.0   0.0   
1     0.0  0.0     0.0          0.0    0.0  0.0    0.0      0.0   0.0   0.0   
2     0.0  0.0     0.0          0.0    0.0  0.0    0.0      0.0   0.0   0.0   
3     0.0  0.0     0.0          0.0    0.0  0.0    0.0      0.0   0.0   0.0   
4     0.0  0.0     0.0          0.0    0.0  0.0    0.0      0.0   0.0   0.0   
...   ...  ...     ...          ...    ...  ...    ...      ...   ...   ...   
5567  0.0  0.0     0.0          0.0    0.0  0.0    0.0      0.0   0.0   0.0   
5568  0.0  0.0     0.0          0.0    0.0  0.0    0.0      0.0   0.0   0.0   
5569  0.0  0.0     0.0          0.0    0.0  0.0    0.0      0.0   0.0   0.0   
5570  0.0  0.0     0.0          0.0    0.0  0.0    0.0      0.0   0.0   0.0   
5571  0.0  0.0     0.0          0.0    0.0  0.0    0.0      0.0   0.0   0.0   

      ...  zebra  zed  zeros  zh

In [13]:
# Display the loaded frame (label column v1, message column v2).
# NOTE(review): this cell's execution count is out of sequence with its neighbours and
# the recorded output still shows 'ham'/'spam' labels — re-run the notebook top to bottom.
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# Encode target variable: ham -> 0, spam -> 1.
# Values that are not keys of the mapping (e.g. labels that were already encoded
# by a previous run of this cell) are passed through unchanged. A plain
# .map(dict) would turn them into NaN, so re-running the cell would silently
# destroy the target column.
label_codes = {'ham': 0, 'spam': 1}
df['v1'] = df['v1'].map(lambda label: label_codes.get(label, label))


In [6]:
# Split the data into train and test sets (seeded so the split is repeatable)
from sklearn.model_selection import train_test_split

# Features are the bag-of-words counts; the target is the label column v1
x, y = count_df, df['v1']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [7]:
# Fit a multinomial Naive Bayes classifier on the training split
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X=x_train, y=y_train)

In [8]:
# Evaluate the fitted model on both splits
from sklearn.metrics import classification_report

# Predictions for the training and the held-out test features
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Print one classification report per split, each under its own banner
for banner, truth, predicted in (
    ("____________Train Report_________________", y_train, y_train_pred),
    ("____________Test Report_________________", y_test, y_test_pred),
):
    print(banner)
    print(classification_report(truth, predicted))

____________Train Report_________________
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3629
           1       0.97      0.96      0.97       550

    accuracy                           0.99      4179
   macro avg       0.98      0.98      0.98      4179
weighted avg       0.99      0.99      0.99      4179

____________Test Report_________________
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1196
           1       0.89      0.94      0.92       197

    accuracy                           0.98      1393
   macro avg       0.94      0.96      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [9]:
# Two unseen example messages to classify
new_data = pd.DataFrame({
    "Message": [
        "Hi, We need to handle this situation urgently.",
        "Free Tickets!",
    ],
})

In [10]:
# Preview the example messages
new_data

Unnamed: 0,Message
0,"Hi, We need to handle this situation urgently."
1,Free Tickets!


In [12]:
# Vectorize the new messages with the already-fitted count vectorizer: transform()
# (not fit_transform()) reuses the vocabulary learned from df['v2']
new_data_cv = countdf_vectorizer.transform(new_data['Message'])

In [13]:
# Predict a label for each new message (0 = ham, 1 = spam per the target encoding)
model.predict(new_data_cv)



array([0, 1], dtype=int64)

In [14]:
# Display the TF-IDF feature table.
# NOTE(review): the trailing note presumably lists models to try on these features
# (GaussianNB, KNN) — confirm with the author.
tfidf_df # Gaussiannb, KNN

Unnamed: 0,aa,aah,aaniye,aaooooright,aathi,ab,abbey,abdomen,abeg,abel,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Scratch string for trying out str.isdigit().
# NOTE(review): this rebinds `x`, which the train/test split cell assigned to the count features.
x = "abc"

In [28]:
# True when the string is not made up entirely of digits
not x.isdigit()

True

In [31]:
def clean_text(text):
    """Return a lowercased, lightly cleaned version of ``text``.

    Cleaning steps, in order:
      1. Drop every digit and each of the characters ``# , . * @ $``
         (characters are removed, not replaced by a space, so "a.b" becomes "ab").
      2. Split on whitespace and discard words of two characters or fewer.
      3. Lowercase the surviving words and join them with single spaces.

    Other punctuation (e.g. ``?`` or ``'``) is left in place.
    """
    dropped = '#,.*@$'
    kept_chars = []
    for ch in text:
        if ch in dropped or ch.isdigit():
            continue
        kept_chars.append(ch)
    stripped = ''.join(kept_chars)
    # The length check is applied to each word before it is lowercased
    return ' '.join(token.lower() for token in stripped.split() if len(token) > 2)

In [32]:
# Sanity check: "Hi," and "A" are dropped (two characters or fewer once the comma is
# removed), while "?" survives because it is not in the stripped character set
clean_text("Hi, How are you? A reminder for the session.")

'how are you? reminder for the session'

In [33]:
# Preview the cleaned messages; the result is only displayed, not stored back on df
df['v2'].apply(clean_text)

0       until jurong point crazy available only bugis ...
1                                      lar joking wif oni
2       free entry wkly comp win cup final tkts may te...
3                      dun say early hor already then say
4       nah don't think goes usf lives around here though
                              ...                        
5567    this the time have tried contact have won the ...
5568                           will going esplanade home?
5569      pity was mood for that soany other suggestions?
5570    the guy did some bitching but acted like i'd i...
5571                               rofl its true its name
Name: v2, Length: 5572, dtype: object