In [None]:
import numpy as np
import pandas as pd
# Load the SMS spam dataset.
# The raw file is Latin-1 encoded and is converted to UTF-8 in place. Reading it
# as ISO-8859-1 unconditionally would, on every re-run, decode the already
# converted UTF-8 bytes as Latin-1 and write the garbled text back, so try
# UTF-8 first and only convert when that fails.
DATA_PATH = "data/spam_utf8.csv"
try:
    df = pd.read_csv(DATA_PATH, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(DATA_PATH, encoding="ISO-8859-1")
    df.to_csv(DATA_PATH, index=False, encoding="utf-8")

In [None]:
# Missing values per column.
df.isna().sum()

In [None]:
# The last three columns hold almost only nulls, so they are discarded.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [None]:
# Give the two remaining columns descriptive names.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})
df.head(2)

In [None]:
# Label-encode the target so that ham/spam become the integers 0/1.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(df['target'])
df['target'] = encoder.transform(df['target'])

In [None]:
# Remove repeated rows (the first occurrence is kept by default), then show class sizes.
df = df.drop_duplicates()
df['target'].value_counts()

In [None]:
# Pie chart of the class balance.
import matplotlib.pyplot as plt
class_counts = df['target'].value_counts()
# Derive each wedge label from the encoded class it represents instead of
# relying on value_counts() happening to list ham before spam.
class_names = {0: 'ham', 1: 'spam'}
wedge_labels = [class_names.get(code, str(code)) for code in class_counts.index]
plt.pie(class_counts, labels=wedge_labels, autopct="%0.2f")
# No plt.grid(): a pie chart has no ticks, so a grid draws nothing.
plt.show()

In [None]:
#measuring number of characters
def len_measurement(list_n):
    """Return ``len(list_n)``; for a message string this is its character count."""
    size = len(list_n)
    return size
# Character count of every message.
df['size_char'] = df['text'].map(len_measurement)
df.head()

In [None]:
#measuring number of words
import nltk
from nltk.tokenize import word_tokenize
def word_count(list_m):
    """Return how many tokens ``word_tokenize`` produces for ``list_m``."""
    # NOTE(review): word_tokenize needs NLTK's tokenizer data installed; confirm for this environment.
    return len(word_tokenize(list_m))
# Token count of every message.
df['words'] = df['text'].map(word_count)
df.head()

In [None]:
#measuring number of sentences
from nltk.tokenize import sent_tokenize
def sentence_count(list_m):
    """Return how many sentences ``sent_tokenize`` finds in ``list_m``."""
    # NOTE(review): sent_tokenize needs NLTK's tokenizer data installed; confirm for this environment.
    return len(sent_tokenize(list_m))
# Sentence count of every message.
df['sentences'] = df['text'].map(sentence_count)
df.head()

In [None]:
# ham (target == 0): summary statistics of the length features
df.loc[df['target'] == 0, ['size_char', 'words', 'sentences']].describe()

In [None]:
# spam (target == 1): summary statistics of the length features
df.loc[df['target'] == 1, ['size_char', 'words', 'sentences']].describe()

In [None]:
# Spam messages generally contain more characters, words and sentences.
# Overlay the character-count distributions: ham in blue, spam in red.
import seaborn as sns
import matplotlib.pyplot as plt
ham_chars = df.loc[df['target'] == 0, 'size_char']
spam_chars = df.loc[df['target'] == 1, 'size_char']
sns.histplot(ham_chars, color='blue')
sns.histplot(spam_chars, color='red')
plt.show()

In [None]:
# Overlay the word-count distributions: ham in blue, spam in red.
ham_words = df.loc[df['target'] == 0, 'words']
spam_words = df.loc[df['target'] == 1, 'words']
sns.histplot(ham_words, color='blue')
sns.histplot(spam_words, color='red')
plt.show()

In [None]:
# Pairwise feature relationships coloured by class (blue = ham, the other colour = spam).
sns.pairplot(data=df, hue='target')

In [None]:
# Correlation heatmap over the numeric columns only.
import matplotlib.pyplot as plt

numeric_df = df.select_dtypes(include='number')
correlations = numeric_df.corr()
sns.heatmap(correlations, annot=True)
plt.show()

In [None]:
# Data preprocessing (the steps applied to each text message):
# 1) lower-casing
# 2) tokenization
# 3) removing special characters
# 4) removing punctuation and stop words
# 5) stemming
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [None]:
import nltk, string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer



# Shared, build-once helpers for transform_text: the stemmer plus set lookups
# for English stop words and punctuation characters.
ps = PorterStemmer()
stop_words = {*stopwords.words('english')}
punct = {*string.punctuation}

def transform_text(text):
    """Normalise one message into a space-joined string of stemmed tokens.

    The text is lower-cased and tokenized; a token is kept only when it is
    alphanumeric and is neither a stop word nor a punctuation character.
    Every kept token is stemmed with ``ps``.
    """
    stemmed = []
    for tok in nltk.word_tokenize(text.lower()):
        if not tok.isalnum():
            continue
        if tok in stop_words or tok in punct:
            continue
        stemmed.append(ps.stem(tok))
    return " ".join(stemmed)

# Replace each raw message with its normalised form.
# NOTE(review): this overwrites 'text' in place, so re-running the cell re-processes already processed text.
df['text'] = df['text'].map(transform_text)
df.head()

In [None]:
# Preprocessed texts of the spam messages (target == 1).
df_list = df.loc[df['target'] == 1, 'text'].tolist()

In [None]:
# Flatten the spam messages into one list of tokens.
spam_corpus = [word for msg in df_list for word in msg.split()]
len(spam_corpus)

In [None]:
from collections import Counter
# The 30 most frequent spam tokens with their counts.
spam_top_words = Counter(spam_corpus).most_common(30)
data_spam = pd.DataFrame(spam_top_words)

In [None]:
# Display the table of the most frequent spam tokens and their counts.
data_spam

In [None]:
# Preprocessed texts of the ham messages (target == 0), flattened into one token list.
df_list2 = df.loc[df['target'] == 0, 'text'].tolist()
ham_corpus = [word for msg in df_list2 for word in msg.split()]
len(ham_corpus)

In [None]:
# The 30 most frequent ham tokens with their counts.
ham_top_words = Counter(ham_corpus).most_common(30)
data_ham = pd.DataFrame(ham_top_words)

In [None]:
# Model building: create a bag-of-words vectorizer and a TF-IDF vectorizer.
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
vectorizer, tfidfvec = CountVectorizer(), TfidfVectorizer()

In [None]:
# Fit TF-IDF on the preprocessed text and densify the result into a feature matrix.
tfidf_matrix = tfidfvec.fit_transform(df['text'])
X = tfidf_matrix.toarray()
X.shape

In [None]:
# Encoded labels as a NumPy array.
y = df['target'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
# One instance of each Naive Bayes variant to compare.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

In [None]:
# Gaussian Naive Bayes: fit on the training split, evaluate on the test split.
gnb.fit(X_train,y_train)
y_pred=gnb.predict(X_test)
# Each metric is printed under its own name; all three used to be labelled "accuracy".
print(f"The accuracy of your model is={accuracy_score(y_test,y_pred)}")
print(f"The confusion matrix of your model is={confusion_matrix(y_test,y_pred)}")
print(f"The precision of your model is={precision_score(y_test,y_pred)}")

In [None]:
# Bernoulli Naive Bayes: fit on the training split, evaluate on the test split.
bnb.fit(X_train,y_train)
y_pred2=bnb.predict(X_test)
# Each metric is printed under its own name; all three used to be labelled "accuracy".
print(f"The accuracy of your model is={accuracy_score(y_test,y_pred2)}")
print(f"The confusion matrix of your model is={confusion_matrix(y_test,y_pred2)}")
print(f"The precision of your model is={precision_score(y_test,y_pred2)}")


In [None]:
# Gaussian Naive Bayes on the TF-IDF features.
# NOTE(review): X was already built with TfidfVectorizer before the earlier
# "using gnb" cell, so this repeats that experiment rather than comparing
# against bag-of-words counts; confirm what was intended.

gnb.fit(X_train,y_train)
y_pred=gnb.predict(X_test)
# The confusion matrix and precision are printed under their own names, not as "accuracy".
print(f"The accuracy of your model is={accuracy_score(y_test,y_pred)}")
print(f"The confusion matrix is={confusion_matrix(y_test,y_pred)}")
print(f"The precision of the model is={precision_score(y_test,y_pred)}")

In [None]:
# Bernoulli Naive Bayes on the TF-IDF features.
# NOTE(review): same data and model as the earlier "using bnb" cell; confirm what was intended.
bnb.fit(X_train,y_train)
y_pred2=bnb.predict(X_test)
# The confusion matrix and precision are printed under their own names, not as "accuracy".
print(f"The accuracy of your model is={accuracy_score(y_test,y_pred2)}")
print(f"The confusion matrix is={confusion_matrix(y_test,y_pred2)}")
print(f"The precision of your model is={precision_score(y_test,y_pred2)}")


In [None]:
# Multinomial Naive Bayes: fit on the training split, evaluate on the test split.
mnb.fit(X_train,y_train)
y_pred3=mnb.predict(X_test)
# The confusion matrix and precision are printed under their own names, not as "accuracy".
print(f"The accuracy of model is={accuracy_score(y_test,y_pred3)}")
print(f"The confusion matrix is={confusion_matrix(y_test,y_pred3)}")
print(f"The precision of model is={precision_score(y_test,y_pred3)}")

In [None]:
# Best configuration from the experiments: TF-IDF capped at 3000 features
# feeding Multinomial Naive Bayes. The features, the split and the model are
# rebuilt here.
tfidfvec=TfidfVectorizer(max_features=3000)
X=tfidfvec.fit_transform(df['text']).toarray()
y=df['target'].values
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

mnb.fit(X_train,y_train)
y_pred3=mnb.predict(X_test)
# The confusion matrix and precision are printed under their own names, not as "accuracy".
print(f"The accuracy of model is={accuracy_score(y_test,y_pred3)}")
print(f"The confusion matrix of model is={confusion_matrix(y_test,y_pred3)}")
print(f"The precision of my model is={precision_score(y_test,y_pred3)}")

In [None]:
import os

import joblib

# Persist the trained classifier together with the vectorizer that produced
# its features, so new messages can be transformed the same way at inference.
os.makedirs("models", exist_ok=True)  # joblib.dump does not create missing directories
joblib.dump(mnb, "models/mnb.pkl")
# Save `tfidfvec`, the fitted TfidfVectorizer that X was built with. `vectorizer`
# is a CountVectorizer that was never fitted, so loading it would not yield
# features compatible with `mnb`.
joblib.dump(tfidfvec, "models/vectorizer.pkl")
