In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the SMS dataset; the file is read with latin1 encoding.
df = pd.read_csv('spam.csv',encoding='latin1') 

In [None]:
# Peek at five random rows.
df.sample(5)

In [None]:
# (rows, columns) of the raw frame.
df.shape

# 1. Data cleaning

## df.info()

In [None]:
# The last three columns ('Unnamed: 2' .. 'Unnamed: 4') are not used; remove them.
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis='columns',inplace=True)

In [None]:
df.sample(5)

In [None]:
#renaming the cols
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.sample(5)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder used to turn the string labels in 'target' into integer codes.
encoder=LabelEncoder()

In [None]:
# Replace the label strings with their integer codes.
# Later cells treat target == 1 as spam and target == 0 as ham.
df['target']=encoder.fit_transform(df['target'])

In [None]:
# Confirm the encoded labels.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Drop duplicate rows, keeping the first occurrence of each.
df=df.drop_duplicates(keep='first')

In [None]:
# Shape after de-duplication.
df.shape

# 2. EDA

In [None]:
# Number of messages in each class.
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
# value_counts() orders classes by frequency, so the wedge labels are derived
# from the counted class codes via the fitted encoder rather than hard-coded;
# a hard-coded ['ham','spam'] would be mislabelled if the order ever changed.
class_counts=df['target'].value_counts()
class_labels=encoder.inverse_transform(class_counts.index.to_numpy())
plt.pie(class_counts,labels=class_labels,autopct="%0.2f")

In [None]:
#Data is imbalanced

In [None]:
import nltk

In [None]:
!pip install nltk

In [None]:
nltk.download('punkt')

In [None]:
df['num_characters']=df['text'].apply(len)

In [None]:
#number of words
df['num_words']=df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
df['num_sentences']=df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [None]:
df.head()

In [None]:
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
df[df['target']==0][['num_characters','num_words','num_sentences']].describe()

In [None]:
df[df['target']==1][['num_characters','num_words','num_sentences']].describe()

In [None]:
import seaborn as sns

In [None]:
# Overlay the character-count distributions: target == 0 in the default colour, target == 1 in red.
plt.figure(figsize=(12,6))
sns.histplot(df.loc[df['target']==0,'num_characters'])
sns.histplot(df.loc[df['target']==1,'num_characters'],color='red')

In [None]:
# Overlay the word-count distributions: target == 0 in the default colour, target == 1 in red.
plt.figure(figsize=(12,6))
sns.histplot(df.loc[df['target']==0,'num_words'])
sns.histplot(df.loc[df['target']==1,'num_words'],color='red')

In [None]:
# Pairwise relationships between the features, coloured by class.
sns.pairplot(df,hue='target')

In [None]:
# Correlate everything except the raw message column. The drop is applied to a
# temporary frame only: df must keep 'text' because the preprocessing step
# further down reads df['text'].
sns.heatmap(df.drop(columns=['text']).corr(),annot=True)

# 3. Data Preprocessing
## Lower case
## Tokenization (breaking the text into words)
## Removing special characters
## Removing stop words and punctuation
## Stemming

In [None]:
import nltk
# Fetch the NLTK stopword corpus, then display the English stopword list.
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
import string
# Punctuation characters that transform_text filters against.
string.punctuation

In [None]:
from nltk.stem.porter import PorterStemmer
# Shared stemmer instance used by transform_text; the call below is a quick check.
ps = PorterStemmer()
ps.stem('loving')

In [None]:
# Built once up front: the original rebuilt the stopword list for every single
# token and searched it linearly, which dominated the preprocessing time.
_STOP_WORDS=frozenset(stopwords.words('english'))

def transform_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps: lower-case the text, tokenize it with ``nltk.word_tokenize``, keep
    only alphanumeric tokens that are neither English stopwords nor
    punctuation, and stem each remaining token with the shared ``ps`` stemmer.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ("" if nothing survives).
    """
    tokens=nltk.word_tokenize(text.lower())
    kept=[
        tok for tok in tokens
        if tok.isalnum() and tok not in _STOP_WORDS and tok not in string.punctuation
    ]
    return " ".join(ps.stem(tok) for tok in kept)

In [None]:
# Try the preprocessing function on one sample message.
transform_text('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')

In [None]:
# Show the columns currently in df; the next cell needs 'text' to be among them.
print(df.columns)

In [None]:
# Preprocess every message; reads df['text'] and stores the result alongside it.
df['transformed_text'] = df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
from wordcloud import WordCloud
# One WordCloud object, reused by the spam and ham cells below.
wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
wc

In [None]:
# Join the messages with a space: with an empty separator the last word of one
# message fuses with the first word of the next and pollutes the word counts.
spam_wc = wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep=" "))
spam_wc

In [None]:
import matplotlib.pyplot as plt
# Render the spam word cloud.
plt.figure(figsize=(5,4))
plt.imshow(spam_wc)

In [None]:
# Join the messages with a space: with an empty separator the last word of one
# message fuses with the first word of the next and pollutes the word counts.
ham_wc = wc.generate(df[df['target'] == 0]['transformed_text'].str.cat(sep=" "))
plt.figure(figsize=(5,4))
plt.imshow(ham_wc)

In [None]:
spam_corpus=[]
for msg in df[df['target']==1]['transformed_text'].tolist():
    for word in msg.split():
        spam_corpus.append(word)

In [None]:
len(spam_corpus)

In [None]:
import seaborn as sns
import pandas as pd
from collections import Counter
# Table of the 30 most frequent spam words and their counts, shown as a bar chart.
spam_df = pd.DataFrame(
    data=Counter(spam_corpus).most_common(30),
    columns=['word', 'count'],
)
sns.barplot(data=spam_df, x='word', y='count')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
ham_corpus=[]
for msg in df[df['target']==0]['transformed_text'].tolist():
    for word in msg.split():
        ham_corpus.append(word)

In [None]:
len(ham_corpus)

In [None]:
# Table of the 30 most frequent ham words and their counts, shown as a bar chart.
ham_df = pd.DataFrame(
    data=Counter(ham_corpus).most_common(30),
    columns=['word', 'count'],
)
sns.barplot(data=ham_df, x='word', y='count')
plt.xticks(rotation='vertical')
plt.show()

# 4. Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Two vectorizers to compare: raw term counts and TF-IDF weights.
cv=CountVectorizer()
tfidf=TfidfVectorizer()

In [None]:
# Count-based features for every message, converted to a dense array.
X=cv.fit_transform(df['transformed_text']).toarray()

In [None]:
X.shape

In [None]:
# Class labels as a NumPy array.
y=df['target'].values
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
# One instance of each Naive Bayes variant to compare.
gnb=GaussianNB()
mnb=MultinomialNB()
bnb=BernoulliNB()

In [None]:
# Gaussian NB: fit, predict, then report accuracy, confusion matrix and precision.
gnb.fit(X_train,y_train)
y_pred1=gnb.predict(X_test)
for metric in (accuracy_score,confusion_matrix,precision_score):
    print(metric(y_test,y_pred1))

In [None]:
# Multinomial NB: fit, predict, then report accuracy, confusion matrix and precision.
mnb.fit(X_train,y_train)
y_pred2=mnb.predict(X_test)
for metric in (accuracy_score,confusion_matrix,precision_score):
    print(metric(y_test,y_pred2))

In [None]:
# Bernoulli NB: fit, predict, then report accuracy, confusion matrix and precision.
bnb.fit(X_train,y_train)
y_pred3=bnb.predict(X_test)
for metric in (accuracy_score,confusion_matrix,precision_score):
    print(metric(y_test,y_pred3))

In [None]:
# Repeat the experiment with TF-IDF features and evaluate Multinomial NB on them.
X=tfidf.fit_transform(df['transformed_text']).toarray()
y=df['target'].values
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=2,test_size=0.2)
gnb,mnb,bnb=GaussianNB(),MultinomialNB(),BernoulliNB()
mnb.fit(X_train,y_train)
y_pred2=mnb.predict(X_test)
for metric in (accuracy_score,confusion_matrix,precision_score):
    print(metric(y_test,y_pred2))

In [None]:
#tfidf-->MNB (precision is 1, i.e. there are no false positives; accuracy alone is not reliable because the data is imbalanced)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
mnb=MultinomialNB()
lrc=LogisticRegression(solver='liblinear',penalty='l1')
dtc=DecisionTreeClassifier(max_depth=5)
rfc=RandomForestClassifier(n_estimators=50,random_state=2)
knc=KNeighborsClassifier()

In [None]:
clfs={
    'NB':mnb,
    'LR':lrc,
    'DT':dtc,
    'RF':rfc,
    'KN':knc
}

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns an ``(accuracy, precision)`` tuple computed with
    ``accuracy_score`` and ``precision_score`` from ``clf``'s predictions
    on ``X_test`` against ``y_test``.
    """
    clf.fit(X_train,y_train)
    predictions=clf.predict(X_test)
    return (
        accuracy_score(y_test,predictions),
        precision_score(y_test,predictions),
    )

In [None]:
# Train and score every classifier, collecting the metrics in clfs order.
accuracy_scores=[]
precision_scores=[]
for label,model in clfs.items():
    acc,prec=train_classifier(model,X_train,y_train,X_test,y_test)

    print("For",label)
    print("Accuracy - ",acc)
    print("Precision - ",prec)

    accuracy_scores+=[acc]
    precision_scores+=[prec]

In [None]:
# One row per algorithm, best precision first.
performance_df=(
    pd.DataFrame({
        'Algorithm':clfs.keys(),
        'Accuracy':accuracy_scores,
        'Precision':precision_scores,
    })
    .sort_values(by='Precision',ascending=False)
)
performance_df

In [None]:
performance_df1=pd.melt(performance_df,id_vars="Algorithm")

In [None]:
sns.catplot(x='Algorithm',y='value',hue='variable',data=performance_df1,kind='bar',height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
#model improvement
# 1.Change the max_features parameters of TfIdf

In [None]:
# Model improvement 1: limit the TF-IDF vocabulary to 3000 features and
# rebuild the train/test split and the candidate models on the new features.
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv=CountVectorizer()
tfidf=TfidfVectorizer(max_features=3000)
X=tfidf.fit_transform(df['transformed_text']).toarray()
y=df['target'].values
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)
# Fresh, unfitted estimators so the results do not depend on earlier fits.
mnb=MultinomialNB()
lrc=LogisticRegression(solver='liblinear',penalty='l1')
dtc=DecisionTreeClassifier(max_depth=5)
rfc=RandomForestClassifier(n_estimators=50,random_state=2)
knc=KNeighborsClassifier()
clfs={
    'NB':mnb,
    'LR':lrc,
    'DT':dtc,
    'RF':rfc,
    'KN':knc
}
# train_classifier is defined in an earlier cell and is reused as-is; the
# byte-identical copy that used to sit here only shadowed that definition.

In [None]:
# Train and score every classifier on the 3000-feature TF-IDF split.
accuracy_scores=[]
precision_scores=[]
for label,model in clfs.items():
    acc,prec=train_classifier(model,X_train,y_train,X_test,y_test)

    print("For",label)
    print("Accuracy - ",acc)
    print("Precision - ",prec)

    accuracy_scores+=[acc]
    precision_scores+=[prec]

In [None]:
# Scores from the max_features=3000 run, in clfs order.
temp_df=pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores})

In [None]:
# Line the new scores up against the baseline table, matched on algorithm name.
new_df=performance_df.merge(temp_df,on='Algorithm')
new_df

In [None]:
# Model improvement 2: min-max scale the TF-IDF features and re-run every classifier.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
tfidf=TfidfVectorizer(max_features=3000)
X=tfidf.fit_transform(df['transformed_text']).toarray()
#appending the num_character col to X (np.hstack expects the arrays as one tuple)
#X=np.hstack((X,df['num_characters'].values.reshape(-1,1)))
y=df['target'].values
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)
# Learn the scaling range from the training rows only and reuse it for the
# test rows; fitting the scaler on all of X before splitting would leak
# test-set statistics into training.
scaler=MinMaxScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)
# Fresh, unfitted estimators so the results do not depend on earlier fits.
mnb=MultinomialNB()
lrc=LogisticRegression(solver='liblinear',penalty='l1')
dtc=DecisionTreeClassifier(max_depth=5)
rfc=RandomForestClassifier(n_estimators=50,random_state=2)
knc=KNeighborsClassifier()
clfs={
    'NB':mnb,
    'LR':lrc,
    'DT':dtc,
    'RF':rfc,
    'KN':knc
}
# train_classifier is defined in an earlier cell and is reused as-is rather
# than being redefined here.
accuracy_scores=[]
precision_scores=[]
for name,clf in clfs.items():

    current_accuracy,current_precision = train_classifier(clf,X_train,y_train,X_test,y_test)

    print("For",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

In [None]:
# Scores from the scaled run, joined onto the earlier comparison table by algorithm name.
temp_df=pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_scaling':accuracy_scores,'Precision_scaling':precision_scores})
new_df_scaled=new_df.merge(temp_df,on='Algorithm')
new_df_scaled