In [85]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import nltk
import seaborn as sns
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
import pickle
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier

ps= PorterStemmer()
df= pd.read_csv('spam.csv')
df.sample(5)


#1. Data Cleaning

#drop last 3 columns since most values are null
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)
#renaming the columns and we specify the names using a dictionary
df.rename(columns={'v1':'target','v2':'text'},inplace=True)

encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])
df.head()
#check for missing values
df.isnull().sum()
#check for duplicate values
df.duplicated().sum()
#remove duplicate values
df = df.drop_duplicates(keep = 'first')

df.duplicated().sum()

#2. EDA: Exploratory Data Analysis
df['target'].value_counts()

#plt.pie(df['target'].value_counts(),labels=['ham','spam'],autopct="%0.2f")

#results of pie chart show that 87% sms are not spam and the rest 13% are spam
#Data is unbalanced since ham >spam
#plt.show()

df['num_characters']=df['text'].apply(len)
df.head()
#num of words 
#nltk.download('punkt')
df['num_words'] = df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

df['num_sentences'] = df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

df[['num_characters','num_words','num_sentences']].describe()

#describe method on ham messages
df[df['target'] == 0][['num_characters','num_words','num_sentences']].describe()

#describe method on spam messages
df[df['target'] == 1][['num_characters','num_words','num_sentences']].describe()

#sns.heatmap(df.corr(),annot=True)

#3. Data preprocessing
##(i) Lower case
##(ii) Tokenization
##(iii) Removing Special Characters
##(iv) Removing stop words and punctuations
##(v) Stemming
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))
    
            
    return " ".join(y)

df['transformed_text'] = df['text'].apply(transform_text)
wc= WordCloud(width=500,height=500,min_font_size=10,background_color='white')
spam_wc = wc.generate(df[df['target']==1]['transformed_text'].str.cat(sep=" "))
# plt.figure(figsize=(15,6))
# plt.imshow(spam_wc)
ham_wc = wc.generate(df[df['target']==0]['transformed_text'].str.cat(sep=" "))
# plt.figure(figsize=(15,6))
# plt.imshow(ham_wc)
df.head()

spam_corpus=[]
for msg in df[df['target']== 1]['transformed_text'].tolist():
    for word in msg.split():
        spam_corpus.append(word)
len(spam_corpus)

ham_corpus=[]
for msg in df[df['target']== 0]['transformed_text'].tolist():
    for word in msg.split():
        ham_corpus.append(word)
len(ham_corpus)

#4. Model Building
#Naive based algorithm
cv = CountVectorizer
tfidf = TfidfVectorizer(max_features=3000)

X = tfidf.fit_transform(df['transformed_text']).toarray()

#appending the num_character col to X
#X = np.hstack((X,df['num_characters'].values.reshape(-1,1)))

#X = cv.fit_transform(df['transformed_text']).toarray()


y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=2)
#gnb = GaussianNB()
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred2 = mnb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred2))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred2))
print("Precision:", precision_score(y_test, y_pred2))

#bnb = BernoulliNB()

# gnb.fit(X_train,y_train)
# y_pred1 = gnb.predict(X_test)
# print(accuracy_score(y_test,y_pred1))
# print(confusion_matrix(y_test,y_pred1))
# print(precision_score(y_test,y_pred1))


# bnb.fit(X_train,y_train)
# y_pred3 = bnb.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred3))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred3))
# print("Precision:", precision_score(y_test, y_pred3))


svc = SVC(kernel='sigmoid', gamma=1.0)
#mnb = MultinomialNB()
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)


clfs = {
  'SVC' : svc,
  'NB': mnb, 
  'RF': rfc,
  'ETC': etc
}

def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    
    return accuracy,precision
accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():
    
    current_accuracy,current_precision = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores}).sort_values('Precision_max_ft_3000',ascending=False)
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_scaling':accuracy_scores,'Precision_scaling':precision_scores}).sort_values('Precision_scaling',ascending=False)
new_df = performance_df.merge(temp_df,on='Algorithm')
new_df_scaled = new_df.merge(temp_df,on='Algorithm')
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_num_chars':accuracy_scores,'Precision_num_chars':precision_scores}).sort_values('Precision_num_chars',ascending=False)
new_df_scaled.merge(temp_df,on='Algorithm')

# Voting Classifier
svc = SVC(kernel='sigmoid', gamma=1.0,probability=True)
#mnb = MultinomialNB()
#etc = ExtraTreesClassifier(n_estimators=50, random_state=2)


voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)],voting='soft')

voting.fit(X_train, y_train)

y_pred= voting.predict(X_test)

#Applying Stacking
estimators=[('svm', svc), ('nb', mnb), ('et', etc)]
final_estimator=RandomForestClassifier()
clf= StackingClassifier(estimators=estimators,final_estimator=final_estimator)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))


pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model.pkl','wb'))


print(X_train.shape)
print(X_test.shape)


#5. Evaluation
#6. Improvements
#7. Website 

Accuracy: 0.971953578336557
Confusion Matrix:
 [[896   0]
 [ 29 109]]
Precision: 1.0
For  SVC
Accuracy -  0.9758220502901354
Precision -  0.9747899159663865
For  NB
Accuracy -  0.971953578336557
Precision -  1.0
For  RF
Accuracy -  0.9748549323017408
Precision -  0.9827586206896551
For  ETC
Accuracy -  0.9796905222437138
Precision -  0.975609756097561
Accuracy 0.9806576402321083
Precision 0.946969696969697
(4135, 3000)
(1034, 3000)


In [None]:
!pip install --upgrade scikit-learn


In [74]:
!pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp39-cp39-win_amd64.whl (8.4 MB)
     ---------------------------------------- 8.4/8.4 MB 10.1 MB/s eta 0:00:00
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     ------------------------------------- 298.0/298.0 kB 18.0 MB/s eta 0:00:00
Installing collected packages: joblib, scikit-learn


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Sharmi Dev Gupta\\AppData\\Local\\Programs\\Python\\Python39\\Lib\\site-packages\\~klearn\\.libs\\vcomp140.dll'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 23.0.1 -> 23.1
[notice] To update, run: python.exe -m pip install --upgrade pip


  Attempting uninstall: joblib
    Found existing installation: joblib 1.0.1
    Uninstalling joblib-1.0.1:
      Successfully uninstalled joblib-1.0.1
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.2
    Uninstalling scikit-learn-0.24.2:
      Successfully uninstalled scikit-learn-0.24.2


In [42]:
!pip install wordcloud


Collecting wordcloud
  Downloading wordcloud-1.8.2.2-cp39-cp39-win_amd64.whl (153 kB)
     -------------------------------------- 153.1/153.1 kB 2.3 MB/s eta 0:00:00
Installing collected packages: wordcloud
Successfully installed wordcloud-1.8.2.2



[notice] A new release of pip is available: 23.0.1 -> 23.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Sharmi Dev
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [25]:
!pip install seaborn


Collecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
     -------------------------------------- 293.3/293.3 kB 1.2 MB/s eta 0:00:00
Installing collected packages: seaborn
Successfully installed seaborn-0.12.2



[notice] A new release of pip is available: 23.0.1 -> 23.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 4.8 MB/s eta 0:00:00
Collecting tqdm
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
     ---------------------------------------- 77.1/77.1 kB ? eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2023.3.23-cp39-cp39-win_amd64.whl (267 kB)
     ------------------------------------- 268.0/268.0 kB 17.2 MB/s eta 0:00:00
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.8.1 regex-2023.3.23 tqdm-4.65.0



[notice] A new release of pip is available: 23.0.1 -> 23.1
[notice] To update, run: python.exe -m pip install --upgrade pip


nltk.download('punkt')