In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [None]:
df.sample(5)

In [None]:
df.shape

In [None]:
# steps -> Data Cleaning -> EDA -> Text Preprocessing -> Model Building -> Evaluation -> Improvement -> Website -> Deploy

## 1. Data Cleaning

In [None]:
df.info()

In [None]:
# The last three columns are almost entirely null, so we drop them.

In [None]:
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
df.sample(5)

In [None]:
# renaming the columns
df.rename(columns = {'v1': 'target','v2':'text'}, inplace = True)

In [None]:
df.sample(5)

In [None]:
# 9:41

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
encoder = LabelEncoder()

In [None]:
encoder.fit_transform(df['target'])

In [None]:
df['target'] = encoder.fit_transform(df['target'])

In [None]:
df.sample(5)

In [None]:
# missing values
df.isnull().sum()

In [None]:
# check for duplicated values
df.duplicated().sum()

In [None]:
df = df.drop_duplicates(keep = 'first')

In [None]:
df.head()

In [None]:
df.duplicated().sum()

In [None]:
df.shape

## 2. EDA (Exploratory Data Analysis)

In [None]:
df.value_counts()

In [None]:
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Pair every slice with the label of the class it actually counts instead of
# relying on value_counts() happening to return the classes in a fixed order.
target_counts = df['target'].value_counts()
# assumes the raw labels were 'ham' and 'spam' (encoded 0 and 1) - TODO confirm
target_names = {0: 'ham', 1: 'spam'}
plt.pie(
    target_counts,
    labels=[target_names.get(code, code) for code in target_counts.index],
    autopct="%0.2f",
)
plt.show()

In [None]:
# The pie chart above shows that the classes are imbalanced.

In [None]:
# NLTK (Natural Language Toolkit) provides the tokenizers used for the
# word and sentence counts below.
import nltk
nltk.download('punkt')
# Recent NLTK releases (3.9 and later) load the Punkt models from the separate
# 'punkt_tab' resource; without it word_tokenize / sent_tokenize raise LookupError.
nltk.download('punkt_tab')

In [None]:
df['text']

In [None]:
# Number of characters in each message
df['text'].apply(len)

In [None]:
df['num_characters'] = df['text'].apply(len)

In [None]:
df.head()

In [None]:
# number of words
df['text'].apply(lambda x:nltk.word_tokenize(x))

In [None]:
df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
df['num_words'] = df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
df.head()

In [None]:
df['num_sentences'] = df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [None]:
df.head()

In [None]:
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# for ham messages
df[df['target'] == 0][['num_characters','num_words','num_sentences']].describe()

In [None]:
# for spam messages
df[df['target'] == 1][['num_characters','num_words','num_sentences']].describe()

In [None]:
import seaborn as sns

In [None]:
plt.figure(figsize = (12,8))
sns.histplot(df[df['target'] == 0]['num_characters'])
sns.histplot(df[df['target'] == 1]['num_characters'], color = 'red')


In [None]:
# Distribution of the number of words per message: ham (target 0) in the
# default colour, spam (target 1) in red.
plt.figure(figsize = (12,8))
sns.histplot(df[df['target'] == 0]['num_words'])
sns.histplot(df[df['target'] == 1]['num_words'], color = 'red')


In [None]:
sns.pairplot(df,hue = 'target')

In [None]:
# Correlate only the numeric columns: the frame also holds raw text, which
# DataFrame.corr() cannot convert on pandas >= 2.0.
sns.heatmap(df.select_dtypes(include=['number']).corr())

In [None]:
sns.heatmap(df.select_dtypes(include=['number']).corr(), annot = True)

In [None]:
# Correlate only the numeric columns: the frame also holds raw text, which
# DataFrame.corr() cannot convert on pandas >= 2.0.
df.select_dtypes(include=['number']).corr()

In [None]:
df.select_dtypes(include=['number']).corr()

## 3. Data Preprocessing

In [None]:
def transform_text(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()

In [None]:
transform_text('Hello How are You')

In [None]:
def transform_text1(text):
    """Lowercase ``text`` and split it into a list of tokens.

    Tokenization is delegated to ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(text.lower())

In [None]:
# after adding word_tokenize to the function the output is 
transform_text1('Hello How are You')

In [None]:
df['text'][0]

In [None]:
# To eliminate special characters, keep only the alphanumeric tokens
def transform_text3(text):
    """Lowercase and tokenize ``text``, keeping only alphanumeric tokens.

    Returns:
        list of str: the tokens for which ``str.isalnum()`` is true, in
        their original order.
    """
    tokens = nltk.word_tokenize(text.lower())
    return [token for token in tokens if token.isalnum()]

In [None]:
# This eliminates the '%' characters from the sentence below
transform_text3('Hello How are You 20% %% eg')

In [None]:
import nltk
from nltk.corpus import stopwords
# The stopwords corpus must be available locally before it can be read;
# fetch it first so this cell does not raise LookupError on a fresh environment.
nltk.download('stopwords')
stopwords.words('english')

In [None]:
# Download the NLTK stopwords corpus so that stopwords.words('english') can be loaded
import nltk
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
import string
string.punctuation

In [None]:
# To eliminate stopwords as well, we use
def transform_text4(text):
    """Lowercase and tokenize ``text``, dropping non-alphanumeric tokens and stopwords.

    Args:
        text (str): the raw message.

    Returns:
        list of str: the remaining tokens, in their original order.
    """
    # Build the stopword lookup once per call: stopwords.words() returns a new
    # list each time and `in` on a list is a linear scan, so doing this per
    # token is needlessly slow.
    stop_words = set(stopwords.words('english'))

    # Tokens that pass isalnum() cannot consist of punctuation characters, so
    # no separate string.punctuation check is required.
    return [
        token
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]

In [None]:
# Here all the words that do not contribute to the meaning of the sentence are eliminated
transform_text4('Hello How are You 20% %% eg')

In [None]:
transform_text4('Did you like my presentation on ML?')

In [None]:
# PorterStemmer is used to reduce words to their root (stem) form
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
ps.stem('dancing')

In [None]:
# Full preprocessing: lowercase, tokenize, drop non-alphanumeric tokens and stopwords, then stem
def transform_text5(text):
    """Normalize a message into a space-separated string of stemmed tokens.

    Steps: lowercase, tokenize with ``nltk.word_tokenize``, keep only
    alphanumeric tokens, drop English stopwords, stem each remaining token
    with the module-level ``ps`` stemmer.

    Args:
        text (str): the raw message.

    Returns:
        str: the stemmed tokens joined by single spaces.
    """
    # Build the stopword lookup once per call: stopwords.words() returns a new
    # list each time and `in` on a list is a linear scan, so doing this per
    # token is needlessly slow when the function is applied to a whole column.
    stop_words = set(stopwords.words('english'))

    # Tokens that pass isalnum() cannot consist of punctuation characters, so
    # no separate string.punctuation check is required.
    stems = [
        ps.stem(token)
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    return " ".join(stems)

In [None]:
transform_text5('I Loved the YT Lectures on Machine Learniong, How about you?')

In [None]:
df['text'][0]

In [None]:
transform_text5('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')

In [None]:
df['text'].apply(transform_text5)

In [None]:
df['transformed_text'] = df['text'].apply(transform_text5)

In [None]:
df.head()

In [None]:
pip install wordcloud

In [None]:
from wordcloud import WordCloud
# 500x500 px canvas: a 50x50 px canvas leaves almost no room to place words
# at min_font_size=10.
wc = WordCloud(width = 500,height = 500, min_font_size = 10, background_color = 'black')

In [None]:
wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep = " "))

In [None]:
# Rely on the font bundled with wordcloud (the default) rather than a
# machine-specific absolute font path, and use a 500x500 px canvas so there
# is room to place words at min_font_size=10.
wc = WordCloud(width=500, height=500, min_font_size=10, background_color='black')


In [None]:
wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep = " "))