### Machine Learning Pipeline - The Experiments of the Entire Wrokflow

#### Basic Imports ??

In [1]:
# Importing necessary libraries
import numpy as np                # For numerical operations
import pandas as pd               # For data manipulation and analysis
import matplotlib.pyplot as plt   # For data visualization
%matplotlib inline

# Importing WordCloud for text visualization
from wordcloud import WordCloud

# Importing NLTK for natural language processing
import nltk
from nltk.corpus import stopwords    # For stopwords


# Downloading NLTK data
nltk.download('stopwords')   # Downloading stopwords data
nltk.download('punkt')       # Downloading tokenizer data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Read the CSV file
df = pd.read_csv("spam.csv")

df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Display the last few rows of the DataFrame
df.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [5]:
# Sum of All Nulls 
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [6]:
# So Remove These 3 Nulls Columns
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Rename the columns name
df.rename(columns={'v1': 'target', 'v2': 'text'}, inplace=True)

df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Data Preprocessing ??

In [8]:
df["target"]

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: target, Length: 5572, dtype: object

In [9]:
df["target"].value_counts()

target
ham     4825
spam     747
Name: count, dtype: int64

In [10]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

## Apply Label Encoding For Target Col Wgich Consist of 2 Values (ham -> 0, spam -> 1)
df["target"] = encoder.fit_transform(df["target"])

df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
## Checking duplicate values
df.duplicated().sum()

np.int64(403)

In [12]:
len(df)

5572

In [None]:
## Removing Duplicate
df = df.drop_duplicates(keep='first')

In [14]:
len(df)

5169

#### Feature Engineering ??

In [None]:
## Importing the Porter Stemmer for text stemming
from nltk.stem.porter import PorterStemmer

## Importing the string module for handling special characters
import string

## Creating an instance of the Porter Stemmer
ps = PorterStemmer()

##### Stemming Is Nothing, but Only I Returning The Word To Its Root Word
e.g. 
- played -> play
- playing -> play
- plays -> play
- player -> play

In [16]:
## Lowercase transformation and text preprocessing function
def transform_text(text):
    # Transform the text to lowercase
    text = text.lower()
    
    # Tokenization using NLTK
    text = nltk.word_tokenize(text)
    
    # Removing special characters
    y = []
    for c in text:
        if c.isalnum():
            y.append(c)
    
    # Removing stop words and punctuation
    text = y[:]
    y.clear()
    
    # Loop through the tokens and remove stopwords and punctuation
    for c in text:
        if c not in stopwords.words('english') and c not in string.punctuation:
            y.append(c)
    
    # Stemming using Porter Stemmer
    text = y[:]
    y.clear()
    for c in text:
        y.append(ps.stem(c))
    
    # Join the processed tokens back into a single string
    return " ".join(y)

In [17]:
transform_text('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [18]:
df["transformed_text"] = df["text"].apply(transform_text)

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["transformed_text"] = df["text"].apply(transform_text)


Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidf = TfidfVectorizer(max_features=500)

In [20]:
X = tfidf.fit_transform(df["transformed_text"]).toarray()

y = df["target"].values

In [21]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(5169, 500))

In [22]:
y

array([0, 0, 1, ..., 0, 0, 0], shape=(5169,))

#### Train Test Split ??

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
print(f"X_train Shape -> {X_train.shape}")
print(f"X_test Shape -> {X_test.shape}")
print(f"y_train Shape -> {y_train.shape}")
print(f"y_test Shape -> {y_test.shape}")

X_train Shape -> (4135, 500)
X_test Shape -> (1034, 500)
y_train Shape -> (4135,)
y_test Shape -> (1034,)


#### Start Model Training ??

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [27]:
svc = SVC(kernel="sigmoid", gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=42)
abc = AdaBoostClassifier(n_estimators=50, random_state=42)
bc = BaggingClassifier(n_estimators=50, random_state=42)
etc = ExtraTreesClassifier(n_estimators=50, random_state=42)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=42)
xgb  = XGBClassifier(n_estimators=50, random_state=42)

In [42]:
## Make a Dictionary Of These Model To Make Easy To Deal
clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
}

#### Model Evaluation ??

In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [43]:
def train_classifier (clfs, X_train, y_train, X_test, y_test):
    clfs.fit(X_train, y_train)
    
    y_pred = clfs.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    return accuracy, precision, recall, f1

In [33]:
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

In [46]:
for name, clf in clfs.items():
    curr_accuracy, curr_precision, curr_recall, curr_f1 = train_classifier(clf, X_train, y_train, X_test, y_test)

    print(f"For {name}")
    print(f"Accuracy: {curr_accuracy}")
    print(f"Precision: {curr_precision}")
    print(f"Recall: {curr_recall}")
    print(f"F1: {curr_f1}")

    print("\n------------------------------\n")

    accuracy_scores.append(curr_accuracy)
    precision_scores.append(curr_precision)
    recall_scores.append(curr_recall)
    f1_scores.append(curr_f1)

For SVC
Accuracy: 0.9709864603481625
Precision: 0.952755905511811
Recall: 0.8344827586206897
F1: 0.8897058823529411

------------------------------

For KNN
Accuracy: 0.9294003868471954
Precision: 0.9736842105263158
Recall: 0.5103448275862069
F1: 0.669683257918552

------------------------------

For NB
Accuracy: 0.9758220502901354
Precision: 0.9838709677419355
Recall: 0.8413793103448276
F1: 0.9070631970260223

------------------------------

For DT
Accuracy: 0.9303675048355899
Precision: 0.8288288288288288
Recall: 0.6344827586206897
F1: 0.71875

------------------------------

For LR
Accuracy: 0.9564796905222437
Precision: 0.8968253968253969
Recall: 0.7793103448275862
F1: 0.8339483394833949

------------------------------

For RF
Accuracy: 0.9729206963249516
Precision: 0.968
Recall: 0.8344827586206897
F1: 0.8962962962962963

------------------------------

For Adaboost
Accuracy: 0.9158607350096711
Precision: 0.7959183673469388
Recall: 0.5379310344827586
F1: 0.6419753086419753

-------