## Main Library

In [1]:
# Reading Data & Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import r2_score, accuracy_score

# Building Neural Network
import tensorflow as tf
import tensorflow.keras as k


# Building model
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Reading Data
data = pd.read_csv(r"D:\Courses language programming\5_Machine Learning\Dataset For Machine Learning\Spam_Mail\mail_data.csv")

In [3]:
data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.isnull().sum()

Category    0
Message     0
dtype: int64

In [5]:
data["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

#### Data Not have null value and have over data to ham

In [6]:
data.loc[data["Category"] == "spam", "Category"] = 1
data.loc[data["Category"] == "ham", "Category"] = 0

In [7]:
data.head(5)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
data["Message"].value_counts()

Message
Sorry, I'll call later                                                                                                                                      30
I cant pick the phone right now. Pls send a message                                                                                                         12
Ok...                                                                                                                                                       10
Ok                                                                                                                                                           4
Ok.                                                                                                                                                          4
                                                                                                                                                            ..
I gotta collect da car at 6 lei.      

## Make Split ==> Data To Building Model

In [9]:
x_input = data["Message"]
y_output = data["Category"]

x_train, x_test, y_train, y_test = train_test_split(x_input, y_output, train_size=0.7, random_state=42)

In [10]:
x_test

3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
                              ...                        
2505    Hello, my boytoy! I made it home and my consta...
2525    FREE entry into our £250 weekly comp just send...
4975    Aiyo u so poor thing... Then u dun wan 2 eat? ...
650     You have won ?1,000 cash or a ?2,000 prize! To...
4463    Sorry I flaked last night, shit's seriously go...
Name: Message, Length: 1672, dtype: object

##  ------------------------------------------------------------------------------------------------------
### Make Feature Extraction - To transform Test ---> numeric Vector 
##  ------------------------------------------------------------------------------------------------------

In [11]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

In [12]:
new_x_train = feature_extraction.fit_transform(x_train)
new_x_test = feature_extraction.transform(x_test)

y_train = y_train.astype("int")
y_test = y_test.astype("int")

##  ------------------------------------------------------------------------------------------------------
##  Make over sampling 
##  ------------------------------------------------------------------------------------------------------

In [13]:
new_x, new_y = SMOTE().fit_resample(new_x_train, y_train)
new_x2, new_y2 = SMOTE().fit_resample(new_x_test, y_test)

In [14]:
new_y.value_counts(), new_y2.value_counts()

(Category
 0    3377
 1    3377
 Name: count, dtype: int64,
 Category
 0    1448
 1    1448
 Name: count, dtype: int64)

## Building Neural Network

In [15]:
# model = k.models.Sequential([
#     k.layers.Dense(128, activation="relu"),
#     k.layers.Dense(1, activation="sigmoid")
# ])

# model.compile(optimizer="adam", 
#                   loss=k.losses.CategoricalCrossentropy(), 
#                  metrics=["accuracy"])
# # k.metrics.binary_accuracy

In [16]:
# model.fit(new_x_train, y_train, validation_data=(new_x_test, y_test))

## Building Model => AdaBoostClassifier

#### Training without oversampling

In [17]:
Adaboost = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=100, 
                                                               min_samples_split=5,
                                                               min_samples_leaf=6),
                              n_estimators=200,
                              learning_rate=0.2)


Adaboost.fit(new_x_train, y_train)

print(f"The predict Score Train is ==> {Adaboost.score(new_x_train, y_train)}")
print("%----------------------------------------------------------%")
print(f"The predict Score Test is ==> {Adaboost.score(new_x_test, y_test)}")

The predict Score Train is ==> 0.9992307692307693
%----------------------------------------------------------%
The predict Score Test is ==> 0.97188995215311


#### Training with  oversampling

In [18]:
Adaboost_smote = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=50,
                                                                     min_samples_leaf=6,
                                                                     min_samples_split=5),
                                    n_estimators=100,
                                    learning_rate=0.1)


Adaboost_smote.fit(new_x, new_y)

print(f"The predict Score Train is ==> {Adaboost_smote.score(new_x, new_y)}")
print("%----------------------------------------------------------%")
print(f"The predict Score Test is ==> {Adaboost_smote.score(new_x2, new_y2)}")

The predict Score Train is ==> 0.9998519395913533
%----------------------------------------------------------%
The predict Score Test is ==> 0.9651243093922652


## Building System Predictor

In [78]:
new_text = pd.DataFrame(data=[input()], columns=["Message"])

text = feature_extraction.transform(new_text)

prediction = Adaboost.predict(text)

if prediction[-1] == 0: print("Ham")
else: print("Spam")

Did you hear about the new ""Divorce Barbie""? It comes with all of Ken's stuff!
Ham


In [81]:
print(new_text["Message"])
print("\n")
print(prediction)

0    Did you hear about the new ""Divorce Barbie""?...
Name: Message, dtype: object


[0]
