## Main Libraries

In [2]:
# Reading Data & Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Extraction & Encoder & Splitting Data
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# Building Model
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [3]:
# Load the mail dataset from a local CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
MAIL_DATA_CSV = r"D:\Courses language programming\5_Machine Learning\Dataset For Machine Learning\Spam_Mail\mail_data.csv"

data = pd.read_csv(MAIL_DATA_CSV)

In [4]:
# Preview the first five rows (five is the default row count of head()).
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Count the missing values in each column.
data.isna().sum()

Category    0
Message     0
dtype: int64

In [6]:
# Show how many messages fall into each category (class balance check).
data.loc[:, "Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

## Spam ==> 1,  Ham ==> 0

In [7]:
# Encode the text labels as integers: spam -> 1, ham -> 0.
# Series.map builds the encoded column in one pass instead of writing ints into
# the existing text column through masked assignments, which leaves the column
# as a mixed object dtype.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run after the
# labels have already been encoded.
# Any label outside the mapping becomes NaN, so the integer cast below raises
# instead of silently keeping an unexpected string label.
label_codes = {"spam": 1, "ham": 0, 1: 1, 0: 0}
data["Category"] = data["Category"].map(label_codes).astype("int")

In [8]:
# Preview the first five rows again to check the encoded Category column.
data.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Features are the message texts; the target is the Category label column.
x_input, y_output = data["Message"], data["Category"]

# Use 70% of the rows for training and keep the rest for testing.
# The fixed random_state makes the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_input,
    y_output,
    train_size=0.7,
    random_state=42,
)

## ----------------------------------------------------------------------------------------------------

In [11]:
# Display the test-split messages (pandas abbreviates the output of a long Series).
x_test

3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
                              ...                        
2505    Hello, my boytoy! I made it home and my consta...
2525    FREE entry into our £250 weekly comp just send...
4975    Aiyo u so poor thing... Then u dun wan 2 eat? ...
650     You have won ?1,000 cash or a ?2,000 prize! To...
4463    Sorry I flaked last night, shit's seriously go...
Name: Message, Length: 1672, dtype: object

## Feature Extraction

In [12]:
# TF-IDF vectorizer for the message texts: lowercase the text, drop English
# stop words and keep every term that occurs in at least one document.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)

In [14]:
# Fit the vectorizer on the training messages only, then apply the already
# fitted vectorizer to the test messages so both share the same feature space.
new_x_train = feature_extraction.fit_transform(x_train)
new_x_test = feature_extraction.transform(x_test)

# Cast both label Series to an integer dtype before fitting the models.
y_train, y_test = (labels.astype("int") for labels in (y_train, y_test))

# Building model

### Model 1 ==> LogisticRegression

In [15]:
# Model 1: logistic regression with its default settings, fitted on the
# TF-IDF training features.
model1 = LogisticRegression().fit(new_x_train, y_train)

# score() is evaluated on the training split and on the test split.
lr_train_score = model1.score(new_x_train, y_train)
lr_test_score = model1.score(new_x_test, y_test)

print(f"The Accuracy Training Data Score is {lr_train_score}")
print(f"The Accuracy Testing Data Score is {lr_test_score}")

The Accuracy Training Data Score is 0.963076923076923
The Accuracy Testing Data Score is 0.9659090909090909


### Model 2 ==> XGBoost_Classifier

In [16]:
# Model 2: XGBoost classifier with its default settings, fitted on the same
# TF-IDF training features as model 1.
model2 = xgb.XGBClassifier()
model2.fit(new_x_train, y_train)

# score() is evaluated on the training split and on the test split.
xgb_scores = {
    "Training": model2.score(new_x_train, y_train),
    "Testing": model2.score(new_x_test, y_test),
}

print(f"The Accuracy Training Data Score is {xgb_scores['Training']}")
print(f"The Accuracy Testing Data Score is {xgb_scores['Testing']}")

The Accuracy Training Data Score is 0.9907692307692307
The Accuracy Testing Data Score is 0.9772727272727273


### Grid Search ==> Find the Best Hyperparameters for the Model

In [13]:
# Hyperparameter grid for the XGBoost classifier:
# 10 n_estimators values x 5 max_depth values x 5 max_leaves values = 250 candidates.
param_grid = {
    "n_estimators": list(range(10, 200, 20)),
    "max_depth": list(range(10, 100, 20)),
    "max_leaves": list(range(10, 100, 20)),
}

# The base estimator does not pin n_estimators / max_depth / max_leaves: the grid
# sets all three for every candidate, so fixed values here (the original had
# max_depth=1000) would never be used and would only mislead the reader.
# n_jobs=1 keeps each XGBoost fit single-threaded because GridSearchCV already
# runs the fits in parallel (n_jobs=-1); letting both layers use every core
# makes them compete for the same CPUs.
model_grid = GridSearchCV(
    estimator=xgb.XGBClassifier(n_jobs=1),
    param_grid=param_grid,
    cv=10,  # 10-fold cross-validation for every candidate
    verbose=6,
    n_jobs=-1,
    scoring="accuracy",
)

model_grid.fit(new_x_train, y_train)

Fitting 10 folds for each of 250 candidates, totalling 2500 fits


In [14]:
# Report the hyperparameter combination selected by the grid search.
best_params = model_grid.best_params_
print(f"The Best Param is {best_params}")

print("-" * 50)

# Score the fitted grid search on the training split and on the test split.
grid_train_score = model_grid.score(new_x_train, y_train)
grid_test_score = model_grid.score(new_x_test, y_test)

print(f"The Accuracy Training Data is {grid_train_score}")
print(f"The Accuracy Testing Data is {grid_test_score}")

The Best Param is {'max_depth': 10, 'max_leaves': 10, 'n_estimators': 70}
--------------------------------------------------
The Accuracy Training Data is 0.992051282051282
The Accuracy Testing Data is 0.9760765550239234
