# Imported Dataset
---

In [None]:
from pathlib import Path

import kagglehub
import pandas as pd

# Download the latest version of the dataset; the returned value is the local
# location that spam.csv is read from below.
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

# Build the file path with pathlib instead of "+" string concatenation so the
# separator is handled for us. The latin-1 encoding is unchanged.
df = pd.read_csv(Path(path) / "spam.csv", encoding='latin-1')


# Basic Info
---

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Remove the three "Unnamed" columns in place; only v1 and v2 are used later.
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [None]:
# Give the two remaining columns descriptive names: v1 becomes the class
# label and v2 becomes the message text.
df.rename({'v1': 'label', 'v2': 'text'}, axis='columns', inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
# Drop fully duplicated rows in place, keeping the first occurrence of each
# (keep='first' is the pandas default, spelled out here).
df.drop_duplicates(keep='first', inplace=True)

In [None]:
df.shape

# EDA
---

In [None]:
# Pie chart of the class balance, each slice annotated with its percentage.
label_counts = df['label'].value_counts()
label_counts.plot.pie(autopct='%.2f%%')


# Feature Engineering
---

In [None]:
# Add a 'length' column holding the number of characters in each message.
df['length'] = df['text'].map(len)

In [None]:
df.head()

# Data Processing
---

In [None]:
# Encode the target column as integers.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['label'])
df['label'] = le.transform(df['label'])

# Resulting mapping:
# ham --> 0
# spam --> 1

In [None]:
df.head()

In [None]:
!pip install nltk

In [None]:
#transformation on text column
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
nltk.download('punkt_tab') # Download the  resource
nltk.download('stopwords')

def transfrom_text(text):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    Steps: lowercase, tokenize with ``nltk.word_tokenize``, keep only
    alphanumeric tokens, drop English stopwords, then Porter-stem what is left.

    Args:
        text: The raw message string.

    Returns:
        The processed tokens joined by single spaces ("" when nothing is left).
    """
    # Fetch the stopword list once per call and keep it as a set. The previous
    # version called stopwords.words('english') for every token and then
    # scanned the resulting list linearly.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()

    tokens = nltk.word_tokenize(text.lower())  # separate the words

    # isalnum() already rejects any token containing punctuation, so a
    # separate string.punctuation check can never filter anything further.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )

In [None]:
transfrom_text("I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.")

In [None]:
df['transfrom_text'] = df['text'].apply(transfrom_text)

In [None]:
df.head()

In [None]:
from wordcloud import WordCloud

# Word cloud built from the raw text of every spam message (label == 1),
# concatenated into one space-separated string.
wc = WordCloud(width=500, height=500, background_color='white')
spam_messages = df.loc[df['label'] == 1, 'text']
spam_wc = wc.generate(spam_messages.str.cat(sep=" "))

In [None]:
import matplotlib.pyplot as plt

# Render the spam word cloud. The axes would only show pixel coordinates,
# which mean nothing for a word cloud, so hide them. plt.show() keeps the
# cell from echoing the AxesImage repr.
plt.figure(figsize=(10, 10))
plt.imshow(spam_wc)
plt.axis('off')
plt.show()

In [None]:
df.head()

# Model building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [None]:
# Bag-of-words features from the preprocessed text column.
# NOTE(review): the vocabulary is fitted on the full corpus here, before the
# train/test split further down, so tokens that occur only in test messages
# still become features. Fitting on the training rows only (and calling
# cv.transform on the test rows) would avoid that leakage.
x = cv.fit_transform(df['transfrom_text'])

In [None]:
df['label'].values

In [None]:
y = df['label'].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=2)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs produce the same model and metrics; the
# value matches the random_state already used for the train/test split.
model = DecisionTreeClassifier(random_state=2)
model.fit(x_train, y_train)


In [None]:
y_pred = model.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

# Score the predictions currently held in y_pred against the held-out labels.
acc_score = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
pre_score = precision_score(y_test, y_pred)

for metric_name, metric_value in (
    ('accuracy_score', acc_score),
    ('confusion_matrix', matrix),
    ('precision_score', pre_score),
):
    print(f'{metric_name} :{metric_value}')


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
# Plot the confusion matrix held in `matrix`, using the decision tree's class
# labels for the axis ticks.
disp = ConfusionMatrixDisplay(confusion_matrix = matrix , display_labels = model.classes_)
disp.plot(cmap = 'Blues')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs produce the same model and metrics; the
# value matches the random_state already used for the train/test split.
model_2 = RandomForestClassifier(random_state=2)
model_2.fit(x_train, y_train)

In [None]:
y_pred = model_2.predict(x_test)

In [None]:
# Re-score using the predictions currently held in y_pred; this overwrites
# the acc_score / matrix / pre_score values from the earlier evaluation.
acc_score = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
pre_score = precision_score(y_test, y_pred)

for metric_name, metric_value in (
    ('accuracy_score', acc_score),
    ('confusion_matrix', matrix),
    ('precision_score', pre_score),
):
    print(f'{metric_name} :{metric_value}')

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the random forest's confusion matrix. The tick labels come from
# model_2, the estimator whose predictions this matrix summarises, rather
# than from the decision tree (`model`) as before.
disp = ConfusionMatrixDisplay(
    confusion_matrix=matrix,
    display_labels=model_2.classes_,
)
disp.plot(cmap='Oranges')
plt.show()

# Grid search with cross-validation to find the best RandomForest parameters

In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
# Hyperparameter grid for the random forest search:
# 2 * 1 * 3 * 3 = 18 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
)


In [None]:
# 5-fold cross-validated search over param_grid, ranking candidates by accuracy.
# NOTE(review): the evaluation cells also report precision; confirm accuracy
# is the metric this search should optimise.
grid = GridSearchCV(estimator= model_2 , param_grid = param_grid, cv = 5 , scoring = 'accuracy')

In [None]:
grid.fit(x_train , y_train)

In [None]:
print("Best parameters:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)

# Save Model

In [None]:
best_model = grid.best_estimator_

In [None]:
import joblib

# Persist the tuned model together with the fitted CountVectorizer. The model
# can only score text vectorized with the same vocabulary, so the saved model
# is not usable in a new session without the vectorizer.
joblib.dump(best_model, 'best_model.joblib')
joblib.dump(cv, 'count_vectorizer.pkl')

In [None]:
# NOTE(review): assuming joblib falls back to standard pickling here, a plain
# function is stored by reference (module and name), not by its code. The
# file will then only load where a `transfrom_text` function is available
# under the same module path - confirm this is what the consumer expects.
# The output file name spells "transform" correctly; the function name does not.
joblib.dump(transfrom_text,'transform_text.pkl')

In [None]:
# Preprocess the input text with the same function applied to the training text
transformed_input = transfrom_text('hello')

# Vectorize with the CountVectorizer that was fitted during training so the
# feature columns line up with what the model saw.
# This relies on `cv` still being in memory from the earlier cell. In a fresh
# session it has to be restored first, e.g. with joblib.load(...) if it was
# saved to disk.

# Since we are in the same session, use the existing 'cv' object
transformed_input_vector = cv.transform([transformed_input])

# Make prediction
prediction = best_model.predict(transformed_input_vector)

# Print the first element of the prediction result, i.e. the encoded label
# for the single input message
print(prediction[0])