In [1]:
import pandas as pd

# Load the CSV file with the correct encoding
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

# Ensure the necessary columns exist and are named correctly
# Uncomment and modify the line below if necessary to rename columns
# df = df.rename(columns={'v1': 'target', 'v2': 'transformed_text'})

# Display the first few rows to confirm the data is loaded correctly
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)


In [2]:
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.head(3)

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [3]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [4]:
df['target']=encoder.fit_transform(df['target'])
df.drop_duplicates(keep='first',inplace=True)

In [5]:
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
import nltk
nltk.download('punkt')
df['num_characters']=df['text'].apply(len)
df['num_words']=df['text'].apply(lambda x:len(nltk.word_tokenize(x)))
df['num_sentences']=df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))
from nltk.corpus import stopwords
import string 
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TANIYA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
def transform_text(text):
    text=text.lower()
    text=nltk.word_tokenize(text)
    y=[]
    for i in text:
        if i.isalnum():
            y.append(i)
    text=y[:]
    y.clear()
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
    text=y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
    return " ".join(text)

In [8]:
transform_text(df['text'][10])

'gon na home soon want talk stuff anymore tonight k cried enough today'

In [9]:
df['transformed_text']=df['text'].apply(transform_text)


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score

# Train the CountVectorizer
cv = CountVectorizer(max_features=3000)
X = cv.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Train the MultinomialNB model
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# Check training results
y_pred = mnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Assuming multi-class classification

print(f"Accuracy: {accuracy}, Precision: {precision}")


Accuracy: 0.97678916827853, Precision: 0.9766607239105574


In [13]:
import pickle

# Save the CountVectorizer
with open('true_vectorizers.pkl', 'wb') as file:
    pickle.dump(cv, file)
    print("Vectorizer saved successfully")

# Save the trained MultinomialNB model
with open('true_models.pkl', 'wb') as file:
    pickle.dump(mnb, file)
    print("Model saved successfully")


Vectorizer saved successfully
Model saved successfully


In [2]:
import pickle

In [3]:
# Load the CountVectorizer
with open('true_vectorizers.pkl', 'rb') as file:
    loaded_cv = pickle.load(file, encoding='ISO-8859-1')
    print("Vectorizer loaded successfully")

# Load the trained MultinomialNB model
with open('true_models.pkl', 'rb') as file:
    loaded_mnb = pickle.load(file ,encoding='ISO-8859-1')
    print("Model loaded successfully")


Vectorizer loaded successfully
Model loaded successfully


In [4]:
sample_text = ["example text to transform"]
transformed_sample = loaded_cv.transform(sample_text).toarray() # 2d array countvectorizer step

# Predict using the loaded MultinomialNB model
prediction = loaded_mnb.predict(transformed_sample)
print(prediction)

[0]


In [6]:
sample_texts = [
    "example text to transform",
    "free money win now",
    "please confirm your subscription",
    "schedule a meeting for tomorrow",
    "urgent: your account has been compromised"
]

# Transform the sample texts using the loaded CountVectorizer
transformed_samples = loaded_cv.transform(sample_texts).toarray()

# Predict using the loaded MultinomialNB model
predictions = loaded_mnb.predict(transformed_samples)

# Print predictions for each sample text
for text, prediction in zip(sample_texts, predictions):
    print(f"Text: {text} -> Prediction: {prediction}")

Text: example text to transform -> Prediction: 0
Text: free money win now -> Prediction: 1
Text: please confirm your subscription -> Prediction: 1
Text: schedule a meeting for tomorrow -> Prediction: 0
Text: urgent: your account has been compromised -> Prediction: 1
