In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speead up development.

In [None]:
## Read Data for the Fraudulent Email Kaggle Challenge
data = pd.read_csv("kg_train.csv",encoding='latin-1')

# Reduce the training set to speed up development.
# Modify for final system
data = data.head(1000)
print(data.shape)
data.fillna("",inplace=True)

(1000, 2)


### Let's divide the training and test set into two partitions

In [None]:
data.describe()

Unnamed: 0,label
count,1000.0
mean,0.442
std,0.496873
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
data.dtypes

Unnamed: 0,0
text,object
label,int64


In [None]:
data.head()

Unnamed: 0,text,label
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1
1,Will do.,0
2,Nora--Cheryl has emailed dozens of memos about...,0
3,Dear Sir=2FMadam=2C I know that this proposal ...,1
4,fyi,0


In [None]:
from sklearn.model_selection import train_test_split

# Assuming 'label' column is the target variable
# X = data.drop('label', axis=1)
# y = data['label']

# Split the data into training and testing sets (e.g., 80% train, 20% test)
train, test = train_test_split(data, test_size=0.2, random_state=42)

print(f"Train set shape: {train.shape}")
print(f"Test set shape: {test.shape}")

Train set shape: (800, 2)
Test set shape: (200, 2)


## Data Preprocessing

In [None]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
print(string.punctuation)
print(stopwords.words("english")[100:110])
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Now, we have to clean the html code removing words

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
# Install BeautifulSoup4 if not already installed
!pip install beautifulsoup4



In [None]:
from bs4 import BeautifulSoup

def remove_html_tags_bs(text):
    soup = BeautifulSoup(text, 'html.parser')
    cleaned_text = soup.get_text(separator=' ', strip=True)
    return cleaned_text

# Apply the function to the 'text' column of the train DataFrame
train['preprocessed_text_bs'] = train['text'].apply(remove_html_tags_bs)

# Apply the function to the 'text' column of the test DataFrame
test['preprocessed_text_bs'] = test['text'].apply(remove_html_tags_bs)

print("HTML cleaning applied to train and test DataFrames using BeautifulSoup:")
print(train[['text', 'preprocessed_text_bs']].head())

HTML cleaning applied to train and test DataFrames using BeautifulSoup:
                                                  text  \
29   ----------- REGARDS, MR NELSON SMITH.KINDLY RE...   
535  I have not been able to reach oscar this am. W...   
695  ; Huma Abedin B6I'm checking with Pat on the 5...   
557  I can have it announced here on Monday - can't...   
836      BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...   

                                  preprocessed_text_bs  
29   ----------- REGARDS, MR NELSON SMITH.KINDLY RE...  
535  I have not been able to reach oscar this am. W...  
695  ; Huma Abedin B6I'm checking with Pat on the 5...  
557  I can have it announced here on Monday - can't...  
836  BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 San P...  



If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(text, 'html.parser')


- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters

- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
def clean_text(text):

    text = re.sub(r'[^a-zA-Z]', ' ', text) # remove special characters and numbers
    text = re.sub(r'\b\w\b', '', text) # Remove all single characters
    text = re.sub(r'^\w\s+', '', text) # Remove single characters from the start
    text = re.sub(r'\s+', ' ', text) # substitute multiple spaces with single space
    text = re.sub(r"\b[bB]'", '', text) # remove prefixed 'b'
    text = str(text).lower() # convert to Lowercase
    return text.strip()

# Apply the function to the preprocessed_text_bs column of the train DataFrame
train['preprocessed_text_final'] = train['preprocessed_text_bs'].apply(clean_text)

# Apply the function to the preprocessed_text_bs column of the test DataFrame
test['preprocessed_text_final'] = test['preprocessed_text_bs'].apply(clean_text)

print("Further cleaning applied to train and test DataFrames. Displaying first 5 rows of train with new column:")
print(train[['text', 'preprocessed_text_bs', 'preprocessed_text_final']].head())

Further cleaning applied to train and test DataFrames. Displaying first 5 rows of train with new column:
                                                  text  \
29   ----------- REGARDS, MR NELSON SMITH.KINDLY RE...   
535  I have not been able to reach oscar this am. W...   
695  ; Huma Abedin B6I'm checking with Pat on the 5...   
557  I can have it announced here on Monday - can't...   
836      BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...   

                                  preprocessed_text_bs  \
29   ----------- REGARDS, MR NELSON SMITH.KINDLY RE...   
535  I have not been able to reach oscar this am. W...   
695  ; Huma Abedin B6I'm checking with Pat on the 5...   
557  I can have it announced here on Monday - can't...   
836  BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 San P...   

                               preprocessed_text_final  
29   regards mr nelson smith kindly reply me on my ...  
535  have not been able to reach oscar this am we a...  
695  huma abedin checking 

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# nltk.download('stopwords')  # Uncomment if stopwords are not downloaded
# nltk.download('wordnet')    # Uncomment if wordnet is not downloaded
# nltk.download('omw-1.4')    # Uncomment for wordnet data

stop_words = set(stopwords.words('english'))
wordnet_lemma = WordNetLemmatizer()

stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

# Define the get_wordnet_pos function
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Tokenize the preprocessed text
tokenized_docs_no_punctuation = [word_tokenize(doc) for doc in train['preprocessed_text_final']]

processed_docs = []
for i, doc in enumerate(tokenized_docs_no_punctuation[:5]):
    final_doc = []
    for word in doc:
        # Lemmatize the word with POS tagging
        lemmatized_word = wordnet_lemma.lemmatize(word, get_wordnet_pos(word))
        if lemmatized_word.lower() not in stop_words:
            final_doc.append(lemmatized_word.lower())
    processed_docs.append(final_doc)

    print(f"\nDocument {i+1}:")
    print('Before:', ' '.join(doc))
    print('After removing stop words and lemmatizing:', ' '.join(final_doc))

print(f"\nTotal documents processed and stored in 'processed_docs': {len(processed_docs)}")


Document 1:
Before: regards mr nelson smith kindly reply me on my private email address nelsonsmith yahoo com
After removing stop words and lemmatizing: regard mr nelson smith kindly reply private email address nelsonsmith yahoo com

Document 2:
Before: have not been able to reach oscar this am we are supposed to send the pdb at can receive it
After removing stop words and lemmatizing: able reach oscar suppose send pdb receive

Document 3:
Before: huma abedin checking with pat on the will work with jack jake on rest also huma has for you follow up memo from the prep call
After removing stop words and lemmatizing: huma abedin check pat work jack jake rest also huma follow memo prep call

Document 4:
Before: can have it announced here on monday can today
After removing stop words and lemmatizing: announce monday today

Document 5:
Before: bank of africaagence san pedro bp san pedro cote ivoire west africa dear sir am mrs dorise marie francoise an accountant of auditing and accounting se

## Tame Your Text with Lemmatization
Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis!

In [None]:
# Stemming and Lemmanization

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter   = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet_lemma  = WordNetLemmatizer()

for word in words:
    print("---- ",word,"----")
    print('Porter Stemmer:',porter.stem(word))
    print('Snowball Stemmer:',snowball.stem(word))
    print('WordNet Lemmatizer (noun):',wordnet_lemma.lemmatize(word))
    print('WordNet Lemmatizer (verb):',wordnet_lemma.lemmatize(word,pos="v"))
    print()

----  running ----
Porter Stemmer: run
Snowball Stemmer: run
WordNet Lemmatizer (noun): running
WordNet Lemmatizer (verb): run

----  runs ----
Porter Stemmer: run
Snowball Stemmer: run
WordNet Lemmatizer (noun): run
WordNet Lemmatizer (verb): run

----  ran ----
Porter Stemmer: ran
Snowball Stemmer: ran
WordNet Lemmatizer (noun): ran
WordNet Lemmatizer (verb): run

----  easily ----
Porter Stemmer: easili
Snowball Stemmer: easili
WordNet Lemmatizer (noun): easily
WordNet Lemmatizer (verb): easily

----  fairly ----
Porter Stemmer: fairli
Snowball Stemmer: fair
WordNet Lemmatizer (noun): fairly
WordNet Lemmatizer (verb): fairly

----  trouble ----
Porter Stemmer: troubl
Snowball Stemmer: troubl
WordNet Lemmatizer (noun): trouble
WordNet Lemmatizer (verb): trouble

----  troubling ----
Porter Stemmer: troubl
Snowball Stemmer: troubl
WordNet Lemmatizer (noun): troubling
WordNet Lemmatizer (verb): trouble

----  troubled ----
Porter Stemmer: troubl
Snowball Stemmer: troubl
WordNet Lemmati

In [None]:
for i, doc in enumerate(tokenized_docs_no_punctuation[:5]):
    final_doc = []
    for word in doc:
        final_doc.append(porter.stem(word))
        #final_doc.append(snowball.stem(word)) # requires 'corpora/wordnet' -> nltk.download()
        #final_doc.append(wordnet_lemma.lemmatize(word)) # requires 'corpora/wordnet' -> nltk.download()
    print(f"\nDocument {i+1}:")
    print('Before:', ' '.join(doc))
    print('After Porter Stemming: ', ' '.join(final_doc))


Document 1:
Before: regards mr nelson smith kindly reply me on my private email address nelsonsmith yahoo com
After Porter Stemming:  regard mr nelson smith kindli repli me on my privat email address nelsonsmith yahoo com

Document 2:
Before: have not been able to reach oscar this am we are supposed to send the pdb at can receive it
After Porter Stemming:  have not been abl to reach oscar thi am we are suppos to send the pdb at can receiv it

Document 3:
Before: huma abedin checking with pat on the will work with jack jake on rest also huma has for you follow up memo from the prep call
After Porter Stemming:  huma abedin check with pat on the will work with jack jake on rest also huma ha for you follow up memo from the prep call

Document 4:
Before: can have it announced here on monday can today
After Porter Stemming:  can have it announc here on monday can today

Document 5:
Before: bank of africaagence san pedro bp san pedro cote ivoire west africa dear sir am mrs dorise marie franc

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert processed_docs to strings
docs_as_strings = [' '.join(doc) for doc in processed_docs]

# Create the Bag of Words model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs_as_strings)

# Show the Bag of Words feature names and the document-term matrix
print("Feature Names:", vectorizer.get_feature_names_out())
print("Document-Term Matrix:\n", X.toarray())

Feature Names: ['abedin' 'able' 'account' 'accountant' 'accounting' 'act' 'address'
 'africa' 'africaagence' 'also' 'announce' 'ansfer' 'around' 'aside'
 'assistance' 'audit' 'aware' 'bank' 'belongs' 'best' 'bonafide' 'bp'
 'call' 'care' 'check' 'claim' 'colleague' 'collection' 'com' 'come'
 'comfortable' 'confidential' 'contact' 'cote' 'crash' 'customer' 'date'
 'dear' 'decease' 'development' 'die' 'dollar' 'dorise' 'dormant' 'due'
 'eit' 'email' 'ere' 'expensis' 'faithfully' 'family' 'five' 'follow'
 'foward' 'fran' 'francoise' 'fund' 'golden' 'good' 'hearing' 'hole'
 'http' 'huma' 'hundred' 'information' 'ity' 'ivoire' 'jack' 'jake' 'kin'
 'kindly' 'lie' 'life' 'look' 'loope' 'mail' 'make' 'marie' 'memo'
 'mention' 'million' 'mod' 'monday' 'money' 'mr' 'necessary' 'nelson'
 'nelsonsmith' 'next' 'notify' 'number' 'oise' 'one' 'opportun' 'oscar'
 'ousand' 'owner' 'pat' 'pdb' 'pedro' 'plane' 'prep' 'prepare' 'presen'
 'private' 'proposal' 'protection' 'reach' 'receive' 'red' 'regard'
 

## Extra features

In [None]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
money_simbol_list = "|".join(["euro","dollar","pound","€",r"\$"])
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

train['money_mark'] = train['preprocessed_text_final'].str.contains(money_simbol_list)*1
train['suspicious_words'] = train['preprocessed_text_final'].str.contains(suspicious_words)*1
train['text_len'] = train['preprocessed_text_final'].apply(lambda x: len(x))

test['money_mark'] = test['preprocessed_text_final'].str.contains(money_simbol_list)*1
test['suspicious_words'] = test['preprocessed_text_final'].str.contains(suspicious_words)*1
test['text_len'] = test['preprocessed_text_final'].apply(lambda x: len(x))

train.head()

Unnamed: 0,text,label,preprocessed_text_bs,preprocessed_text_final,money_mark,suspicious_words,text_len
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",1,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",regards mr nelson smith kindly reply me on my ...,0,0,89
535,I have not been able to reach oscar this am. W...,0,I have not been able to reach oscar this am. W...,have not been able to reach oscar this am we a...,0,0,91
695,; Huma Abedin B6I'm checking with Pat on the 5...,0,; Huma Abedin B6I'm checking with Pat on the 5...,huma abedin checking with pat on the will work...,0,0,125
557,I can have it announced here on Monday - can't...,0,I can have it announced here on Monday - can't...,can have it announced here on monday can today,0,0,46
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,1,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 San P...,bank of africaagence san pedro bp san pedro co...,1,1,1577


## How would work the Bag of Words with Count Vectorizer concept?

In [None]:
# Initialize CountVectorizer
countvector_bigram = CountVectorizer(ngram_range=(2,2))

# Fit and transform the training data
X_train_bigram = countvector_bigram.fit_transform(train['preprocessed_text_final'])

# Transform the test data
X_test_bigram = countvector_bigram.transform(test['preprocessed_text_final'])

print(f"Shape of Bag of Words (bigrams) matrix for training data: {X_train_bigram.shape}")
print(f"Shape of Bag of Words (bigrams) matrix for test data: {X_test_bigram.shape}")

# Display some of the bigram feature names
print("\nTop 100 bigram features learned by CountVectorizer:")
bigram_feature_names = countvector_bigram.get_feature_names_out()
print(bigram_feature_names[:100])

Shape of Bag of Words (bigrams) matrix for training data: (800, 83317)
Shape of Bag of Words (bigrams) matrix for test data: (200, 83317)

Top 100 bigram features learned by CountVectorizer:
['aa ff' 'aa gr' 'aa hjqg' 'aa hotmail' 'aa kcc' 'aa kk' 'aa nh'
 'aa nxuaz' 'aa plutjamnavtjkyd' 'aa ppr' 'aa yox' 'aa yvzngm' 'aaa fqews'
 'aabeiawaeaambiqaceqedeqh gamaweaahedeqa' 'aac dqx' 'aacw dqzdjuu'
 'aae ftf'
 'aaecaxeebsexbhjbuqdhcrmimoeifekrobhbcsmzuvavynlrchyknoel rcygromjygpkju'
 'aaegmdbsch gehxokbnpixtaqyz' 'aaeh me' 'aaevvsghq sci' 'aafh ak'
 'aaftmjiprqbge rskdnbihgb' 'aafyb ytscetiy' 'aafyuzt pcepmy'
 'aagadrk petagi' 'aagcdm nac' 'aaghhq ub' 'aahfx mcsd' 'aahjjb uwlwzyq'
 'aahjoetoj pdk' 'aahvf ejpz' 'aai oefdcvrbwbc' 'aain kxpijxrzjxp'
 'aaj aop' 'aajf fq' 'aajrsis lq' 'aajv ib' 'aajwuaxf myj' 'aakb lwtcgp'
 'aakfwl wcr' 'aakmofyro rvz' 'aakpg rsfipd' 'aal ab' 'aal amfmjhu'
 'aal th' 'aal zxn' 'aalc gqappdl' 'aalfbz nya' 'aalxgfzfi srw'
 'aalyw wuut' 'aaosabcnpkgmz oaxrshsaqc' 

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- print the shape of the vetorized dataset

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidfvector = TfidfVectorizer(ngram_range=(2,3))

# Fit and transform the training data mn
X_train_tfidf = tfidfvector.fit_transform(train['preprocessed_text_final'])

# Transform the test data
X_test_tfidf = tfidfvector.transform(test['preprocessed_text_final'])

print(f"Shape of TF-IDF matrix for training data: {X_train_tfidf.shape}")
print(f"Shape of TF-IDF matrix for test data: {X_test_tfidf.shape}")

# Display some of the feature names
print("\nTop 100 TF-IDF features learned by TfidfVectorizer:")
tfidf_feature_names = tfidfvector.get_feature_names_out()
print(tfidf_feature_names[:100])

Shape of TF-IDF matrix for training data: (800, 202374)
Shape of TF-IDF matrix for test data: (200, 202374)

Top 100 TF-IDF features learned by TfidfVectorizer:
['aa ff' 'aa gr' 'aa gr wccvjdmh' 'aa hjqg' 'aa hjqg echaxugyw'
 'aa hotmail' 'aa hotmail com' 'aa kcc' 'aa kcc fbiwjn' 'aa kk'
 'aa kk sngosi' 'aa nh' 'aa nh rro' 'aa nxuaz' 'aa nxuaz lox'
 'aa plutjamnavtjkyd' 'aa plutjamnavtjkyd gz' 'aa ppr' 'aa ppr xfn'
 'aa yox' 'aa yox cs' 'aa yvzngm' 'aa yvzngm yoiojd' 'aaa fqews'
 'aaa fqews yjpfp' 'aabeiawaeaambiqaceqedeqh gamaweaahedeqa'
 'aabeiawaeaambiqaceqedeqh gamaweaahedeqa aocgeebt' 'aac dqx' 'aac dqx bh'
 'aacw dqzdjuu' 'aacw dqzdjuu va' 'aae ftf' 'aae ftf xkeetz'
 'aaecaxeebsexbhjbuqdhcrmimoeifekrobhbcsmzuvavynlrchyknoel rcygromjygpkju'
 'aaecaxeebsexbhjbuqdhcrmimoeifekrobhbcsmzuvavynlrchyknoel rcygromjygpkju nzg'
 'aaegmdbsch gehxokbnpixtaqyz' 'aaegmdbsch gehxokbnpixtaqyz uce' 'aaeh me'
 'aaeh me ant' 'aaevvsghq sci' 'aaevvsghq sci tk' 'aafh ak'
 'aafh ak aofxoeszwkyt' 'aaftm

## And the Train a Classifier?

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Multinomial Naive Bayes classifier
classifier = MultinomialNB()

# Train the classifier using Bag of Words (bigrams) features from the training data
# train['label'] is the target variable for the training data
classifier.fit(X_train_bigram, train['label'])

# Predict on the test data using Bag of Words (bigrams)
predictions = classifier.predict(X_test_bigram)

# Evaluate
print("Classifier: MultinomialNB (using Bag of Words - Bigrams)")
print(f"Accuracy: {accuracy_score(test['label'], predictions):.4f}")
print("\nClassification Report:\n", classification_report(test['label'], predictions))

Classifier: MultinomialNB (using Bag of Words - Bigrams)
Accuracy: 0.8850

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.84      0.90       125
           1       0.78      0.96      0.86        75

    accuracy                           0.89       200
   macro avg       0.88      0.90      0.88       200
weighted avg       0.90      0.89      0.89       200



### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier can not be changed!!! It must be the MultinimialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work with teams of two persons (recommended).

Import DATA

In [None]:
!kaggle competitions download -c dsub-fraudulentemails

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 4, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.12/dist-packages/kaggle/__init__.py", line 6, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.12/dist-packages/kaggle/api/kaggle_api_extended.py", line 434, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/


In [None]:
from google.colab import files

print("upload your kaggle.json file")
files.upload()

Please upload your kaggle.json file. You can generate it from your Kaggle account settings (Profile -> Account -> Create New API Token).


Saving kg_train1.csv to kg_train1 (1).csv




In [None]:
# Create the .kaggle directory if it doesn't exist
!mkdir -p ~/.kaggle

# Move the uploaded kaggle.json to the correct directory
!mv kaggle.json ~/.kaggle/

# Set read/write permissions for the owner only to secure the API key
!chmod 600 ~/.kaggle/kaggle.json

print("Kaggle API key setup complete!")

mv: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Kaggle API key setup complete!


In [None]:
# Now, retry downloading the dataset
!kaggle competitions download -c dsub-fraudulentemails

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 4, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.12/dist-packages/kaggle/__init__.py", line 6, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.12/dist-packages/kaggle/api/kaggle_api_extended.py", line 434, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/


Multinomial Naive Bayes classifier.



In [105]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

def evaluate_classifier(X_train, X_test, y_train, y_test):
    """
    Trains a Multinomial Naive Bayes classifier, makes predictions,
    and prints the accuracy and classification report.
    """
    # Initialize the Multinomial Naive Bayes classifier with default parameters
    classifier = MultinomialNB()

    # Train the classifier
    classifier.fit(X_train, y_train)

    # Make predictions on the test data
    predictions = classifier.predict(X_test)

    # Print the accuracy score
    print(f"Accuracy: {accuracy_score(y_test, predictions):.4f}")

    # Print the classification report
    print("\nClassification Report:\n", classification_report(y_test, predictions))

In [106]:
print("Evaluating with Bag of Words (bigrams) only:")
evaluate_classifier(X_train_bigram, X_test_bigram, train['label'], test['label'])

Evaluating with Bag of Words (bigrams) only:
Accuracy: 0.8850

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.84      0.90       125
           1       0.78      0.96      0.86        75

    accuracy                           0.89       200
   macro avg       0.88      0.90      0.88       200
weighted avg       0.90      0.89      0.89       200



Evaluate with TF-IDF

In [107]:
print("Evaluating with TF-IDF (bigrams and trigrams) only:")
evaluate_classifier(X_train_tfidf, X_test_tfidf, train['label'], test['label'])

Evaluating with TF-IDF (bigrams and trigrams) only:
Accuracy: 0.8850

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.85      0.90       125
           1       0.79      0.95      0.86        75

    accuracy                           0.89       200
   macro avg       0.88      0.90      0.88       200
weighted avg       0.90      0.89      0.89       200



In [108]:
import scipy.sparse

# Prepare extra features for training data
X_train_extra = train[['money_mark', 'suspicious_words', 'text_len']].values

# Combine Bag of Words (bigrams) with extra features for training data
X_train_bigram_combined = scipy.sparse.hstack((X_train_bigram, X_train_extra))

# Prepare extra features for test data
X_test_extra = test[['money_mark', 'suspicious_words', 'text_len']].values

# Combine Bag of Words (bigrams) with extra features for test data
X_test_bigram_combined = scipy.sparse.hstack((X_test_bigram, X_test_extra))

print("Evaluating with Bag of Words (bigrams) + extra flags:")
evaluate_classifier(X_train_bigram_combined, X_test_bigram_combined, train['label'], test['label'])

Evaluating with Bag of Words (bigrams) + extra flags:
Accuracy: 0.4050

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.05      0.09       125
           1       0.39      1.00      0.56        75

    accuracy                           0.41       200
   macro avg       0.69      0.52      0.32       200
weighted avg       0.77      0.41      0.27       200



combine the TF-IDF.



In [109]:
import scipy.sparse

# Prepare extra features for training data
X_train_extra = train[['money_mark', 'suspicious_words', 'text_len']].values

# Combine TF-IDF (bigrams and trigrams) with extra features for training data
X_train_tfidf_combined = scipy.sparse.hstack((X_train_tfidf, X_train_extra))

# Prepare extra features for test data
X_test_extra = test[['money_mark', 'suspicious_words', 'text_len']].values

# Combine TF-IDF (bigrams and trigrams) with extra features for test data
X_test_tfidf_combined = scipy.sparse.hstack((X_test_tfidf, X_test_extra))

print("Evaluating with TF-IDF (bigrams and trigrams) + extra flags:")
evaluate_classifier(X_train_tfidf_combined, X_test_tfidf_combined, train['label'], test['label'])

Evaluating with TF-IDF (bigrams and trigrams) + extra flags:
Accuracy: 0.3850

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.02      0.03       125
           1       0.38      1.00      0.55        75

    accuracy                           0.39       200
   macro avg       0.69      0.51      0.29       200
weighted avg       0.77      0.39      0.23       200



### Summary of Classifier Performance

1.  **Bag of Words (bigrams) only:**
    *   Accuracy: 0.8850
    *   Weighted Avg F1-score: 0.89

2.  **TF-IDF (bigrams and trigrams) only:**
    *   Accuracy: 0.8850
    *   Weighted Avg F1-score: 0.89

3.  **Bag of Words (bigrams) + extra flags ('money_mark', 'suspicious_words', 'text_len'):**
    *   Accuracy: 0.4050
    *   Weighted Avg F1-score: 0.27

4.  **TF-IDF (bigrams and trigrams) + extra flags ('money_mark', 'suspicious_words', 'text_len'):**
    *   Accuracy: 0.3850
    *   Weighted Avg F1-score: 0.23

**Conclusion:**

Based on these evaluations, both **Bag of Words (bigrams) only** and **TF-IDF (bigrams and trigrams) only** yielded the best performance. The addition of the extra features ('money_mark', 'suspicious_words', 'text_len') significantly degraded the model's performance in both Bag of Words and TF-IDF scenarios. Therefore, for this specific dataset and classifier, using only text-based features without the engineered 'extra flags' results in the most effective SPAM/HAM classification.