In [1]:
def setup_nltk(base_path=r"C:\nltk_data"):
    """
    Ensures NLTK is fully set up with all required resources.
    Works across environments and Python versions (including 3.13).
    """
    import os, nltk
    
    # 1️⃣ Create directory if missing
    os.makedirs(base_path, exist_ok=True)
    
    # 2️⃣ Set data path
    nltk.data.path = [base_path]
    
    # 3️⃣ Required resources
    required_packages = ["punkt", "punkt_tab", "stopwords"]
    
    # 4️⃣ Download missing ones
    for pkg in required_packages:
        try:
            nltk.data.find(pkg)
        except LookupError:
            nltk.download(pkg, download_dir=base_path)
    
    print("✅ NLTK setup complete. Path:", nltk.data.path)


In [None]:
print("Hello, World!")

In [None]:
#!pip install numpy
# !pip install pandas
# !pip install matplotlib

In [None]:
# !pip install wordcloud

In [2]:
# Importing necessary libraries
import numpy as np        # For numerical operations
import pandas as pd       # For data manipulation and analysis
import matplotlib.pyplot as plt  # For data visualization
%matplotlib inline

# Importing WordCloud for text visualization
from wordcloud import WordCloud

# Importing NLTK for natural language processing
import nltk
from nltk.corpus import stopwords    # For stopwords


# Downloading NLTK data
# nltk.download('stopwords')   # Downloading stopwords data
# nltk.download('punkt')       # Downloading tokenizer data

In [3]:
# Read the CSV file
df = pd.read_csv('spam.csv')

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace = True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Rename the columns name
df.rename(columns = {'v1': 'target', 'v2': 'text'}, inplace = True)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# !pip install scikit-learn


In [6]:
#Data Processing


from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
#check duplicate values
df.duplicated().sum()

np.int64(403)

In [8]:
len(df)

5572

In [9]:
#remove Duplicate
df = df.drop_duplicates(keep = 'first')
len(df)

5169

Feature Engg

In [10]:
# import nltk

# # Define download path explicitly (this avoids hidden folder issues)
# nltk.download('punkt', download_dir='nltk_data')
# nltk.download('stopwords', download_dir='nltk_data')

# # Add this path manually so NLTK can find it
# nltk.data.path.append('nltk_data')

# print("✅ punkt and stopwords downloaded successfully and path added")


In [None]:
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer

# ps = PorterStemmer()

# text = "Go until Jurong Point, crazy!! Available only in Bugis n Great World la e buffet..."
# tokens = nltk.word_tokenize(text)

# print(tokens[:10])


In [None]:
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')


In [None]:
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer
# import string


In [11]:
# Importing the Porter Stemmer for text stemming
from nltk.stem.porter import PorterStemmer

# Importing the string module for handling special characters
import string

# Creating an instance of the Porter Stemmer
ps = PorterStemmer()

In [15]:

# Lowercase transformation and text preprocessing function
def transform_text(text):
    # Transform the text to lowercase
    text = text.lower()
    
    # Tokenization using NLTK
    text = nltk.word_tokenize(text)
    
    # Removing special characters
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
            
    # Removing stop words and punctuation
    text = y[:]
    y.clear()
    
    # Loop through the tokens and remove stopwords and punctuation
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
        
    # Stemming using Porter Stemmer
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
    
    # Join the processed tokens back into a single string
    return " ".join(y)

In [16]:
sample_text = "Go until Jurong Point, crazy!! Available only in Bugis n Great World la e buffet... Cine there got Amore wat..."
print(transform_text(sample_text))


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Admin/nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_3.13.2544.0_x64__qbz5n2kfra8p0\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_3.13.2544.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_3.13.2544.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
    - 'C:\\Users\\Admin\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [21]:
# import shutil, os

# nltk_path = r"E:\\nltk_data" 
# if os.path.exists(nltk_path):
#     shutil.rmtree(nltk_path)
# os.makedirs(nltk_path, exist_ok=True)


In [22]:
import nltk

nltk.download('punkt', download_dir=r"C:\nltk_data")
nltk.download('stopwords', download_dir=r"C:\nltk_data")


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [23]:
import os
print(os.listdir(r"C:\nltk_data"))


['corpora', 'tokenizers']


In [24]:
print(os.listdir(r"C:\nltk_data\corpora"))
print(os.listdir(r"C:\nltk_data\tokenizers"))


['stopwords', 'stopwords.zip']
['punkt', 'punkt.zip']


In [25]:
import os

base = r"C:\nltk_data"
corpora_zip = os.path.join(base, "corpora", "stopwords.zip")
tokenizers_zip = os.path.join(base, "tokenizers", "punkt.zip")

# Delete if they exist
for path in [corpora_zip, tokenizers_zip]:
    if os.path.exists(path):
        os.remove(path)
        print("Deleted:", path)
    else:
        print("Not found:", path)


Deleted: C:\nltk_data\corpora\stopwords.zip
Deleted: C:\nltk_data\tokenizers\punkt.zip


In [26]:
import os

print("C:\\nltk_data contents:", os.listdir(r"C:\nltk_data"))
print("C:\\nltk_data\\corpora contents:", os.listdir(r"C:\nltk_data\\corpora"))
print("C:\\nltk_data\\tokenizers contents:", os.listdir(r"C:\nltk_data\\tokenizers"))


C:\nltk_data contents: ['corpora', 'tokenizers']
C:\nltk_data\corpora contents: ['stopwords']
C:\nltk_data\tokenizers contents: ['punkt']


In [28]:
import nltk
nltk.data.path = [r"C:\nltk_data"]

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = "Finally our NLTK environment works perfectly without any LookupError!"
tokens = word_tokenize(text)
filtered = [w for w in tokens if w.lower() not in stopwords.words('english')]

print("Original tokens:", tokens)
print("Filtered tokens:", filtered)


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\nltk_data'
**********************************************************************


In [29]:
import nltk
nltk.download('punkt_tab', download_dir=r"C:\nltk_data")


[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [30]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = "Finally our NLTK environment works perfectly without any LookupError!"
tokens = word_tokenize(text)
filtered = [w for w in tokens if w.lower() not in stopwords.words('english')]

print("Original tokens:", tokens)
print("Filtered tokens:", filtered)


Original tokens: ['Finally', 'our', 'NLTK', 'environment', 'works', 'perfectly', 'without', 'any', 'LookupError', '!']
Filtered tokens: ['Finally', 'NLTK', 'environment', 'works', 'perfectly', 'without', 'LookupError', '!']
