In [40]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import re
import nltk
import string
import joblib

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [41]:
# Step 2: Download NLTK data
# Fetch the NLTK resources used further down in this notebook: the tokenizer
# models behind nltk.word_tokenize, the WordNet data behind WordNetLemmatizer,
# and the stop-word lists read through stopwords.words('english').
# NOTE(review): newer NLTK releases reportedly look the tokenizer up under
# 'punkt_tab' -- confirm whether that resource has to be downloaded as well.
nltk.download('punkt')
nltk.download('wordnet')
# Kept as the last statement: its return value is what the cell displays.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Samruddhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Samruddhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Samruddhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
# Step 3: Create sample data
# Four toy sentences stored column-wise: the raw sentence under 'text' and its
# class under 'label' (three 'animal' rows followed by one 'human' row).
data = {
    'text': [
        "Cats are running in the garden!",
        "The dog barked loudly at night.",
        "Birds are flying over the hills.",
        "She is reading a book on the balcony.",
    ],
    'label': ['animal'] * 3 + ['human'],
}
df = pd.DataFrame.from_dict(data)
print("Step Original Data:\n", df, "\n")

Step Original Data:
                                     text   label
0        Cats are running in the garden!  animal
1        The dog barked loudly at night.  animal
2       Birds are flying over the hills.  animal
3  She is reading a book on the balcony.   human 



In [43]:
# Step 4: Clean text
# Helpers are built once here instead of on every call to clean_text.
_DIGIT_RUN = re.compile(r'\d+')
_WHITESPACE_RUN = re.compile(r'\s+')
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one raw sentence for the later tokenization step.

    The steps run in this order: lowercase, delete runs of digits, delete
    every character listed in ``string.punctuation``, collapse any run of
    whitespace into a single space, and trim both ends.

    Punctuation is deleted, not replaced by a space, so "well-known"
    becomes "wellknown".

    Args:
        text: The sentence to clean. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as missing text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = text.lower()
    text = _DIGIT_RUN.sub('', text)
    text = text.translate(_STRIP_PUNCTUATION)
    return _WHITESPACE_RUN.sub(' ', text).strip()

# Clean every raw sentence and keep the result next to the original column.
df['clean_text'] = df['text'].map(clean_text)
print("Step Cleaned Text:\n", df.loc[:, ['text', 'clean_text']], "\n")


Step Cleaned Text:
                                     text                            clean_text
0        Cats are running in the garden!        cats are running in the garden
1        The dog barked loudly at night.        the dog barked loudly at night
2       Birds are flying over the hills.       birds are flying over the hills
3  She is reading a book on the balcony.  she is reading a book on the balcony 



In [44]:
# Save cleaned data
# Only the raw and cleaned columns are written; the row index is left out.
df.to_csv('step4_cleaned_text.csv', columns=['text', 'clean_text'], index=False)


In [45]:
# Step 5: Lemmatization & Stop word removal
# Shared helpers, created once so every call to preprocess_text reuses them.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Tokenize ``text``, drop English stop words and lemmatize what is left.

    Tokens are compared to ``stop_words`` exactly as produced by the
    tokenizer, so the input is expected to be lowercased already (the
    cleaning step does that). The lemmatizer is called without a
    part-of-speech argument; the recorded output of this notebook shows that
    plural nouns are reduced ("cats" -> "cat") while verb forms such as
    "running" are left unchanged.

    Args:
        text: A cleaned sentence.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Run the stop-word removal / lemmatization step over every cleaned sentence.
df['processed_text'] = df['clean_text'].map(preprocess_text)
print("Step Processed Text (lemmatized & stopwords removed):\n", df.loc[:, ['clean_text', 'processed_text']], "\n")


Step Processed Text (lemmatized & stopwords removed):
                              clean_text           processed_text
0        cats are running in the garden       cat running garden
1        the dog barked loudly at night  dog barked loudly night
2       birds are flying over the hills         bird flying hill
3  she is reading a book on the balcony     reading book balcony 



In [46]:
# Save processed data
# Writes the raw sentence next to its processed form; the row index is left out.
df.to_csv('step5_processed_text.csv', columns=['text', 'processed_text'], index=False)


In [47]:
# Step 6: Label Encoding
# Fit the encoder on the label column and store the integer codes alongside it.
# In the recorded run 'animal' maps to 0 and 'human' maps to 1.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df.loc[:, 'label'])
print("Step Encoded Labels:\n", df.loc[:, ['label', 'label_encoded']], "\n")


Step Encoded Labels:
     label  label_encoded
0  animal              0
1  animal              0
2  animal              0
3   human              1 



In [48]:
# Save label encoder and encoded labels
# The fitted encoder is written first; the CSV write stays last so the cell
# ends on a statement that displays nothing.
joblib.dump(label_encoder, 'step6_label_encoder.pkl')
df.to_csv('step6_label_encoded.csv', columns=['label', 'label_encoded'], index=False)

In [49]:
# Step 7: TF-IDF Vectorization
# Fit the vectorizer on the processed sentences, then expose the result as a
# dense DataFrame with one column per learned feature name.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['processed_text'])
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)
print("Step TF-IDF Matrix (Top 5 rows):\n", tfidf_df.head(5), "\n")

Step TF-IDF Matrix (Top 5 rows):
    balcony  barked     bird     book      cat  dog   flying   garden     hill  \
0  0.00000     0.0  0.00000  0.00000  0.57735  0.0  0.00000  0.57735  0.00000   
1  0.00000     0.5  0.00000  0.00000  0.00000  0.5  0.00000  0.00000  0.00000   
2  0.00000     0.0  0.57735  0.00000  0.00000  0.0  0.57735  0.00000  0.57735   
3  0.57735     0.0  0.00000  0.57735  0.00000  0.0  0.00000  0.00000  0.00000   

   loudly  night  reading  running  
0     0.0    0.0  0.00000  0.57735  
1     0.5    0.5  0.00000  0.00000  
2     0.0    0.0  0.00000  0.00000  
3     0.0    0.0  0.57735  0.00000   



In [50]:
# Save TF-IDF matrix and vectorizer
# The dense matrix goes to CSV without its row index; the fitted vectorizer is
# dumped last, so the list returned by joblib.dump is what the cell displays.
tfidf_df.to_csv(path_or_buf='step7_tfidf_matrix.csv', index=False)
joblib.dump(tfidf, filename='step7_tfidf_vectorizer.pkl')


['step7_tfidf_vectorizer.pkl']

In [51]:
# Final saved output summary
# Each entry is printed on its own line, in the order listed here.
summary_lines = (
    " All steps completed and outputs saved successfully!\n",
    " Files Saved:",
    "- step4_cleaned_text.csv",
    "- step5_processed_text.csv",
    "- step6_label_encoded.csv",
    "- step6_label_encoder.pkl",
    "- step7_tfidf_matrix.csv",
    "- step7_tfidf_vectorizer.pkl",
)
for summary_line in summary_lines:
    print(summary_line)

 All steps completed and outputs saved successfully!

 Files Saved:
- step4_cleaned_text.csv
- step5_processed_text.csv
- step6_label_encoded.csv
- step6_label_encoder.pkl
- step7_tfidf_matrix.csv
- step7_tfidf_vectorizer.pkl
