In [2]:
%%writefile iot_logs.txt
2024-12-14 09:30:21 INFO: Thermostat set to 22°C
2024-12-14 09:32:10 WARN: Temperature sensor reading delayed
Please turn on the living room lights.


Writing iot_logs.txt


In [4]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


# Fetch the NLTK resources used below: the Punkt tokenizer data
# ('punkt' / 'punkt_tab') for word_tokenize, the stopword lists, and the
# WordNet data ('wordnet' / 'omw-1.4') for WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize Lemmatizer and define stopwords
lemmatizer = WordNetLemmatizer()
# A set makes the per-token membership test in the stopword filter cheap.
stop_words = set(stopwords.words('english'))

# Read the sample logs.
# The file contains a non-ASCII character ('°') and is saved as UTF-8 by the
# %%writefile cell, so decode it explicitly instead of relying on the
# platform's default encoding (which is not UTF-8 on every system and would
# otherwise corrupt the degree sign before it can be normalized).
with open('iot_logs.txt', 'r', encoding='utf-8') as f:
    data = f.read()

print("Original Data:")
print(data)


# Step 1: Normalization
# Lowercase the whole log so later matching is case-insensitive.
lowered = data.lower()

# Step 2: Removing or handling special characters
# Numbers and words are kept; only ASCII punctuation is stripped.
# The degree sign is not part of string.punctuation, so the temperature unit
# is spelled out first ('22°c' -> '22 degrees').
with_units = lowered.replace('°c', ' degrees')

# Punctuation characters are deleted outright (not replaced by a space), so
# '2024-12-14' collapses to '20241214' and '09:30:21' to '093021'.
text = re.compile(f'[{re.escape(string.punctuation)}]').sub('', with_units)

print("\nAfter Normalization and Removing Punctuation:", text, sep="\n")

# Step 3: Tokenization
# Split the normalized text into word-level tokens.
tokens = word_tokenize(text)
print("\nTokenized:", tokens, sep="\n")

# Step 4: Stopwords Removal
# Drop common English function words (e.g. 'to', 'on', 'the').
filtered_tokens = [token for token in tokens if token not in stop_words]
print("\nAfter Stopwords Removal:", filtered_tokens, sep="\n")

# Step 5: Lemmatization
# lemmatize() is called without a part-of-speech argument, so every token is
# treated as a noun: 'degrees' becomes 'degree', while verb forms such as
# 'reading' and 'delayed' are left unchanged.
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
print("\nAfter Lemmatization:", lemmatized_tokens, sep="\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Original Data:
2024-12-14 09:30:21 INFO: Thermostat set to 22°C
2024-12-14 09:32:10 WARN: Temperature sensor reading delayed
Please turn on the living room lights.


After Normalization and Removing Punctuation:
20241214 093021 info thermostat set to 22 degrees
20241214 093210 warn temperature sensor reading delayed
please turn on the living room lights


Tokenized:
['20241214', '093021', 'info', 'thermostat', 'set', 'to', '22', 'degrees', '20241214', '093210', 'warn', 'temperature', 'sensor', 'reading', 'delayed', 'please', 'turn', 'on', 'the', 'living', 'room', 'lights']

After Stopwords Removal:
['20241214', '093021', 'info', 'thermostat', 'set', '22', 'degrees', '20241214', '093210', 'warn', 'temperature', 'sensor', 'reading', 'delayed', 'please', 'turn', 'living', 'room', 'lights']

After Lemmatization:
['20241214', '093021', 'info', 'thermostat', 'set', '22', 'degree', '20241214', '093210', 'warn', 'temperature', 'sensor', 'reading', 'delayed', 'please', 'turn', 'living', 'room',