In [17]:
import re
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [28]:
!pip install spacy
!python -m spacy download en_core_web_sm
import spacy

# Load English tokenizer from spaCy
nlp = spacy.load('en_core_web_sm')

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [19]:
# Ensure NLTK stopwords and WordNetLemmatizer are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [20]:
data = ['I love programming!', 'Python is amazing...', 'I enjoy solving problems.','i hate c#']
labels = ['positive', 'positive', 'positive', 'negative']

In [21]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

In [22]:
# Step 1: Text Cleaning
def clean_text(text):
    # Remove special characters, digits, and punctuation
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Lowercase the text
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data_cleaned = [clean_text(text) for text in data]

In [23]:
# Step 2: Lemmatization using WordNetLemmatizer
def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmatized_words)

data_lemmatized = [lemmatize_text(text) for text in data_cleaned]

In [24]:
# Step 3: Stop Words Removal
stop_words = set(stopwords.words('english'))
def remove_stop_words(text):
    words = text.split()
    words_filtered = [word for word in words if word not in stop_words]
    return " ".join(words_filtered)

data_no_stopwords = [remove_stop_words(text) for text in data_lemmatized]

In [25]:
# Step 4: Label Encoding
label_encoder = LabelEncoder()


labels_encoded = label_encoder.fit_transform(labels)

In [26]:
# Step 5: TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data_no_stopwords)

In [27]:
# Output the results
print("Cleaned and Lemmatized Text (no stopwords):")
print(data_no_stopwords)
print()
print("Encoded Labels:")
print(labels_encoded)
print()
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())

Cleaned and Lemmatized Text (no stopwords):
['love programming', 'python amazing', 'enjoy solving problem', 'hate c']

Encoded Labels:
[1 1 1 0]

TF-IDF Matrix:
[[0.         0.         0.         0.70710678 0.         0.70710678
  0.         0.        ]
 [0.70710678 0.         0.         0.         0.         0.
  0.70710678 0.        ]
 [0.         0.57735027 0.         0.         0.57735027 0.
  0.         0.57735027]
 [0.         0.         1.         0.         0.         0.
  0.         0.        ]]
