In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


In [None]:
# Download necessary NLTK data
# 'stopwords' backs the stopwords.words('english') lookup used later in this notebook.
nltk.download('stopwords')
# NOTE(review): 'wordnet' and 'omw-1.4' are presumably the corpora WordNetLemmatizer
# relies on — confirm against the NLTK documentation.
nltk.download('wordnet')
# This is the last expression in the cell, so its return value is what the cell displays.
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Toy corpus: three short sentences, each paired with a sentiment label
data = {
    'text': [
        'I love programming in Python!',
        'Python is great for data science.',
        'I hate bugs in my code.',
    ],
    'label': ['positive', 'positive', 'negative'],
}

# Wrap the dict in a DataFrame (one row per sentence) and preview it
df = pd.DataFrame(data=data)
df.head()


Unnamed: 0,text,label
0,I love programming in Python!,positive
1,Python is great for data science.,positive
2,I hate bugs in my code.,negative


In [None]:
# Text cleaning function
def clean_text(text):
    """Strip everything except ASCII letters and whitespace, then tidy spacing.

    Args:
        text: The raw string to clean.

    Returns:
        The input with every character outside ``a-z`` / ``A-Z`` / whitespace
        removed, each run of whitespace collapsed to a single space, and
        leading/trailing whitespace stripped.
    """
    # Remove special characters and digits.
    # Gotcha: the 4th positional argument of re.sub is `count`, not `flags`.
    # Passing re.I there (its integer value is 2) caps the substitution at two
    # characters. No flag is needed anyway: the class already lists both cases.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse any run of whitespace (spaces, tabs, newlines) into one space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Example of clean_text function: the trailing "!" is expected to be stripped from the printed result
print(clean_text("I love programming in Python!"))


I love programming in Python


In [None]:
# NOTE(review): nltk and stopwords are already imported in the notebook's first cell;
# the two imports below repeat them.
import nltk
from nltk.corpus import stopwords

# Download stopwords from NLTK
# NOTE(review): the same download is already requested in the earlier
# "Download necessary NLTK data" cell.
nltk.download('stopwords')

# Define stop_words set
# Built as a set from stopwords.words('english'); the identical assignment is
# repeated in the following cell.
stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# NOTE(review): all four imports below repeat imports from the notebook's first cell.
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Download stopwords from NLTK
# (also requests 'wordnet' and 'omw-1.4'; all three repeat the earlier download cell)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize lemmatizer and stopwords
# Both names are module-level globals that preprocess_text reads when it runs.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Stop-word filtering + lemmatization
def preprocess_text(text):
    """Clean a raw string, drop stop words and lemmatize what is left.

    The text is passed through ``clean_text``, split on whitespace and
    lower-cased. Tokens found in the module-level ``stop_words`` collection
    are discarded; the remaining ones are lemmatized with the module-level
    ``lemmatizer``.

    Args:
        text: The raw string to preprocess.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lowered_tokens = (token.lower() for token in clean_text(text).split())
    lemmas = []
    for token in lowered_tokens:
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

# Example of preprocess_text function: "I" and "in" are expected to be dropped and the rest lower-cased
print(preprocess_text("I love programming in Python!"))

love programming python


In [None]:
# Apply preprocessing to the text column
# Adds a 'cleaned_text' column to df in place by running preprocess_text on every
# value of 'text'; the 'text' column itself is not modified.
df['cleaned_text'] = df['text'].apply(preprocess_text)
df.head()


Unnamed: 0,text,label,cleaned_text
0,I love programming in Python!,positive,love programming python
1,Python is great for data science.,positive,python great data science
2,I hate bugs in my code.,negative,hate bug code


In [None]:
# Label Encoding
# Fits the encoder on the 'label' column and stores the resulting integer codes in a
# new 'encoded_label' column; in the output below 'negative' is 0 and 'positive' is 1.
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])
df.head()


Unnamed: 0,text,label,cleaned_text,encoded_label
0,I love programming in Python!,positive,love programming python,1
1,Python is great for data science.,positive,python great data science,1
2,I hate bugs in my code.,negative,hate bug code,0


In [None]:
# Fit a TF-IDF model on the preprocessed sentences
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Densify the matrix into a DataFrame (one column per vocabulary term) for inspection
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
tfidf_df


Unnamed: 0,bug,code,data,great,hate,love,programming,python,science
0,0.0,0.0,0.0,0.0,0.0,0.622766,0.622766,0.47363,0.0
1,0.0,0.0,0.528635,0.528635,0.0,0.0,0.0,0.40204,0.528635
2,0.57735,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0,0.0


In [None]:
# Plain-text dump of the processed frame followed by its TF-IDF features
print(
    "Cleaned DataFrame:\n",
    df[['text', 'cleaned_text', 'label', 'encoded_label']],
    "\n",
)
print(
    "TF-IDF Features:\n",
    tfidf_df,
)


Cleaned DataFrame:
                                 text               cleaned_text     label  \
0      I love programming in Python!    love programming python  positive   
1  Python is great for data science.  python great data science  positive   
2            I hate bugs in my code.              hate bug code  negative   

   encoded_label  
0              1  
1              1  
2              0   

TF-IDF Features:
        bug     code      data     great     hate      love  programming  \
0  0.00000  0.00000  0.000000  0.000000  0.00000  0.622766     0.622766   
1  0.00000  0.00000  0.528635  0.528635  0.00000  0.000000     0.000000   
2  0.57735  0.57735  0.000000  0.000000  0.57735  0.000000     0.000000   

    python   science  
0  0.47363  0.000000  
1  0.40204  0.528635  
2  0.00000  0.000000  
