**Perform text cleaning, lemmatization (any method), stop-word removal (any method),
and label encoding. Create representations using TF-IDF. Save the outputs.**

In [None]:
# Install necessary libraries
!pip install scikit-learn gensim nltk



In [None]:
# Import required libraries
import nltk
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [None]:
# Download the NLTK resources used by this notebook:
#   - 'stopwords': word list read by stopwords.words('english') in Step 2
#   - 'wordnet':   lexical data used by WordNetLemmatizer in Step 3
#   - 'punkt':     tokenizer models. NOTE(review): no cell visible here calls an
#                  NLTK tokenizer (words are split with str.split), so confirm
#                  whether this download is still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Sample data: each record pairs a sentence with its sentiment label, so the
# two parallel lists built below always stay aligned.
_labelled_samples = (
    ("I love programming in Python!", "positive"),
    ("Python is a great programming language.", "positive"),
    ("I enjoy coding in Python every day.", "positive"),
    ("Programming is fun and exciting.", "neutral"),
    ("I prefer Python over other programming languages.", "positive"),
)
documents = [sentence for sentence, _sentiment in _labelled_samples]
labels = [sentiment for _sentence, sentiment in _labelled_samples]


In [None]:

# Step 1: Text Cleaning (remove punctuation and convert to lowercase)
def clean_text(text):
    """
    Normalise a piece of text for the later pipeline steps.

    The text is lowercased, then every character listed in
    ``string.punctuation`` (the ASCII punctuation set) is dropped. All other
    characters, including whitespace, are kept unchanged.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_table)

# Run every raw document through `clean_text`
documents_cleaned = list(map(clean_text, documents))

# Show the result of Step 1
print("Step 1: Cleaned Text (Lowercased and Punctuation Removed)")
print(documents_cleaned)


Step 1: Cleaned Text (Lowercased and Punctuation Removed)
['i love programming in python', 'python is a great programming language', 'i enjoy coding in python every day', 'programming is fun and exciting', 'i prefer python over other programming languages']


In [None]:
# Step 2: Remove Stop Words
# A set gives constant-time membership checks for the filter below.
# NOTE(review): NLTK's English list includes contractions written with an
# apostrophe (e.g. "don't"). Text reaching this step has already had its
# punctuation stripped in Step 1, so such words would arrive as "dont" and
# not match. Confirm whether that matters for the real data.
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    """
    Drop every whitespace-separated word of `text` that is in `stop_words`.

    The surviving words are returned joined by single spaces, in their
    original order.
    """
    tokens = text.split()
    kept = (token for token in tokens if token not in stop_words)
    return ' '.join(kept)

documents_no_stopwords = list(map(remove_stop_words, documents_cleaned))

# Show the result of Step 2
print("\nStep 2: Text After Stop Word Removal")
print(documents_no_stopwords)



Step 2: Text After Stop Word Removal
['love programming python', 'python great programming language', 'enjoy coding python every day', 'programming fun exciting', 'prefer python programming languages']


In [None]:
# Step 3: Lemmatization (use WordNetLemmatizer)
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text, pos='n'):
    """
    Lemmatize every whitespace-separated word in the text.

    Parameters
    ----------
    text : str
        Text whose words are separated by whitespace.
    pos : str, default 'n'
        WordNet part-of-speech tag passed to ``lemmatizer.lemmatize`` for
        every word ('n' noun, 'v' verb, 'a' adjective, 'r' adverb). The
        default equals the lemmatizer's own default, so existing calls behave
        exactly as before. With it every word is treated as a noun, which is
        why verb forms such as "programming" come back unchanged. Pass
        ``pos='v'`` to reduce verb forms instead.

    Returns
    -------
    str
        The lemmatized words joined by single spaces, in their original order.
    """
    return ' '.join(lemmatizer.lemmatize(word, pos=pos) for word in text.split())

documents_lemmatized = list(map(lemmatize_text, documents_no_stopwords))

# Show the result of Step 3
print("\nStep 3: Lemmatized Text")
print(documents_lemmatized)



Step 3: Lemmatized Text
['love programming python', 'python great programming language', 'enjoy coding python every day', 'programming fun exciting', 'prefer python programming language']


In [None]:
# Step 4: Label Encoding (map each string label to an integer class id)
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Output for Step 4: each original label next to its encoded value
print("\nStep 4: Encoded Labels")
encoding_table = pd.DataFrame(
    {
        'Original Labels': labels,
        'Encoded Labels': labels_encoded,
    }
)
print(encoding_table)



Step 4: Encoded Labels
  Original Labels  Encoded Labels
0        positive               1
1        positive               1
2        positive               1
3         neutral               0
4        positive               1


In [None]:
# Step 5: TF-IDF Representation
# Fit the vocabulary on the lemmatized documents and vectorize them in one go.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents_lemmatized)

# Convert the matrix to a dense array and label each column with its term
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Output for Step 5: one row per document, one column per vocabulary term
print("\nStep 5: TF-IDF Matrix")
print(df_tfidf)



Step 5: TF-IDF Matrix
    coding      day    enjoy    every  exciting     fun     great  language  \
0  0.00000  0.00000  0.00000  0.00000    0.0000  0.0000  0.000000  0.000000   
1  0.00000  0.00000  0.00000  0.00000    0.0000  0.0000  0.661438  0.533644   
2  0.48127  0.48127  0.48127  0.48127    0.0000  0.0000  0.000000  0.000000   
3  0.00000  0.00000  0.00000  0.00000    0.6569  0.6569  0.000000  0.000000   
4  0.00000  0.00000  0.00000  0.00000    0.0000  0.0000  0.000000  0.533644   

      love    prefer  programming    python  
0  0.78211  0.000000     0.440627  0.440627  
1  0.00000  0.000000     0.372642  0.372642  
2  0.00000  0.000000     0.000000  0.271139  
3  0.00000  0.000000     0.370086  0.000000  
4  0.00000  0.661438     0.372642  0.372642  


In [None]:
# Save outputs to files

# Save each original document next to its fully preprocessed form.
# NOTE: the 'Cleaned' column holds the lemmatized text (after cleaning,
# stop-word removal and lemmatization), not the intermediate Step 1 output.
df_cleaned = pd.DataFrame({'Original': documents, 'Cleaned': documents_lemmatized})
df_cleaned.to_csv('cleaned_text.csv', index=False)

# Save the TF-IDF matrix as a CSV (one row per document, one column per term)
df_tfidf.to_csv('tfidf_matrix.csv', index=False)

# Save label-encoded labels to CSV
df_labels = pd.DataFrame({'Original Labels': labels, 'Encoded Labels': labels_encoded})
df_labels.to_csv('encoded_labels.csv', index=False)

# Optionally, download the files in Google Colab.
# The import is guarded so that this cell still completes outside Colab, where
# the `google.colab` package is not installed. The CSV files above have
# already been written to the working directory at this point.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download.")
else:
    for csv_name in ('cleaned_text.csv', 'tfidf_matrix.csv', 'encoded_labels.csv'):
        files.download(csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>