<a href="https://colab.research.google.com/github/SanketDevmunde/NLP_ASSIGNMENT/blob/main/NLP_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install nltk scikit-learn

import nltk
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer



In [None]:
# Download necessary resources.
# 'stopwords' feeds the stop-word filter; 'wordnet' and 'omw-1.4' presumably
# back the WordNetLemmatizer used later in this notebook.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    # nltk.download() signals failure by returning False instead of raising,
    # so check the result here rather than failing in a later cell.
    if not nltk.download(resource):
        raise RuntimeError(f"Failed to download NLTK resource: {resource}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Sample dataset: each entry pairs one sentence with its category label.
samples = [
    ("I love programming in Python!", 'Positive'),
    ("Natural Language Processing is amazing.", 'Positive'),
    ("Machine learning models require data preprocessing.", 'Neutral'),
]

# Column-oriented view of the samples, in the layout pd.DataFrame expects.
data = {
    'Text': [sentence for sentence, label in samples],
    'Category': [label for sentence, label in samples],
}
df = pd.DataFrame(data)

In [None]:
# Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Punctuation attached to either end of a token (as in "models." or
    "(words)") is set aside before the lookup and re-attached afterwards, so
    it no longer prevents the word from being lemmatized. Tokens made up
    only of punctuation are passed through untouched.

    The lemmatizer is called with the word alone, so its default
    part-of-speech applies. Letter case is left as-is.

    Args:
        text: The string to process.

    Returns:
        The processed tokens joined by single spaces.
    """
    def _lemmatize_token(token):
        # Split into leading punctuation, the word itself, trailing punctuation.
        leading, core, trailing = re.fullmatch(r'(\W*)(.*?)(\W*)', token).groups()
        if not core:
            return token
        return leading + lemmatizer.lemmatize(core) + trailing

    return ' '.join(_lemmatize_token(token) for token in text.split())

# Apply lemmatization to the 'Text' column and store the result in 'Lemmatized_Text'
df['Lemmatized_Text'] = df['Text'].apply(lemmatize_text)

In [None]:
# Remove Stop Words
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop English stop words from ``text``.

    Each whitespace-separated token is compared against ``stop_words`` after
    lower-casing it and trimming punctuation from both ends, so capitalised
    stop words ("I", "The") and ones followed by punctuation ("is.") are
    removed as well. Tokens that are kept are returned unmodified.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept = []
    for word in text.split():
        # Normalise only for the lookup; the stop-word list is lower-case.
        key = re.sub(r'^\W+|\W+$', '', word).lower()
        if key not in stop_words:
            kept.append(word)
    return ' '.join(kept)

# NOTE(review): this filters the already-lemmatized text; the lemmatizer may
# alter some stop words before they reach this filter (WordNet's noun
# lemmatizer is reported to turn e.g. "was" into "wa") - confirm whether
# stop words should be removed from the raw text instead.
df['Processed_Text'] = df['Lemmatized_Text'].apply(remove_stopwords)


In [None]:
# Label Encoding
# Fit the encoder on the category strings and store the resulting integer
# codes next to them in a new 'Category_Label' column.
label_encoder = LabelEncoder()
category_codes = label_encoder.fit_transform(df['Category'])
df['Category_Label'] = category_codes


In [None]:
# TF-IDF Representation
# Fit the vectorizer on the processed text and transform it in the same call.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Processed_Text'])

# Convert the matrix to an array and label each column with its feature name.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)


In [None]:
# Save outputs
# Write each frame to its own CSV file, leaving the row index out.
for csv_name, frame in (
    ('processed_text_data.csv', df),
    ('tfidf_representation.csv', tfidf_df),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# Display results
# display() renders each frame with its own rich repr; a bare `df, tfidf_df`
# tuple is shown as one plain-text dump with wrapped, truncated columns.
display(df, tfidf_df)

(                                                Text  Category  \
 0                      I love programming in Python!  Positive   
 1            Natural Language Processing is amazing.  Positive   
 2  Machine learning models require data preproces...   Neutral   
 
                                      Lemmatized_Text  \
 0                      I love programming in Python!   
 1            Natural Language Processing is amazing.   
 2  Machine learning model require data preprocess...   
 
                                       Processed_Text  Category_Label  
 0                         I love programming Python!               1  
 1               Natural Language Processing amazing.               1  
 2  Machine learning model require data preprocess...               0  ,
    amazing      data  language  learning     love   machine     model  \
 0      0.0  0.000000       0.0  0.000000  0.57735  0.000000  0.000000   
 1      0.5  0.000000       0.5  0.000000  0.00000  0.000000  0