# NLP Assignment 3 - Perform text cleaning, remove stop words, lemmatize, and create representations using TF-IDF

In [1]:
!pip install nltk scikit-learn pandas



In [2]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Fetch the NLTK resources used later in this notebook: the stop-word lists
# read through `stopwords.words(...)` and the WordNet data behind
# `WordNetLemmatizer`.
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): omw-1.4 is presumably needed by the WordNet reader in some NLTK
# versions — confirm it is still required for the installed version.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# 1. Reading the sentiment analysis dataset

In [4]:
# NOTE(review): absolute path under /content — presumably a Colab upload
# location; adjust it when running the notebook elsewhere.
DATA_PATH = '/content/sentiment_analysis.csv'

# Load the raw dataset and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Year,Month,Day,Time of Tweet,text,sentiment,Platform
0,2018,8,18,morning,What a great day!!! Looks like dream.,positive,Twitter
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",positive,Facebook
2,2017,8,18,night,Don't angry me,negative,Facebook
3,2022,6,8,morning,We attend in the class just for listening teac...,negative,Facebook
4,2022,6,8,noon,"Those who want to go, let them go",negative,Instagram


# 2. Cleaning textual data

In [5]:
def clean_text(text):
    """Normalize a raw text value to lowercase a-z words separated by single spaces.

    Parameters
    ----------
    text : str or missing value
        Raw text. Any non-string value (for example the float NaN that pandas
        produces for an empty CSV field, or None) is treated as missing.

    Returns
    -------
    str
        The lowercased text with every character other than a-z and
        whitespace removed, runs of whitespace collapsed to one space, and
        no leading or trailing space. Missing input yields an empty string.
    """
    # Missing cells are not strings and have no .lower(); map them to "" so a
    # single blank row does not abort the whole column-wise apply.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Characters are deleted rather than replaced by a space, so "don't"
    # becomes "dont" and digits vanish without splitting the word.
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation & numbers
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Store the normalized form of each row's raw text in a new column.
df['clean_text'] = df['text'].map(clean_text)

In [6]:
# Preview the first rows to compare the new clean_text column with the raw text.
df.head()

Unnamed: 0,Year,Month,Day,Time of Tweet,text,sentiment,Platform,clean_text
0,2018,8,18,morning,What a great day!!! Looks like dream.,positive,Twitter,what a great day looks like dream
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",positive,Facebook,i feel sorry i miss you here in the sea beach
2,2017,8,18,night,Don't angry me,negative,Facebook,dont angry me
3,2022,6,8,morning,We attend in the class just for listening teac...,negative,Facebook,we attend in the class just for listening teac...
4,2022,6,8,noon,"Those who want to go, let them go",negative,Instagram,those who want to go let them go


# 3. Stop Words

In [7]:
# Build the English stop-word collection once, as a set, for the per-token
# membership test in remove_stopwords.
# NOTE(review): clean_text deletes apostrophes, so contractions reach this step
# as e.g. "dont"; presumably the NLTK list spells them "don't" and therefore
# will not match — confirm whether that is intended.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated token of `text` found in `stop_words`.

    Tokens are compared against the module-level `stop_words` collection
    exactly as written (no case folding happens here). The surviving tokens
    keep their original order and are re-joined with single spaces.
    """
    return " ".join(token for token in text.split() if token not in stop_words)

# Filter stop words out of the cleaned text, keeping the result in its own column.
df['no_stopwords'] = df['clean_text'].map(remove_stopwords)

# 4. Lemmatization using WordNet

In [8]:
# One shared lemmatizer instance, reused by lemmatize_text for every token.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with spaces.

    Every token is handed on its own to the `lemmatize` method of the
    module-level `lemmatizer`, without a part-of-speech argument. Token order
    is preserved.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

# Lemmatize the stop-word-free text; this column is the input to TF-IDF below.
df['lemmatized_text'] = df['no_stopwords'].map(lemmatize_text)

# 5. Label Encoding of the Sentiment column

In [10]:
# Fit the encoder on the sentiment column and keep it around so the integer
# codes can be mapped back to their class names later.
label_encoder = LabelEncoder()
encoded_sentiment = label_encoder.fit_transform(df['sentiment'])
df['label_encoded'] = encoded_sentiment

# 6. TF-IDF Vectorizer

In [11]:
# Vectorizer settings: n-gram range (1, 2) with the feature count capped at 5000.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Fit on the lemmatized text and transform it in the same call.
X_tfidf = tfidf.fit_transform(df['lemmatized_text'])

In [12]:
# Label each TF-IDF column with the vectorizer's feature name and wrap the
# array form of the matrix in a DataFrame.
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(data=X_tfidf.toarray(), columns=feature_names)

# 7. Saving the output to CSV files

In [13]:
# Each output file paired with the frame written to it; all are saved
# without the index column, in the order listed.
outputs = {
    "processed_text_data.csv": df,
    "tfidf_features.csv": tfidf_df,
    "labels.csv": df[['label_encoded']],
}

for filename, frame in outputs.items():
    frame.to_csv(filename, index=False)
