In [1]:
import pandas as pd

df = pd.read_csv("labeled_news_data.csv")
df.to_csv("final_labeled_news_data.csv", index=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3448 entries, 0 to 3447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    3448 non-null   object
 1   Label   3448 non-null   object
dtypes: object(2)
memory usage: 54.0+ KB


In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

nltk.download("stopwords")
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
stop_words = set(stopwords.words("english"))


def clean_text(text):
    text = str(text).lower() 
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  
    text = re.sub(r"[^a-zA-Z\s]", "", text)  
    text = re.sub(r"\s+", " ", text).strip()  
    tokens = word_tokenize(text)  
    filtered_text = " ".join([word for word in tokens if word not in stop_words])  # Remove stopwords
    return filtered_text


df["Cleaned_Text"] = df["Text"].apply(clean_text)

df.to_csv("cleaned_news_data.csv", index=False)

df.head()


Unnamed: 0,Text,Label,Cleaned_Text
0,Plastic On The Mind: Assessing the Risks From ...,Unlabeled,plastic mind assessing risks micro nanoplastic...
1,Scenic loch becomes magnet for Scotland's plas...,Unlabeled,scenic loch becomes magnet scotlands plastic w...
2,Trump rolls back standards for water-using app...,Unlabeled,trump rolls back standards waterusing applianc...
3,Why Aren’t We Losing Our Minds Over the Plasti...,Unlabeled,arent losing minds plastic brains brains full ...
4,36 Products That’ll Help Get You Through TSA F...,Unlabeled,products thatll help get tsa faster cadence aa...


In [5]:
df["Cleaned_Text"] = df["Text"].apply(clean_text)

In [6]:
Clean_df = df[["Cleaned_Text", "Label"]]


Clean_df.to_csv("cleaned_news_data.csv", index=False)

In [7]:
Clean_df

Unnamed: 0,Cleaned_Text,Label
0,plastic mind assessing risks micro nanoplastic...,Unlabeled
1,scenic loch becomes magnet scotlands plastic w...,Unlabeled
2,trump rolls back standards waterusing applianc...,Unlabeled
3,arent losing minds plastic brains brains full ...,Unlabeled
4,products thatll help get tsa faster cadence aa...,Unlabeled
...,...,...
3443,new green revolution crop genetics vision peop...,Yes
3444,cant deny nuclear weapons stupid risk israeli ...,Unlabeled
3445,us study shows gm crops better twoyear study u...,Unlabeled
3446,monsanto hints uturn gm food britain monsanto ...,Unlabeled


In [8]:
stemmer = PorterStemmer()

def apply_stemming(text):
    tokens = word_tokenize(str(text))
    return " ".join([stemmer.stem(word) for word in tokens])


df_stemmed = Clean_df.copy()
df_stemmed["Stemmed_Text"] = df_stemmed["Cleaned_Text"].apply(apply_stemming)


df_stemmed = df_stemmed[["Stemmed_Text", "Label"]]
df_stemmed.to_csv("stemmed_news_data.csv", index=False)


df_stemmed.head()


Unnamed: 0,Stemmed_Text,Label
0,plastic mind assess risk micro nanoplast perha...,Unlabeled
1,scenic loch becom magnet scotland plastic wast...,Unlabeled
2,trump roll back standard waterus applianc ligh...,Unlabeled
3,arent lose mind plastic brain brain full plast...,Unlabeled
4,product thatll help get tsa faster cadenc aapi...,Unlabeled


In [9]:
lemmatizer = WordNetLemmatizer()


def apply_lemmatization(text):
    tokens = word_tokenize(str(text))
    return " ".join([lemmatizer.lemmatize(word) for word in tokens])


df_lemmatized = Clean_df.copy()
df_lemmatized["Lemmatized_Text"] = df_lemmatized["Cleaned_Text"].apply(apply_lemmatization)


df_lemmatized = df_lemmatized[["Lemmatized_Text", "Label"]]
df_lemmatized.to_csv("lemmatized_news_data.csv", index=False)

df_lemmatized.head()


Unnamed: 0,Lemmatized_Text,Label
0,plastic mind assessing risk micro nanoplastics...,Unlabeled
1,scenic loch becomes magnet scotland plastic wa...,Unlabeled
2,trump roll back standard waterusing appliance ...,Unlabeled
3,arent losing mind plastic brain brain full pla...,Unlabeled
4,product thatll help get tsa faster cadence aap...,Unlabeled


In [10]:
vectorizer = CountVectorizer(max_df=0.85, min_df=5, max_features=5000, stop_words="english")

X_counts = vectorizer.fit_transform(Clean_df["Cleaned_Text"])


df_countvectorizer = pd.DataFrame(X_counts.toarray(), columns=vectorizer.get_feature_names_out())

df_countvectorizer["Label"] = Clean_df["Label"].values

df_countvectorizer.to_csv("countvectorizer_news_data.csv", index=False)


df_countvectorizer.head()

Unnamed: 0,abandon,abandoned,ability,able,abroad,absence,absolute,absolutely,absorb,abundance,...,youth,youve,zealand,zero,zerocarbon,zerowaste,zone,zones,zoo,Label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Unlabeled
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Unlabeled
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Unlabeled
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Unlabeled
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Unlabeled


In [11]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, min_df=5, max_features=5000, stop_words="english")

X_tfidf = tfidf_vectorizer.fit_transform(Clean_df["Cleaned_Text"])

df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


df_tfidf["Label"] = Clean_df["Label"].values


df_tfidf.to_csv("tfidf_news_data.csv", index=False)


df_tfidf.head()


Unnamed: 0,abandon,abandoned,ability,able,abroad,absence,absolute,absolutely,absorb,abundance,...,youth,youve,zealand,zero,zerocarbon,zerowaste,zone,zones,zoo,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Unlabeled
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Unlabeled
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Unlabeled
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Unlabeled
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Unlabeled
