<a href="https://colab.research.google.com/github/ShivaShirsath/nlp/blob/master/3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [2]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

# load dataset
!wget https://github.com/PICT-NLP/BE-NLP-Elective/raw/main/3-Preprocessing/News_dataset.pickle
with open('News_dataset.pickle', 'rb') as f:
    dataset = pickle.load(f)
print(dataset)

# convert dataset into pandas dataframe
df = pd.DataFrame(dataset, columns=['text', 'category'])
print(df.head())
# convert text column to string type
df['text'] = df['text'].astype(str)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


--2023-03-08 14:52:58--  https://github.com/PICT-NLP/BE-NLP-Elective/raw/main/3-Preprocessing/News_dataset.pickle
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/PICT-NLP/BE-NLP-Elective/main/3-Preprocessing/News_dataset.pickle [following]
--2023-03-08 14:53:00--  https://raw.githubusercontent.com/PICT-NLP/BE-NLP-Elective/main/3-Preprocessing/News_dataset.pickle
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4980629 (4.7M) [application/octet-stream]
Saving to: ‘News_dataset.pickle’


2023-03-08 14:53:00 (299 MB/s) - ‘News_dataset.pickle’ saved [4980629/4980629]

     File_Name              

In [3]:
# text cleaning
# Normalise case first so later token comparisons are case-insensitive.
df['text'] = df['text'].str.lower()
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would silently leave the text
# uncleaned. Raw strings avoid invalid-escape warnings for \w, \s and \d.
# Strip everything that is neither a word character nor whitespace.
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
# Strip runs of digits.
df['text'] = df['text'].str.replace(r'\d+', '', regex=True)

  df['text'] = df['text'].str.replace('[^\w\s]','')
  df['text'] = df['text'].str.replace('\d+','')


In [4]:
# lemmatization
lemmatizer = WordNetLemmatizer()


def _lemmatize_document(document):
    """Lemmatize every whitespace-separated token and re-join with single spaces."""
    tokens = document.split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


df['text'] = df['text'].apply(_lemmatize_document)

In [5]:
# stop word removal
# A set gives constant-time membership checks for the per-token filter below.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(document):
    """Return the document with every token found in stop_words removed."""
    kept_tokens = []
    for token in document.split():
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


df['text'] = df['text'].apply(_drop_stop_words)
print(df['text'])

0       nan
1       nan
2       nan
3       nan
4       nan
       ... 
2220    nan
2221    nan
2222    nan
2223    nan
2224    nan
Name: text, Length: 2225, dtype: object


In [6]:
# label encoding
# Fit the encoder on the category column and replace that column with the
# integer codes it produces.
encoder = LabelEncoder()
category_codes = encoder.fit_transform(df['category'])
df['category'] = category_codes
print(df['category'])

0       0
1       0
2       0
3       0
4       0
       ..
2220    0
2221    0
2222    0
2223    0
2224    0
Name: category, Length: 2225, dtype: int64


In [7]:
# TF-IDF vectorization
# Learn the vocabulary from the cleaned text column and build the
# document-term matrix in a single pass.
corpus = df['text']
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)
print(X)

  (0, 0)	1.0
  (1, 0)	1.0
  (2, 0)	1.0
  (3, 0)	1.0
  (4, 0)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 0)	1.0
  (12, 0)	1.0
  (13, 0)	1.0
  (14, 0)	1.0
  (15, 0)	1.0
  (16, 0)	1.0
  (17, 0)	1.0
  (18, 0)	1.0
  (19, 0)	1.0
  (20, 0)	1.0
  (21, 0)	1.0
  (22, 0)	1.0
  (23, 0)	1.0
  (24, 0)	1.0
  :	:
  (2200, 0)	1.0
  (2201, 0)	1.0
  (2202, 0)	1.0
  (2203, 0)	1.0
  (2204, 0)	1.0
  (2205, 0)	1.0
  (2206, 0)	1.0
  (2207, 0)	1.0
  (2208, 0)	1.0
  (2209, 0)	1.0
  (2210, 0)	1.0
  (2211, 0)	1.0
  (2212, 0)	1.0
  (2213, 0)	1.0
  (2214, 0)	1.0
  (2215, 0)	1.0
  (2216, 0)	1.0
  (2217, 0)	1.0
  (2218, 0)	1.0
  (2219, 0)	1.0
  (2220, 0)	1.0
  (2221, 0)	1.0
  (2222, 0)	1.0
  (2223, 0)	1.0
  (2224, 0)	1.0


In [8]:
# save outputs
# X.toarray() materialises the feature matrix as a dense array before it is
# written; the labels are written as a plain NumPy array alongside it.
dense_features = X.toarray()
labels = df['category'].to_numpy()
np.save('X.npy', dense_features)
np.save('y.npy', labels)