<a href="https://colab.research.google.com/github/ShivaShirsath/nlp/blob/master/3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [3]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

# load dataset
!wget https://github.com/PICT-NLP/BE-NLP-Elective/raw/main/3-Preprocessing/News_dataset.pickle
with open('News_dataset.pickle', 'rb') as f:
    dataset = pickle.load(f)
print(dataset)

# convert dataset into pandas dataframe
df = pd.DataFrame(dataset, columns=['Content', 'category'])
print(df.head())
# convert text column to string type
df['Content'] = df['Content'].astype(str)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


--2023-03-08 16:56:59--  https://github.com/PICT-NLP/BE-NLP-Elective/raw/main/3-Preprocessing/News_dataset.pickle
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/PICT-NLP/BE-NLP-Elective/main/3-Preprocessing/News_dataset.pickle [following]
--2023-03-08 16:57:01--  https://raw.githubusercontent.com/PICT-NLP/BE-NLP-Elective/main/3-Preprocessing/News_dataset.pickle
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4980629 (4.7M) [application/octet-stream]
Saving to: ‘News_dataset.pickle.1’


2023-03-08 16:57:01 (92.5 MB/s) - ‘News_dataset.pickle.1’ saved [4980629/4980629]

     File_Name         

In [4]:
# text cleaning
df['Content'] = df['Content'].str.lower()
df['Content'] = df['Content'].str.replace('[^\w\s]','')
df['Content'] = df['Content'].str.replace('\d+','')

  df['Content'] = df['Content'].str.replace('[^\w\s]','')
  df['Content'] = df['Content'].str.replace('\d+','')


In [5]:
# lemmatization
lemmatizer = WordNetLemmatizer()
df['Content'] = df['Content'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

In [6]:
# stop word removal
stop_words = set(stopwords.words('english'))
df['Content'] = df['Content'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
print(df['Content'])

0       ad sale boost time warner profit quarterly pro...
1       dollar gain greenspan speech dollar ha hit hig...
2       yukos unit buyer face loan claim owner embattl...
3       high fuel price hit ba profit british airway h...
4       pernod takeover talk lift domecq share uk drin...
                              ...                        
2220    bt program beat dialler scam bt introducing tw...
2221    spam email tempt net shopper computer user acr...
2222    careful code new european directive could put ...
2223    u cyber security chief resigns man making sure...
2224    losing online gaming online role playing game ...
Name: Content, Length: 2225, dtype: object


In [7]:
# label encoding
encoder = LabelEncoder()
df['category'] = encoder.fit_transform(df['category'])
print(df['category'])

0       0
1       0
2       0
3       0
4       0
       ..
2220    0
2221    0
2222    0
2223    0
2224    0
Name: category, Length: 2225, dtype: int64


In [8]:
# TF-IDF vectorization
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['Content'])
print(X)

  (0, 26414)	0.033779095839948876
  (0, 14531)	0.035679398862105735
  (0, 2834)	0.03497465413990564
  (0, 20616)	0.0333300422119466
  (0, 8209)	0.056540162120040066
  (0, 23528)	0.12250717057240096
  (0, 19589)	0.043941859745335654
  (0, 2381)	0.07119523633370005
  (0, 19546)	0.04807566061589116
  (0, 16330)	0.02877989870995396
  (0, 10065)	0.03768197016859518
  (0, 26960)	0.020738517863256638
  (0, 311)	0.06143631265777171
  (0, 12499)	0.05074147624778164
  (0, 19217)	0.03609651563636007
  (0, 20681)	0.04315174321728093
  (0, 14016)	0.03205054694793624
  (0, 1452)	0.04577043430820242
  (0, 22174)	0.04366836388755726
  (0, 16553)	0.03162746912538339
  (0, 876)	0.03425206447384327
  (0, 8166)	0.04054915173228801
  (0, 25817)	0.043941859745335654
  (0, 20876)	0.037888559831885604
  (0, 6043)	0.055375890220680846
  :	:
  (2224, 17224)	0.02221419561911395
  (2224, 18008)	0.014226840610686036
  (2224, 163)	0.010337060435573104
  (2224, 1359)	0.016345009186111755
  (2224, 10586)	0.0180843391

In [12]:
# save outputs
np.save('X.npy', X.toarray())
np.save('y.npy', df['category'].to_numpy())
print("Saved : ", np.load('X.npy'), sep='\n')

Saved : 
[[0.         0.         0.         ... 0.02701782 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.03502724 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.02526516 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
