In [1]:
import os
import re
import string
import warnings
from pathlib import Path

import pandas as pd
from nltk.corpus import stopwords
# Silence every warning category for the rest of the session so cell outputs stay clean.
# NOTE(review): this also hides deprecation and dtype warnings from pandas/sklearn.
warnings.filterwarnings('ignore')


In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Load every post in the corpus, asking sklearn to drop headers, footers and
# quoted replies so that mostly the body text of each post is kept.
newsgroup = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

In [4]:
# Unpack the pieces of the dataset used below: the post bodies, the target
# value of each post, and the list of newsgroup names.
texts, labels, label_names = (
    newsgroup.data,
    newsgroup.target,
    newsgroup.target_names,
)

In [5]:
# Peek at the first raw post to see what the text looks like before any cleaning.
texts[0]

"\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

In [6]:
# Target value stored for the first post.
labels[0]

np.int32(10)

In [7]:
# First entry of the names list (i.e. the name for target 0). This is NOT the
# category of texts[0]; that would be label_names[labels[0]].
label_names[0]

'alt.atheism'

In [8]:
# Sizes of the three sequences: posts, targets, and distinct newsgroup names.
tuple(len(seq) for seq in (texts, labels, label_names))

(18846, 18846, 20)

In [9]:
# Build the base table in a single constructor call: raw post text, integer
# target, and the newsgroup name looked up from that target.
df = pd.DataFrame(
    {
        'text': texts,
        'label': labels.astype(int),
        'category': [label_names[target] for target in labels],
    },
    columns=['text', 'label', 'category'],
)
df.head()

Unnamed: 0,text,label,category
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey
1,My brother is in the market for a high-perform...,3,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,17,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,4,comp.sys.mac.hardware


In [10]:
# Check that the first row of the frame holds the same raw post seen in texts[0].
df['text'][0]

"\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

In [11]:
# Number of distinct newsgroup names present in the frame.
df['category'].nunique()

20

In [12]:
# List the distinct newsgroup names (in order of first appearance).
df['category'].unique()

array(['rec.sport.hockey', 'comp.sys.ibm.pc.hardware',
       'talk.politics.mideast', 'comp.sys.mac.hardware',
       'sci.electronics', 'talk.religion.misc', 'sci.crypt', 'sci.med',
       'alt.atheism', 'rec.motorcycles', 'rec.autos', 'comp.windows.x',
       'comp.graphics', 'sci.space', 'talk.politics.guns', 'misc.forsale',
       'rec.sport.baseball', 'talk.politics.misc',
       'comp.os.ms-windows.misc', 'soc.religion.christian'], dtype=object)

In [13]:
# Show the ASCII punctuation characters from the string module for reference;
# `puncts` itself is not referenced by the later cells visible in this notebook.
puncts = string.punctuation
puncts

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
# Work on an independent copy so the cleaning steps below do not touch df's raw text.
data = df.copy(deep=True)

In [15]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call, since the function is applied row by row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(text: str) -> str:
    """Return ``text`` with all ASCII punctuation characters deleted.

    Characters are removed, not replaced by a space, so tokens joined by
    punctuation are fused (e.g. ``"non-stop"`` becomes ``"nonstop"``).
    Whitespace, including newlines, is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without any character from ``string.punctuation``.
    """
    return text.translate(_PUNCTUATION_TABLE)

def to_lower(text):
    """Return ``text`` with every cased character converted to lowercase."""
    lowered = text.lower()
    return lowered

In [16]:
# Strip punctuation first, then lowercase, as one chained pass over the text column.
data['text'] = data['text'].apply(remove_punctuations).apply(to_lower)
# Inspect the first cleaned post.
data.loc[0, 'text']

'\n\ni am sure some bashers of pens fans are pretty confused about the lack\nof any kind of posts about the recent pens massacre of the devils actually\ni am  bit puzzled too and a bit relieved however i am going to put an end\nto nonpittsburghers relief with a bit of praise for the pens man they\nare killing those devils worse than i thought jagr just showed you why\nhe is much better than his regular season stats he is also a lot\nfo fun to watch in the playoffs bowman should let jagr have a lot of\nfun in the next couple of games since the pens are going to beat the pulp out of jersey anyway i was very disappointed not to see the islanders lose the final\nregular season game          pens rule\n\n'

In [17]:
stop_words = stopwords.words('english')
# Membership tests on a list scan it linearly for every token; a frozenset gives
# constant-time lookups. It is built once from `stop_words`, so rebuild it if
# `stop_words` is modified afterwards.
_STOP_WORD_SET = frozenset(stop_words)


def remove_stopwords(text):
    """Drop stop words from ``text``.

    The text is split on whitespace, tokens that exactly match an entry of
    ``stop_words`` are discarded, and the remaining tokens are re-joined with
    single spaces. As a side effect, runs of whitespace and newlines collapse
    and leading/trailing whitespace disappears.

    NOTE(review): in this notebook punctuation is stripped before this step, so
    a contraction such as "don't" arrives as "dont" and will only be removed if
    that exact form is in the stop-word list. Confirm this is acceptable.

    Args:
        text: The string to filter.

    Returns:
        The space-joined tokens of ``text`` that are not stop words.
    """
    return " ".join(word for word in text.split() if word not in _STOP_WORD_SET)

In [18]:
# Filter stop words out of every post, element by element.
data['text'] = data['text'].map(remove_stopwords)

In [19]:
# Inspect the first post after punctuation removal, lowercasing and stop-word filtering.
data['text'][0]

'sure bashers pens fans pretty confused lack kind posts recent pens massacre devils actually bit puzzled bit relieved however going put end nonpittsburghers relief bit praise pens man killing devils worse thought jagr showed much better regular season stats also lot fo fun watch playoffs bowman let jagr lot fun next couple games since pens going beat pulp jersey anyway disappointed see islanders lose final regular season game pens rule'

In [20]:
# Map each of the 20 fine-grained newsgroups to one of 7 coarse topics.
# The table is written grouped by coarse topic and then inverted into a
# newsgroup -> topic lookup.
merged_labels = {
    newsgroup_name: topic
    for topic, newsgroup_names in {
        'religion': (
            'alt.atheism',
            'soc.religion.christian',
            'talk.religion.misc',
        ),
        'computers': (
            'comp.graphics',
            'comp.os.ms-windows.misc',
            'comp.sys.ibm.pc.hardware',
            'comp.sys.mac.hardware',
            'comp.windows.x',
        ),
        'science': (
            'sci.crypt',
            'sci.electronics',
            'sci.med',
            'sci.space',
        ),
        'vehicles': (
            'rec.autos',
            'rec.motorcycles',
        ),
        'sports': (
            'rec.sport.baseball',
            'rec.sport.hockey',
        ),
        'politics': (
            'talk.politics.guns',
            'talk.politics.mideast',
            'talk.politics.misc',
        ),
        'misc': (
            'misc.forsale',
        ),
    }.items()
    for newsgroup_name in newsgroup_names
}

In [21]:
# Translate each newsgroup name to its coarse topic. Series.map yields NaN for
# any name missing from the dict, so fail loudly instead of letting NaN
# categories flow into the label encoder and the saved CSV.
merged_category = df['category'].map(merged_labels)
unmapped_categories = df.loc[merged_category.isna(), 'category'].unique()
if len(unmapped_categories) > 0:
    raise ValueError(
        f"No merged label defined for categories: {sorted(map(str, unmapped_categories))}"
    )
df['merged_category'] = merged_category

In [22]:
# Preview the frame with the new merged_category column.
df.head()

Unnamed: 0,text,label,category,merged_category
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey,sports
1,My brother is in the market for a high-perform...,3,comp.sys.ibm.pc.hardware,computers
2,\n\n\n\n\tFinally you said what you dream abou...,17,talk.politics.mideast,politics
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,comp.sys.ibm.pc.hardware,computers
4,1) I have an old Jasmine drive which I cann...,4,comp.sys.mac.hardware,computers


In [23]:
from sklearn.preprocessing import LabelEncoder

In [24]:
# Fit the encoder on the coarse topic names and store the resulting integer
# codes; `enc` is kept so the code -> name mapping can be recovered later.
enc = LabelEncoder()
merged_codes = enc.fit_transform(df['merged_category'])
df['merged_label'] = merged_codes

In [25]:
# Preview the frame with the new integer merged_label column.
df.head()

Unnamed: 0,text,label,category,merged_category,merged_label
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey,sports,5
1,My brother is in the market for a high-perform...,3,comp.sys.ibm.pc.hardware,computers,0
2,\n\n\n\n\tFinally you said what you dream abou...,17,talk.politics.mideast,politics,2
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,comp.sys.ibm.pc.hardware,computers,0
4,1) I have an old Jasmine drive which I cann...,4,comp.sys.mac.hardware,computers,0


In [26]:
# Replace the fine-grained label/category in the cleaned frame with the merged
# versions computed on df (values are aligned on the shared row index).
data = data.assign(
    label=df['merged_label'],
    category=df['merged_category'],
)

In [27]:
# Output directory for the prepared data. Set the NEWSGROUP_DATA_DIR environment
# variable to write somewhere else; the fallback is the original local directory.
DATA_DIR = Path(os.environ.get(
    'NEWSGROUP_DATA_DIR',
    'C://Users/siddh/Progamming/Projects/Data_Science_Projects/Newsgroup_Topic_Modeling/data',
))
# Save the cleaned text with merged labels, without the row index column.
data.to_csv(DATA_DIR / 'data.csv', index=False)

In [28]:
# Position of each class in enc.classes_ is its integer code, so enumerating
# the array gives the code -> topic-name lookup.
reverse_mapping = dict(enumerate(enc.classes_))
print(reverse_mapping)

{0: 'computers', 1: 'misc', 2: 'politics', 3: 'religion', 4: 'science', 5: 'sports', 6: 'vehicles'}


In [30]:
import joblib
# Output directory for the prepared data. Set the NEWSGROUP_DATA_DIR environment
# variable to write somewhere else; the fallback is the original local directory.
DATA_DIR = Path(os.environ.get(
    'NEWSGROUP_DATA_DIR',
    'C://Users/siddh/Progamming/Projects/Data_Science_Projects/Newsgroup_Topic_Modeling/data',
))
# Persist the integer code -> topic name mapping so predictions can be decoded later.
joblib.dump(reverse_mapping, DATA_DIR / 'label_mapping.joblib')

['C://Users/siddh/Progamming/Projects/Data_Science_Projects/Newsgroup_Topic_Modeling/data/label_mapping.joblib']