In [1]:
# Loading Packages
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import pandas as pd


In [10]:
%%time

import os

docs=[]
labels=[]
label_index={}

PATH=os.getcwd()

text_data_dir=os.path.join(PATH,'20_newsgroups')
for name in os.listdir(text_data_dir):
    path=os.path.join(text_data_dir,name)
    if os.path.isdir(path):
        label_id=len(label_index)
        label_index[label_id]=name
        for fname in sorted(os.listdir(path)):
            fpath=os.path.join(path,fname)
            f=open(fpath,encoding="ISO-8859-1")
            t=f.read()
            docs.append(t)
            f.close()
            labels.append(label_id)

print('Found %s docs.' %len(docs))
        


Found 19997 docs.
Wall time: 2.17 s


In [14]:
label_index

{0: 'alt.atheism',
 1: 'comp.graphics',
 2: 'comp.os.ms-windows.misc',
 3: 'comp.sys.ibm.pc.hardware',
 4: 'comp.sys.mac.hardware',
 5: 'comp.windows.x',
 6: 'misc.forsale',
 7: 'rec.autos',
 8: 'rec.motorcycles',
 9: 'rec.sport.baseball',
 10: 'rec.sport.hockey',
 11: 'sci.crypt',
 12: 'sci.electronics',
 13: 'sci.med',
 14: 'sci.space',
 15: 'soc.religion.christian',
 16: 'talk.politics.guns',
 17: 'talk.politics.mideast',
 18: 'talk.politics.misc',
 19: 'talk.religion.misc'}

In [28]:
data=pd.DataFrame(docs)

In [30]:
data.head()

Unnamed: 0,0
0,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...
1,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...
2,Newsgroups: alt.atheism\nPath: cantaloupe.srv....
3,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...


In [31]:
data['target']=labels

In [34]:
data.columns=['text','target']

In [35]:
data.shape

(19997, 2)

In [36]:
#Duplicating the original text extracted before proceeeding with preprocessing steps

import copy
print(type(data['text']))
original_data = copy.deepcopy(data)
print(data.keys())
print(original_data.keys())

<class 'pandas.core.series.Series'>
Index(['text', 'target'], dtype='object')
Index(['text', 'target'], dtype='object')


## Basic cleaning of text

### LowerCase all text

In [37]:
data['text'] = [text.strip().lower() for text in data['text']]
data['text'][:10]

0    xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...
1    xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...
2    newsgroups: alt.atheism\npath: cantaloupe.srv....
3    xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...
4    xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...
5    newsgroups: alt.atheism\npath: cantaloupe.srv....
6    path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....
7    newsgroups: alt.atheism\npath: cantaloupe.srv....
8    path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....
9    path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....
Name: text, dtype: object

## Defining the functions to perform basic steps like 

- **expanding contractions**
 
- **remove accented characters**

- **scrub words**

In [103]:
type(data['text'])

list

###  Handling contractions 

In [39]:
contractions = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [40]:
def expand_contractions(text):
    for word in text.split():
        if word.lower() in contractions:
            text = text.replace(word, contractions[word.lower()])
    return text

In [42]:
import re
data['text'] = [expand_contractions(re.sub('’', "'", text)) for text in data['text']]
data['text'][1]

'xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51060 alt.atheism.moderated:727 news.answers:7300 alt.answers:155\npath: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!agate!netsys!ibmpcug!mantis!mathew\nfrom: mathew <mathew@mantis.co.uk>\nnewsgroups: alt.atheism,alt.atheism.moderated,news.answers,alt.answers\nsubject: alt.atheism faq: introduction to atheism\nsummary: please read this file before posting to alt.atheism\nkeywords: faq, atheism\nmessage-id: <19930405122245@mantis.co.uk>\ndate: mon, 5 apr 1993 12:22:45 gmt\nexpires: thu, 6 may 1993 12:22:45 gmt\nfollowup-to: alt.atheism\ndistribution: world\norganization: mantis consultants, cambridge. uk.\napproved: news-answers-request@mit.edu\nsupersedes: <19930308134439@mantis.co.uk>\nlines: 646\n\narchive-name: atheism/introduction\nalt-atheism-archive-name: introduction\nlast-modified: 5 april 1993\nversion: 1.2\n\n-----begin pgp signed message-----\n\n                  

## Invoking the remove_accented_chars() function

In [43]:
data['text'][2]

'newsgroups: alt.atheism\npath: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!usc!sdd.hp.com!nigel.msen.com!yale.edu!ira.uka.de!news.dfn.de!tubsibr!dbstu1.rz.tu-bs.de!i3150101\nfrom: i3150101@dbstu1.rz.tu-bs.de (benedikt rosenau)\nsubject: re: gospel dating\nmessage-id: <16ba711b3a.i3150101@dbstu1.rz.tu-bs.de>\nsender: postnntp@ibr.cs.tu-bs.de (mr. nntp inews entry)\norganization: technical university braunschweig, germany\nreferences: <16ba1e197.i3150101@dbstu1.rz.tu-bs.de> <65974@mimsy.umd.edu>\ndate: mon, 5 apr 1993 19:08:25 gmt\nlines: 93\n\nin article <65974@mimsy.umd.edu>\nmangoe@cs.umd.edu (charley wingate) writes:\n \n>>well, john has a quite different, not necessarily more elaborated theology.\n>>there is some evidence that he must have known luke, and that the content\n>>of q was known to him, but not in a \'canonized\' form.\n>\n>this is a new argument to me.  could you elaborate a little?\n>\n \nthe argument 

In [44]:
import unicodedata
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    #https://docs.python.org/2/library/unicodedata.html
    return text

In [45]:
data['text'] = [remove_accented_chars(text) for text in data['text']]
data['text'][2]

'newsgroups: alt.atheism\npath: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!usc!sdd.hp.com!nigel.msen.com!yale.edu!ira.uka.de!news.dfn.de!tubsibr!dbstu1.rz.tu-bs.de!i3150101\nfrom: i3150101@dbstu1.rz.tu-bs.de (benedikt rosenau)\nsubject: re: gospel dating\nmessage-id: <16ba711b3a.i3150101@dbstu1.rz.tu-bs.de>\nsender: postnntp@ibr.cs.tu-bs.de (mr. nntp inews entry)\norganization: technical university braunschweig, germany\nreferences: <16ba1e197.i3150101@dbstu1.rz.tu-bs.de> <65974@mimsy.umd.edu>\ndate: mon, 5 apr 1993 19:08:25 gmt\nlines: 93\n\nin article <65974@mimsy.umd.edu>\nmangoe@cs.umd.edu (charley wingate) writes:\n \n>>well, john has a quite different, not necessarily more elaborated theology.\n>>there is some evidence that he must have known luke, and that the content\n>>of q was known to him, but not in a \'canonized\' form.\n>\n>this is a new argument to me.  could you elaborate a little?\n>\n \nthe argument 

## Invoking various scrub functions

In [46]:
def scrub_words(text):
    #Replace \xao characters in text
    text = re.sub('\xa0', ' ', text)
    
    #Replace non ascii / not words and digits
    text = re.sub("(\\W|\\d)",' ',text)
    
    #Replace new line characters and following text untill space
    text = re.sub('\n(\w*?)[\s]', '', text)
    
    #Remove html markup
    text = re.sub("<.*?>", ' ', text)
    
    #Remove extra spaces from the text
    text = re.sub("\s+", ' ', text)
    return text

In [48]:
data['text'] = [scrub_words(text) for text in data['text']]
data['text'][1]

'xref cantaloupe srv cs cmu edu alt atheism alt atheism moderated news answers alt answers path cantaloupe srv cs cmu edu crabapple srv cs cmu edu fs ece cmu edu europa eng gtefsd com howland reston ans net agate netsys ibmpcug mantis mathew from mathew mathew mantis co uk newsgroups alt atheism alt atheism moderated news answers alt answers subject alt atheism faq introduction to atheism summary please read this file before posting to alt atheism keywords faq atheism message id mantis co uk date mon apr gmt expires thu may gmt followup to alt atheism distribution world organization mantis consultants cambridge uk approved news answers request mit edu supersedes mantis co uk lines archive name atheism introduction alt atheism archive name introduction last modified april version begin pgp signed message an introduction to atheism by mathew mathew mantis co uk this article attempts to provide a general introduction to atheism whilst i have tried to be as neutral as possible regarding co

## Checking the integrity of the data after initial preprocessing steps

In [49]:
print("Data Type: ",type(original_data['text']))
print("Data Type: ",type(data['text']))

print("Length of data: ",len(original_data['text']))
print("Length of data: ",len(data['text']))

print("Original data: \n",original_data['text'][0])
print("\n\n**************************************************************************\n\n")
print("Clean data: \n",data['text'][0])

Data Type:  <class 'pandas.core.series.Series'>
Data Type:  <class 'pandas.core.series.Series'>
Length of data:  19997
Length of data:  19997
Original data: 
 Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49960 alt.atheism.moderated:713 news.answers:7054 alt.answers:126
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!bb3.andrew.cmu.edu!news.sei.cmu.edu!cis.ohio-state.edu!magnus.acs.ohio-state.edu!usenet.ins.cwru.edu!agate!spool.mu.edu!uunet!pipex!ibmpcug!mantis!mathew
From: mathew <mathew@mantis.co.uk>
Newsgroups: alt.atheism,alt.atheism.moderated,news.answers,alt.answers
Subject: Alt.Atheism FAQ: Atheist Resources
Summary: Books, addresses, music -- anything related to atheism
Keywords: FAQ, atheism, books, music, fiction, addresses, contacts
Message-ID: <19930329115719@mantis.co.uk>
Date: Mon, 29 Mar 1993 11:57:19 GMT
Expires: Thu, 29 Apr 1993 11:57:19 GMT
Followup-To: alt.atheism
Distribution: world
Organization: Mantis Consultants, Cambridge. UK.
Approved: news-answers-reque

In [50]:
print("Original data: \n",original_data['text'][1])
print("\n\n**************************************************************************\n\n")
print("Clean data: \n",data['text'][1])

Original data: 
 Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51060 alt.atheism.moderated:727 news.answers:7300 alt.answers:155
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!agate!netsys!ibmpcug!mantis!mathew
From: mathew <mathew@mantis.co.uk>
Newsgroups: alt.atheism,alt.atheism.moderated,news.answers,alt.answers
Subject: Alt.Atheism FAQ: Introduction to Atheism
Summary: Please read this file before posting to alt.atheism
Keywords: FAQ, atheism
Message-ID: <19930405122245@mantis.co.uk>
Date: Mon, 5 Apr 1993 12:22:45 GMT
Expires: Thu, 6 May 1993 12:22:45 GMT
Followup-To: alt.atheism
Distribution: world
Organization: Mantis Consultants, Cambridge. UK.
Approved: news-answers-request@mit.edu
Supersedes: <19930308134439@mantis.co.uk>
Lines: 646

Archive-name: atheism/introduction
Alt-atheism-archive-name: introduction
Last-modified: 5 April 1993
Version: 1.2

-----BEGIN PGP SIGNED MESSAGE-----

                          

# Text Preprocessing

#### Adding new column "word_count" which specifies the number of tokens in each document

In [51]:
data['word_count'] = [len(text.split(' ')) for text in data['text']]
pd.DataFrame(data['word_count']).describe()

Unnamed: 0,word_count
count,19997.0
mean,369.937991
std,724.689806
min,48.0
25%,178.0
50%,252.0
75%,372.0
max,39436.0


#### Converting the dictionary to Dataframe 

Converting dictionary to dataframe as pandas provide better and readable subsetting options

In [52]:
data.keys()

Index(['text', 'target', 'word_count'], dtype='object')

In [53]:
data.head()

Unnamed: 0,text,target,word_count
0,xref cantaloupe srv cs cmu edu alt atheism alt...,0,1772
1,xref cantaloupe srv cs cmu edu alt atheism alt...,0,5425
2,newsgroups alt atheism path cantaloupe srv cs ...,0,806
3,xref cantaloupe srv cs cmu edu alt atheism alt...,0,325
4,xref cantaloupe srv cs cmu edu alt atheism soc...,0,206


In [54]:
news_df = pd.DataFrame(data)
print("Shape: ",news_df.shape)
news_df.head(5)

Shape:  (19997, 3)


Unnamed: 0,text,target,word_count
0,xref cantaloupe srv cs cmu edu alt atheism alt...,0,1772
1,xref cantaloupe srv cs cmu edu alt atheism alt...,0,5425
2,newsgroups alt atheism path cantaloupe srv cs ...,0,806
3,xref cantaloupe srv cs cmu edu alt atheism alt...,0,325
4,xref cantaloupe srv cs cmu edu alt atheism soc...,0,206


#### Removing all the blogs with words_count value less than first quartile (25%) of words_count attribute

In [55]:
## Getting the first quartile value
q1 = np.percentile(news_df.word_count,25)
print(f"The first quartile value of words_count attribute is {q1}")

The first quartile value of words_count attribute is 178.0


In [56]:
news_df = news_df[news_df['word_count'] > q1]
print(f"The shape of trimmed blogs dataframe is {news_df.shape}")

The shape of trimmed blogs dataframe is (14996, 3)


#### Converting dataframe back to dictionary

In [57]:
data = news_df.reset_index().to_dict(orient='list')
print(f"The keys in the dictionary are {data.keys()}")

The keys in the dictionary are dict_keys(['index', 'text', 'target', 'word_count'])


In [58]:
print(data['text'][5])

newsgroups alt atheism path cantaloupe srv cs cmu edu crabapple srv cs cmu edu fs ece cmu edu europa eng gtefsd com howland reston ans net usc sdd hp com nigel msen com yale edu ira uka de news dfn de tubsibr dbstu rz tu bs de i from i dbstu rz tu bs de benedikt rosenau subject re a visit from the jehovah s witnesses message id ba ef i dbstu rz tu bs de sender postnntp ibr cs tu bs de mr nntp inews entry organization technical university braunschweig germany references bskendigc kd z cdc netcom com p v ainn e matt ksu ksu edu ba da i dbstu rz tu bs de apr batman bmd trw com date mon apr gmt lines in article apr batman bmd trw com jbrown batman bmd trw com writes did not you say lucifer was created with a perfect nature yes define perfect then i think you are playing the usual game here make sweeping statements like omni holy or perfect and do not note that they mean exactly what they say and that says that you must not use this terms when it leads to contradictions i m not trying to pl

In [59]:
type(data['text'])

list

## Stopwords, stemming, and tokenizing

In [61]:
#!conda install -c conda-forge spacy
#!python -m spacy download en_core_web_sm
#!pip install -U spacy
import spacy
nlp = spacy.load("en_core_web_sm")

# import en_core_web_sm
#nlp = en_core_web_sm.load()

In [63]:
data['text'][1]

'xref cantaloupe srv cs cmu edu alt atheism alt atheism moderated news answers alt answers path cantaloupe srv cs cmu edu crabapple srv cs cmu edu fs ece cmu edu europa eng gtefsd com howland reston ans net agate netsys ibmpcug mantis mathew from mathew mathew mantis co uk newsgroups alt atheism alt atheism moderated news answers alt answers subject alt atheism faq introduction to atheism summary please read this file before posting to alt atheism keywords faq atheism message id mantis co uk date mon apr gmt expires thu may gmt followup to alt atheism distribution world organization mantis consultants cambridge uk approved news answers request mit edu supersedes mantis co uk lines archive name atheism introduction alt atheism archive name introduction last modified april version begin pgp signed message an introduction to atheism by mathew mathew mantis co uk this article attempts to provide a general introduction to atheism whilst i have tried to be as neutral as possible regarding co

In [65]:
## Adding Custom stopwords to the spacy stopword list
customize_stop_words = ['xref']

for w in customize_stop_words:
    nlp.vocab[w].is_stop = True

In [66]:
## It might be surprising, but spaCy doesn't contain any function for stemming as it relies on lemmatization only. 
## Therefore, in this section, we will use NLTK for stemming.

## load nltk's SnowballStemmer as variable 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [67]:
# Here I define a tokenizer and stemmer which returns the set of stems (excluding stop words) in the text that it is passed

def tokenize_and_stem(doc, remove_stopwords = True):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    if remove_stopwords:
        tokens = [word.text for word in doc if not word.is_stop]
    else:
        tokens = [word.text for word in doc]

    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_and_lemmatize(doc, remove_stopwords = True):
    
    if remove_stopwords:
        tokens = [word for word in doc if not word.is_stop]
    else:
        tokens = [word for word in doc]
        
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token.text):
            filtered_tokens.append(token)
            
    lemma = [t.lemma_ for t in filtered_tokens]
    return lemma


def tokenize_only(doc, remove_stopwords = True):
    
    if remove_stopwords:
        tokens = [word.text for word in doc if not word.is_stop]
    else:
        tokens = [word.text for word in doc]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [68]:
%%time
## We are trying to create four seperate lists for text with stop words, text without stop words,
## text with stemmed words and text with lemmatized words.

## Naming Conventions followed ####

## 'clean' word is appended to lists which do not contain stopwords

## 'all' keyword is appended to lists which contain stopwords.

## use extend so it's a big flat list of vocab

data['clean_text_stemmed'] = []
data['clean_text_lemmatized'] = []
data['text_stemmed'] = []
data['text_lemmatized'] = []

vocab_stemmed = []
allvocab_stemmed =[]

vocab_tokenized = []
allvocab_tokenized = []

vocab_lemmatized = []
allvocab_lemmatized = []


for idx,text in enumerate(data['text']):

    doc = nlp(text)
    print(f"processing {idx} document")
    words_stemmed = tokenize_and_stem(doc)
    vocab_stemmed.extend(words_stemmed)
    data['clean_text_stemmed'].append(words_stemmed)
        
    words_lemmatized = tokenize_and_lemmatize(doc)
    vocab_lemmatized.extend(words_lemmatized)
    data['clean_text_lemmatized'].append(words_lemmatized)
    
       
    allwords_stemmed = tokenize_and_stem(doc, False)
    allvocab_stemmed.extend(allwords_stemmed)
    data['text_stemmed'].append(allwords_stemmed)
    
    allwords_lemmatized = tokenize_and_lemmatize(doc, False)
    allvocab_lemmatized.extend(allwords_lemmatized)
    data['text_lemmatized'].append(allwords_lemmatized)
    
    allwords_tokenized = tokenize_only(doc,False)
    allvocab_tokenized.extend(allwords_tokenized)
    
    words_tokenized = tokenize_only(doc)
    vocab_tokenized.extend(words_tokenized)

processing 0 document
processing 1 document
processing 2 document
processing 3 document
processing 4 document
processing 5 document
processing 6 document
processing 7 document
processing 8 document
processing 9 document
processing 10 document
processing 11 document
processing 12 document
processing 13 document
processing 14 document
processing 15 document
processing 16 document
processing 17 document
processing 18 document
processing 19 document
processing 20 document
processing 21 document
processing 22 document
processing 23 document
processing 24 document
processing 25 document
processing 26 document
processing 27 document
processing 28 document
processing 29 document
processing 30 document
processing 31 document
processing 32 document
processing 33 document
processing 34 document
processing 35 document
processing 36 document
processing 37 document
processing 38 document
processing 39 document
processing 40 document
processing 41 document
processing 42 document
processing 43 documen

processing 686 document
processing 687 document
processing 688 document
processing 689 document
processing 690 document
processing 691 document
processing 692 document
processing 693 document
processing 694 document
processing 695 document
processing 696 document
processing 697 document
processing 698 document
processing 699 document
processing 700 document
processing 701 document
processing 702 document
processing 703 document
processing 704 document
processing 705 document
processing 706 document
processing 707 document
processing 708 document
processing 709 document
processing 710 document
processing 711 document
processing 712 document
processing 713 document
processing 714 document
processing 715 document
processing 716 document
processing 717 document
processing 718 document
processing 719 document
processing 720 document
processing 721 document
processing 722 document
processing 723 document
processing 724 document
processing 725 document
processing 726 document
processing 727 d

processing 1353 document
processing 1354 document
processing 1355 document
processing 1356 document
processing 1357 document
processing 1358 document
processing 1359 document
processing 1360 document
processing 1361 document
processing 1362 document
processing 1363 document
processing 1364 document
processing 1365 document
processing 1366 document
processing 1367 document
processing 1368 document
processing 1369 document
processing 1370 document
processing 1371 document
processing 1372 document
processing 1373 document
processing 1374 document
processing 1375 document
processing 1376 document
processing 1377 document
processing 1378 document
processing 1379 document
processing 1380 document
processing 1381 document
processing 1382 document
processing 1383 document
processing 1384 document
processing 1385 document
processing 1386 document
processing 1387 document
processing 1388 document
processing 1389 document
processing 1390 document
processing 1391 document
processing 1392 document


processing 2007 document
processing 2008 document
processing 2009 document
processing 2010 document
processing 2011 document
processing 2012 document
processing 2013 document
processing 2014 document
processing 2015 document
processing 2016 document
processing 2017 document
processing 2018 document
processing 2019 document
processing 2020 document
processing 2021 document
processing 2022 document
processing 2023 document
processing 2024 document
processing 2025 document
processing 2026 document
processing 2027 document
processing 2028 document
processing 2029 document
processing 2030 document
processing 2031 document
processing 2032 document
processing 2033 document
processing 2034 document
processing 2035 document
processing 2036 document
processing 2037 document
processing 2038 document
processing 2039 document
processing 2040 document
processing 2041 document
processing 2042 document
processing 2043 document
processing 2044 document
processing 2045 document
processing 2046 document


processing 2661 document
processing 2662 document
processing 2663 document
processing 2664 document
processing 2665 document
processing 2666 document
processing 2667 document
processing 2668 document
processing 2669 document
processing 2670 document
processing 2671 document
processing 2672 document
processing 2673 document
processing 2674 document
processing 2675 document
processing 2676 document
processing 2677 document
processing 2678 document
processing 2679 document
processing 2680 document
processing 2681 document
processing 2682 document
processing 2683 document
processing 2684 document
processing 2685 document
processing 2686 document
processing 2687 document
processing 2688 document
processing 2689 document
processing 2690 document
processing 2691 document
processing 2692 document
processing 2693 document
processing 2694 document
processing 2695 document
processing 2696 document
processing 2697 document
processing 2698 document
processing 2699 document
processing 2700 document


processing 3315 document
processing 3316 document
processing 3317 document
processing 3318 document
processing 3319 document
processing 3320 document
processing 3321 document
processing 3322 document
processing 3323 document
processing 3324 document
processing 3325 document
processing 3326 document
processing 3327 document
processing 3328 document
processing 3329 document
processing 3330 document
processing 3331 document
processing 3332 document
processing 3333 document
processing 3334 document
processing 3335 document
processing 3336 document
processing 3337 document
processing 3338 document
processing 3339 document
processing 3340 document
processing 3341 document
processing 3342 document
processing 3343 document
processing 3344 document
processing 3345 document
processing 3346 document
processing 3347 document
processing 3348 document
processing 3349 document
processing 3350 document
processing 3351 document
processing 3352 document
processing 3353 document
processing 3354 document


processing 3969 document
processing 3970 document
processing 3971 document
processing 3972 document
processing 3973 document
processing 3974 document
processing 3975 document
processing 3976 document
processing 3977 document
processing 3978 document
processing 3979 document
processing 3980 document
processing 3981 document
processing 3982 document
processing 3983 document
processing 3984 document
processing 3985 document
processing 3986 document
processing 3987 document
processing 3988 document
processing 3989 document
processing 3990 document
processing 3991 document
processing 3992 document
processing 3993 document
processing 3994 document
processing 3995 document
processing 3996 document
processing 3997 document
processing 3998 document
processing 3999 document
processing 4000 document
processing 4001 document
processing 4002 document
processing 4003 document
processing 4004 document
processing 4005 document
processing 4006 document
processing 4007 document
processing 4008 document


processing 4623 document
processing 4624 document
processing 4625 document
processing 4626 document
processing 4627 document
processing 4628 document
processing 4629 document
processing 4630 document
processing 4631 document
processing 4632 document
processing 4633 document
processing 4634 document
processing 4635 document
processing 4636 document
processing 4637 document
processing 4638 document
processing 4639 document
processing 4640 document
processing 4641 document
processing 4642 document
processing 4643 document
processing 4644 document
processing 4645 document
processing 4646 document
processing 4647 document
processing 4648 document
processing 4649 document
processing 4650 document
processing 4651 document
processing 4652 document
processing 4653 document
processing 4654 document
processing 4655 document
processing 4656 document
processing 4657 document
processing 4658 document
processing 4659 document
processing 4660 document
processing 4661 document
processing 4662 document


processing 5277 document
processing 5278 document
processing 5279 document
processing 5280 document
processing 5281 document
processing 5282 document
processing 5283 document
processing 5284 document
processing 5285 document
processing 5286 document
processing 5287 document
processing 5288 document
processing 5289 document
processing 5290 document
processing 5291 document
processing 5292 document
processing 5293 document
processing 5294 document
processing 5295 document
processing 5296 document
processing 5297 document
processing 5298 document
processing 5299 document
processing 5300 document
processing 5301 document
processing 5302 document
processing 5303 document
processing 5304 document
processing 5305 document
processing 5306 document
processing 5307 document
processing 5308 document
processing 5309 document
processing 5310 document
processing 5311 document
processing 5312 document
processing 5313 document
processing 5314 document
processing 5315 document
processing 5316 document


processing 5931 document
processing 5932 document
processing 5933 document
processing 5934 document
processing 5935 document
processing 5936 document
processing 5937 document
processing 5938 document
processing 5939 document
processing 5940 document
processing 5941 document
processing 5942 document
processing 5943 document
processing 5944 document
processing 5945 document
processing 5946 document
processing 5947 document
processing 5948 document
processing 5949 document
processing 5950 document
processing 5951 document
processing 5952 document
processing 5953 document
processing 5954 document
processing 5955 document
processing 5956 document
processing 5957 document
processing 5958 document
processing 5959 document
processing 5960 document
processing 5961 document
processing 5962 document
processing 5963 document
processing 5964 document
processing 5965 document
processing 5966 document
processing 5967 document
processing 5968 document
processing 5969 document
processing 5970 document


processing 6585 document
processing 6586 document
processing 6587 document
processing 6588 document
processing 6589 document
processing 6590 document
processing 6591 document
processing 6592 document
processing 6593 document
processing 6594 document
processing 6595 document
processing 6596 document
processing 6597 document
processing 6598 document
processing 6599 document
processing 6600 document
processing 6601 document
processing 6602 document
processing 6603 document
processing 6604 document
processing 6605 document
processing 6606 document
processing 6607 document
processing 6608 document
processing 6609 document
processing 6610 document
processing 6611 document
processing 6612 document
processing 6613 document
processing 6614 document
processing 6615 document
processing 6616 document
processing 6617 document
processing 6618 document
processing 6619 document
processing 6620 document
processing 6621 document
processing 6622 document
processing 6623 document
processing 6624 document


processing 7239 document
processing 7240 document
processing 7241 document
processing 7242 document
processing 7243 document
processing 7244 document
processing 7245 document
processing 7246 document
processing 7247 document
processing 7248 document
processing 7249 document
processing 7250 document
processing 7251 document
processing 7252 document
processing 7253 document
processing 7254 document
processing 7255 document
processing 7256 document
processing 7257 document
processing 7258 document
processing 7259 document
processing 7260 document
processing 7261 document
processing 7262 document
processing 7263 document
processing 7264 document
processing 7265 document
processing 7266 document
processing 7267 document
processing 7268 document
processing 7269 document
processing 7270 document
processing 7271 document
processing 7272 document
processing 7273 document
processing 7274 document
processing 7275 document
processing 7276 document
processing 7277 document
processing 7278 document


processing 7893 document
processing 7894 document
processing 7895 document
processing 7896 document
processing 7897 document
processing 7898 document
processing 7899 document
processing 7900 document
processing 7901 document
processing 7902 document
processing 7903 document
processing 7904 document
processing 7905 document
processing 7906 document
processing 7907 document
processing 7908 document
processing 7909 document
processing 7910 document
processing 7911 document
processing 7912 document
processing 7913 document
processing 7914 document
processing 7915 document
processing 7916 document
processing 7917 document
processing 7918 document
processing 7919 document
processing 7920 document
processing 7921 document
processing 7922 document
processing 7923 document
processing 7924 document
processing 7925 document
processing 7926 document
processing 7927 document
processing 7928 document
processing 7929 document
processing 7930 document
processing 7931 document
processing 7932 document


processing 8547 document
processing 8548 document
processing 8549 document
processing 8550 document
processing 8551 document
processing 8552 document
processing 8553 document
processing 8554 document
processing 8555 document
processing 8556 document
processing 8557 document
processing 8558 document
processing 8559 document
processing 8560 document
processing 8561 document
processing 8562 document
processing 8563 document
processing 8564 document
processing 8565 document
processing 8566 document
processing 8567 document
processing 8568 document
processing 8569 document
processing 8570 document
processing 8571 document
processing 8572 document
processing 8573 document
processing 8574 document
processing 8575 document
processing 8576 document
processing 8577 document
processing 8578 document
processing 8579 document
processing 8580 document
processing 8581 document
processing 8582 document
processing 8583 document
processing 8584 document
processing 8585 document
processing 8586 document


processing 9201 document
processing 9202 document
processing 9203 document
processing 9204 document
processing 9205 document
processing 9206 document
processing 9207 document
processing 9208 document
processing 9209 document
processing 9210 document
processing 9211 document
processing 9212 document
processing 9213 document
processing 9214 document
processing 9215 document
processing 9216 document
processing 9217 document
processing 9218 document
processing 9219 document
processing 9220 document
processing 9221 document
processing 9222 document
processing 9223 document
processing 9224 document
processing 9225 document
processing 9226 document
processing 9227 document
processing 9228 document
processing 9229 document
processing 9230 document
processing 9231 document
processing 9232 document
processing 9233 document
processing 9234 document
processing 9235 document
processing 9236 document
processing 9237 document
processing 9238 document
processing 9239 document
processing 9240 document


processing 9855 document
processing 9856 document
processing 9857 document
processing 9858 document
processing 9859 document
processing 9860 document
processing 9861 document
processing 9862 document
processing 9863 document
processing 9864 document
processing 9865 document
processing 9866 document
processing 9867 document
processing 9868 document
processing 9869 document
processing 9870 document
processing 9871 document
processing 9872 document
processing 9873 document
processing 9874 document
processing 9875 document
processing 9876 document
processing 9877 document
processing 9878 document
processing 9879 document
processing 9880 document
processing 9881 document
processing 9882 document
processing 9883 document
processing 9884 document
processing 9885 document
processing 9886 document
processing 9887 document
processing 9888 document
processing 9889 document
processing 9890 document
processing 9891 document
processing 9892 document
processing 9893 document
processing 9894 document


processing 10490 document
processing 10491 document
processing 10492 document
processing 10493 document
processing 10494 document
processing 10495 document
processing 10496 document
processing 10497 document
processing 10498 document
processing 10499 document
processing 10500 document
processing 10501 document
processing 10502 document
processing 10503 document
processing 10504 document
processing 10505 document
processing 10506 document
processing 10507 document
processing 10508 document
processing 10509 document
processing 10510 document
processing 10511 document
processing 10512 document
processing 10513 document
processing 10514 document
processing 10515 document
processing 10516 document
processing 10517 document
processing 10518 document
processing 10519 document
processing 10520 document
processing 10521 document
processing 10522 document
processing 10523 document
processing 10524 document
processing 10525 document
processing 10526 document
processing 10527 document
processing 1

processing 11120 document
processing 11121 document
processing 11122 document
processing 11123 document
processing 11124 document
processing 11125 document
processing 11126 document
processing 11127 document
processing 11128 document
processing 11129 document
processing 11130 document
processing 11131 document
processing 11132 document
processing 11133 document
processing 11134 document
processing 11135 document
processing 11136 document
processing 11137 document
processing 11138 document
processing 11139 document
processing 11140 document
processing 11141 document
processing 11142 document
processing 11143 document
processing 11144 document
processing 11145 document
processing 11146 document
processing 11147 document
processing 11148 document
processing 11149 document
processing 11150 document
processing 11151 document
processing 11152 document
processing 11153 document
processing 11154 document
processing 11155 document
processing 11156 document
processing 11157 document
processing 1

processing 11750 document
processing 11751 document
processing 11752 document
processing 11753 document
processing 11754 document
processing 11755 document
processing 11756 document
processing 11757 document
processing 11758 document
processing 11759 document
processing 11760 document
processing 11761 document
processing 11762 document
processing 11763 document
processing 11764 document
processing 11765 document
processing 11766 document
processing 11767 document
processing 11768 document
processing 11769 document
processing 11770 document
processing 11771 document
processing 11772 document
processing 11773 document
processing 11774 document
processing 11775 document
processing 11776 document
processing 11777 document
processing 11778 document
processing 11779 document
processing 11780 document
processing 11781 document
processing 11782 document
processing 11783 document
processing 11784 document
processing 11785 document
processing 11786 document
processing 11787 document
processing 1

processing 12380 document
processing 12381 document
processing 12382 document
processing 12383 document
processing 12384 document
processing 12385 document
processing 12386 document
processing 12387 document
processing 12388 document
processing 12389 document
processing 12390 document
processing 12391 document
processing 12392 document
processing 12393 document
processing 12394 document
processing 12395 document
processing 12396 document
processing 12397 document
processing 12398 document
processing 12399 document
processing 12400 document
processing 12401 document
processing 12402 document
processing 12403 document
processing 12404 document
processing 12405 document
processing 12406 document
processing 12407 document
processing 12408 document
processing 12409 document
processing 12410 document
processing 12411 document
processing 12412 document
processing 12413 document
processing 12414 document
processing 12415 document
processing 12416 document
processing 12417 document
processing 1

processing 13010 document
processing 13011 document
processing 13012 document
processing 13013 document
processing 13014 document
processing 13015 document
processing 13016 document
processing 13017 document
processing 13018 document
processing 13019 document
processing 13020 document
processing 13021 document
processing 13022 document
processing 13023 document
processing 13024 document
processing 13025 document
processing 13026 document
processing 13027 document
processing 13028 document
processing 13029 document
processing 13030 document
processing 13031 document
processing 13032 document
processing 13033 document
processing 13034 document
processing 13035 document
processing 13036 document
processing 13037 document
processing 13038 document
processing 13039 document
processing 13040 document
processing 13041 document
processing 13042 document
processing 13043 document
processing 13044 document
processing 13045 document
processing 13046 document
processing 13047 document
processing 1

processing 13640 document
processing 13641 document
processing 13642 document
processing 13643 document
processing 13644 document
processing 13645 document
processing 13646 document
processing 13647 document
processing 13648 document
processing 13649 document
processing 13650 document
processing 13651 document
processing 13652 document
processing 13653 document
processing 13654 document
processing 13655 document
processing 13656 document
processing 13657 document
processing 13658 document
processing 13659 document
processing 13660 document
processing 13661 document
processing 13662 document
processing 13663 document
processing 13664 document
processing 13665 document
processing 13666 document
processing 13667 document
processing 13668 document
processing 13669 document
processing 13670 document
processing 13671 document
processing 13672 document
processing 13673 document
processing 13674 document
processing 13675 document
processing 13676 document
processing 13677 document
processing 1

processing 14270 document
processing 14271 document
processing 14272 document
processing 14273 document
processing 14274 document
processing 14275 document
processing 14276 document
processing 14277 document
processing 14278 document
processing 14279 document
processing 14280 document
processing 14281 document
processing 14282 document
processing 14283 document
processing 14284 document
processing 14285 document
processing 14286 document
processing 14287 document
processing 14288 document
processing 14289 document
processing 14290 document
processing 14291 document
processing 14292 document
processing 14293 document
processing 14294 document
processing 14295 document
processing 14296 document
processing 14297 document
processing 14298 document
processing 14299 document
processing 14300 document
processing 14301 document
processing 14302 document
processing 14303 document
processing 14304 document
processing 14305 document
processing 14306 document
processing 14307 document
processing 1

processing 14900 document
processing 14901 document
processing 14902 document
processing 14903 document
processing 14904 document
processing 14905 document
processing 14906 document
processing 14907 document
processing 14908 document
processing 14909 document
processing 14910 document
processing 14911 document
processing 14912 document
processing 14913 document
processing 14914 document
processing 14915 document
processing 14916 document
processing 14917 document
processing 14918 document
processing 14919 document
processing 14920 document
processing 14921 document
processing 14922 document
processing 14923 document
processing 14924 document
processing 14925 document
processing 14926 document
processing 14927 document
processing 14928 document
processing 14929 document
processing 14930 document
processing 14931 document
processing 14932 document
processing 14933 document
processing 14934 document
processing 14935 document
processing 14936 document
processing 14937 document
processing 1

In [70]:
data['text']

['xref cantaloupe srv cs cmu edu alt atheism alt atheism moderated news answers alt answers path cantaloupe srv cs cmu edu crabapple srv cs cmu edu bb andrew cmu edu news sei cmu edu cis ohio state edu magnus acs ohio state edu usenet ins cwru edu agate spool mu edu uunet pipex ibmpcug mantis mathew from mathew mathew mantis co uk newsgroups alt atheism alt atheism moderated news answers alt answers subject alt atheism faq atheist resources summary books addresses music anything related to atheism keywords faq atheism books music fiction addresses contacts message id mantis co uk date mon mar gmt expires thu apr gmt followup to alt atheism distribution world organization mantis consultants cambridge uk approved news answers request mit edu supersedes mantis co uk lines archive name atheism resources alt atheism archive name resources last modified december version atheist resources addresses of atheist organizations usa freedom from religion foundation darwin fish bumper stickers and a

In [71]:
# save the model to disk
import pickle
filename = 'data.pkl'
pickle.dump(data, open(filename, 'wb'))

In [3]:
import pickle
data = pickle.load(open('data.pkl', 'rb'))

In [4]:
pd.DataFrame(data).head()

Unnamed: 0,index,text,target,word_count,clean_text_stemmed,clean_text_lemmatized,text_stemmed,text_lemmatized
0,0,xref cantaloupe srv cs cmu edu alt atheism alt...,0,1772,"[cantaloup, srv, cs, cmu, edu, alt, atheism, a...","[cantaloupe, srv, cs, cmu, edu, alt, atheism, ...","[xref, cantaloup, srv, cs, cmu, edu, alt, athe...","[xref, cantaloupe, srv, cs, cmu, edu, alt, ath..."
1,1,xref cantaloupe srv cs cmu edu alt atheism alt...,0,5425,"[cantaloup, srv, cs, cmu, edu, alt, atheism, a...","[cantaloupe, srv, cs, cmu, edu, alt, atheism, ...","[xref, cantaloup, srv, cs, cmu, edu, alt, athe...","[xref, cantaloupe, srv, cs, cmu, edu, alt, ath..."
2,2,newsgroups alt atheism path cantaloupe srv cs ...,0,806,"[newsgroup, alt, atheism, path, cantaloup, srv...","[newsgroup, alt, atheism, path, cantaloupe, sr...","[newsgroup, alt, atheism, path, cantaloup, srv...","[newsgroup, alt, atheism, path, cantaloupe, sr..."
3,3,xref cantaloupe srv cs cmu edu alt atheism alt...,0,325,"[cantaloup, srv, cs, cmu, edu, alt, atheism, a...","[cantaloupe, srv, cs, cmu, edu, alt, atheism, ...","[xref, cantaloup, srv, cs, cmu, edu, alt, athe...","[xref, cantaloupe, srv, cs, cmu, edu, alt, ath..."
4,4,xref cantaloupe srv cs cmu edu alt atheism soc...,0,206,"[cantaloup, srv, cs, cmu, edu, alt, atheism, s...","[cantaloupe, srv, cs, cmu, edu, alt, atheism, ...","[xref, cantaloup, srv, cs, cmu, edu, alt, athe...","[xref, cantaloupe, srv, cs, cmu, edu, alt, ath..."


In [81]:
data['clean_text_lemmatized']

['cantaloupe srv cs cmu edu alt atheism alt atheism moderate news answer alt answer path cantaloupe srv cs cmu edu crabapple srv cs cmu edu bb andrew cmu edu news sei cmu edu cis ohio state edu magnus acs ohio state edu usenet ins cwru edu agate spool mu edu uunet pipex ibmpcug mantis mathew mathew mathew mantis co uk newsgroups alt atheism alt atheism moderate news answer alt answer subject alt atheism faq atheist resource summary book address music relate atheism keyword faq atheism book music fiction address contact message would mantis co uk date mon mar gmt expire thu apr gmt followup alt atheism distribution world organization mantis consultants cambridge uk approve news answer request mit edu supersedes mantis co uk line archive atheism resource alt atheism archive resource modify december version atheist resource address atheist organization usa freedom religion foundation darwin fish bumper sticker assort atheist paraphernalia available freedom religion foundation write ffrf p

In [73]:
print("Data Type: ",type(data['text']))
print("Data Type: ",type(data['clean_text_stemmed']))

print("Length of data: ",len(data['text']))
print("Length of data: ",len(data['clean_text_stemmed']))

Data Type:  <class 'list'>
Data Type:  <class 'list'>
Length of data:  14996
Length of data:  14996


In [74]:
print(data['text'][1])
print("************************************************************")

print("\n clean_text_stemmed \n")
print(data['clean_text_stemmed'][1])

print("************************************************************")
print("\n clean_text_lemmatized \n")
print(data['clean_text_lemmatized'][1])

xref cantaloupe srv cs cmu edu alt atheism alt atheism moderated news answers alt answers path cantaloupe srv cs cmu edu crabapple srv cs cmu edu fs ece cmu edu europa eng gtefsd com howland reston ans net agate netsys ibmpcug mantis mathew from mathew mathew mantis co uk newsgroups alt atheism alt atheism moderated news answers alt answers subject alt atheism faq introduction to atheism summary please read this file before posting to alt atheism keywords faq atheism message id mantis co uk date mon apr gmt expires thu may gmt followup to alt atheism distribution world organization mantis consultants cambridge uk approved news answers request mit edu supersedes mantis co uk lines archive name atheism introduction alt atheism archive name introduction last modified april version begin pgp signed message an introduction to atheism by mathew mathew mantis co uk this article attempts to provide a general introduction to atheism whilst i have tried to be as neutral as possible regarding con


************************************************************

 clean_text_stemmed 

['cantaloup', 'srv', 'cs', 'cmu', 'edu', 'alt', 'atheism', 'alt', 'atheism', 'moder', 'news', 'answer', 'alt', 'answer', 'path', 'cantaloup', 'srv', 'cs', 'cmu', 'edu', 'crabappl', 'srv', 'cs', 'cmu', 'edu', 'fs', 'ece', 'cmu', 'edu', 'europa', 'eng', 'gtefsd', 'com', 'howland', 'reston', 'an', 'net', 'agat', 'netsi', 'ibmpcug', 'manti', 'mathew', 'mathew', 'mathew', 'manti', 'co', 'uk', 'newsgroup', 'alt', 'atheism', 'alt', 'atheism', 'moder', 'news', 'answer', 'alt', 'answer', 'subject', 'alt', 'atheism', 'faq', 'introduct', 'atheism', 'summari', 'read', 'file', 'post', 'alt', 'atheism', 'keyword', 'faq', 'atheism', 'messag', 'd', 'manti', 'co', 'uk', 'date', 'mon', 'apr', 'gmt', 'expir', 'thu', 'gmt', 'followup', 'alt', 'atheism', 'distribut', 'world', 'organ', 'manti', 'consult', 'cambridg', 'uk', 'approv', 'news', 'answer', 'request', 'mit', 'edu', 'supersed', 'manti', 'co', 'uk', 'line', 'archiv'

************************************************************

 clean_text_lemmatized 

['cantaloupe', 'srv', 'cs', 'cmu', 'edu', 'alt', 'atheism', 'alt', 'atheism', 'moderate', 'news', 'answer', 'alt', 'answer', 'path', 'cantaloupe', 'srv', 'cs', 'cmu', 'edu', 'crabapple', 'srv', 'cs', 'cmu', 'edu', 'fs', 'ece', 'cmu', 'edu', 'europa', 'eng', 'gtefsd', 'com', 'howland', 'reston', 'ans', 'net', 'agate', 'netsys', 'ibmpcug', 'mantis', 'mathew', 'mathew', 'mathew', 'mantis', 'co', 'uk', 'newsgroups', 'alt', 'atheism', 'alt', 'atheism', 'moderate', 'news', 'answer', 'alt', 'answer', 'subject', 'alt', 'atheism', 'faq', 'introduction', 'atheism', 'summary', 'read', 'file', 'post', 'alt', 'atheism', 'keyword', 'faq', 'atheism', 'message', 'would', 'mantis', 'co', 'uk', 'date', 'mon', 'apr', 'gmt', 'expire', 'thu', 'gmt', 'followup', 'alt', 'atheism', 'distribution', 'world', 'organization', 'mantis', 'consultants', 'cambridge', 'uk', 'approve', 'news', 'answer', 'request', 'mit', 'edu', 'supe

# TF-IDF

In [5]:
## tfidf vectorizer needs sentence and not token. Hence we need to combine all the tokens back to form a string

data['clean_text_stemmed'] = [' '.join(text) for text in data['clean_text_stemmed']]
data['clean_text_lemmatized'] = [' '.join(text) for text in data['clean_text_lemmatized']]
data['clean_text_lemmatized'][0]

'cantaloupe srv cs cmu edu alt atheism alt atheism moderate news answer alt answer path cantaloupe srv cs cmu edu crabapple srv cs cmu edu bb andrew cmu edu news sei cmu edu cis ohio state edu magnus acs ohio state edu usenet ins cwru edu agate spool mu edu uunet pipex ibmpcug mantis mathew mathew mathew mantis co uk newsgroups alt atheism alt atheism moderate news answer alt answer subject alt atheism faq atheist resource summary book address music relate atheism keyword faq atheism book music fiction address contact message would mantis co uk date mon mar gmt expire thu apr gmt followup alt atheism distribution world organization mantis consultants cambridge uk approve news answer request mit edu supersedes mantis co uk line archive atheism resource alt atheism archive resource modify december version atheist resource address atheist organization usa freedom religion foundation darwin fish bumper sticker assort atheist paraphernalia available freedom religion foundation write ffrf p 

In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                 min_df=0.05,
                                 use_idf=True, ngram_range=(1,4))  
#got 65% val accuracy with max_df=0.9,min_df=0.2
#got 78% val accuracy with max_df=0.95,min_df=0.1
#got 94% val accuracy with max_df=0.95,min_df=0.05
#tfidf_vectorizer = TfidfVectorizer()    giving 1 lakh features
tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text_lemmatized'])

print(tfidf_matrix.shape)

(14996, 819)


In [82]:
tfidf_matrix1=pd.DataFrame(tfidf_matrix.toarray(), columns= tfidf_vectorizer.get_feature_names()) # Array mapping from feature integer indices to feature name

In [83]:
tfidf_matrix1.head()

Unnamed: 0,able,ac,accept,access,account,acs,act,action,actually,add,...,write article,write article apr,wrong,wupost,year,yes,zaphod,zaphod mps,zaphod mps ohio,zaphod mps ohio state
0,0.0,0.0,0.0,0.0,0.02843,0.028609,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.008284,0.0,0.018419,0.0,0.0,0.0,0.057018,0.028518,0.023097,0.0,...,0.0,0.0,0.024539,0.0,0.012101,0.00823,0.0,0.0,0.0,0.0
2,0.0,0.0,0.094151,0.0,0.049156,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.098188,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.074678,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
terms = tfidf_vectorizer.get_feature_names()
print(type(terms))
terms[:5]

<class 'list'>


['actually', 'alt', 'andrew', 'andrew cmu', 'andrew cmu edu']

In [84]:
#Duplicating the original text extracted before proceeeding with preprocessing steps

import copy
train_data = copy.deepcopy(tfidf_matrix1)

In [85]:
train_data['target']=data['target']

In [86]:
train_data.head()

Unnamed: 0,able,ac,accept,access,account,acs,act,action,actually,add,...,write article apr,wrong,wupost,year,yes,zaphod,zaphod mps,zaphod mps ohio,zaphod mps ohio state,target
0,0.0,0.0,0.0,0.0,0.02843,0.028609,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.008284,0.0,0.018419,0.0,0.0,0.0,0.057018,0.028518,0.023097,0.0,...,0.0,0.024539,0.0,0.012101,0.00823,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.094151,0.0,0.049156,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.098188,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.074678,0.0,0.0,0.0,0.0,0.0,0.0,0


# Modelling 

In [87]:
train_data_new=train_data.sample(frac=0.8,random_state=200) #random state is a seed value
val_data=train_data.drop(train_data_new.index)

In [88]:
#Performing train val split on the data
X_train, y_train = train_data_new.loc[:,train_data_new.columns!='target'], train_data_new.loc[:,'target']

X_val, y_val = val_data.loc[:,val_data.columns!='target'], val_data.loc[:,'target']

# Ridge

In [22]:
from sklearn.linear_model import RidgeClassifierCV

In [23]:
#### TYPE
ridge_model=RidgeClassifierCV(fit_intercept=True, alphas=[
                       0.0125, 0.025, 0.05,.1, .125, .25, .5, 1., 2., 4.,10,100])

In [24]:
ridge_model.fit(X_train,y_train)

RidgeClassifierCV(alphas=array([1.25e-02, 2.50e-02, 5.00e-02, 1.00e-01, 1.25e-01, 2.50e-01,
       5.00e-01, 1.00e+00, 2.00e+00, 4.00e+00, 1.00e+01, 1.00e+02]),
                  class_weight=None, cv=None, fit_intercept=True,
                  normalize=False, scoring=None, store_cv_values=False)

In [26]:
train_pred=ridge_model.predict(X_train)
val_pred=ridge_model.predict(X_val)


In [28]:
val_pred.shape

(2999,)

In [29]:
train_pred.shape
train_pred

array([15, 19, 14, ..., 11,  5, 14], dtype=int64)

In [30]:
y_val.head()

2     0
3     0
5     0
17    0
18    0
Name: target, dtype: int64

In [31]:
from sklearn.metrics import classification_report

In [32]:
print(classification_report(y_train,train_pred))
print(classification_report(y_val,val_pred))

              precision    recall  f1-score   support

           0       0.58      0.65      0.62       730
           1       0.59      0.19      0.29       468
           2       0.68      0.76      0.72       570
           3       0.50      0.36      0.42       549
           4       0.52      0.42      0.47       530
           5       0.55      0.58      0.56       526
           6       0.82      0.68      0.74       337
           7       0.43      0.50      0.46       572
           8       0.51      0.50      0.50       610
           9       0.52      0.43      0.47       539
          10       0.53      0.56      0.54       589
          11       0.65      0.58      0.61       697
          12       0.61      0.29      0.40       564
          13       0.60      0.35      0.44       599
          14       0.49      0.82      0.61       627
          15       0.58      1.00      0.73       638
          16       0.58      0.49      0.53       707
          17       0.57    

# Lasso

In [33]:
from sklearn.linear_model import RidgeClassifierCV

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
lasso=LogisticRegression(penalty='l1', solver='liblinear')

In [38]:
lasso.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
train_pred=lasso.predict(X_train)
val_pred=lasso.predict(X_val)

In [93]:
from sklearn.metrics import classification_report

In [41]:
print(classification_report(y_train,train_pred))
print(classification_report(y_val,val_pred))

              precision    recall  f1-score   support

           0       0.69      0.69      0.69       730
           1       0.52      0.37      0.44       468
           2       0.75      0.80      0.78       570
           3       0.48      0.43      0.46       549
           4       0.49      0.55      0.52       530
           5       0.60      0.64      0.62       526
           6       0.85      0.89      0.87       337
           7       0.54      0.42      0.47       572
           8       0.53      0.66      0.59       610
           9       0.58      0.42      0.49       539
          10       0.55      0.66      0.60       589
          11       0.65      0.68      0.67       697
          12       0.57      0.46      0.51       564
          13       0.61      0.54      0.57       599
          14       0.60      0.77      0.67       627
          15       0.95      0.99      0.97       638
          16       0.63      0.62      0.62       707
          17       0.70    

# Naive Bayes

In [43]:
from sklearn import model_selection, naive_bayes, svm

In [44]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train,y_train)

# predict the labels on train dataset
pred_train = Naive.predict(X_train)

# predict the labels on validation dataset
pred_test = Naive.predict(X_val)

In [46]:
print(classification_report(y_train,pred_train))
print(classification_report(y_val,pred_test))

              precision    recall  f1-score   support

           0       0.37      0.51      0.43       730
           1       0.70      0.09      0.15       468
           2       0.59      0.40      0.48       570
           3       0.16      0.35      0.22       549
           4       0.48      0.10      0.17       530
           5       0.40      0.38      0.39       526
           6       1.00      0.15      0.27       337
           7       0.45      0.17      0.25       572
           8       0.43      0.36      0.39       610
           9       0.28      0.46      0.35       539
          10       0.61      0.13      0.22       589
          11       0.43      0.48      0.45       697
          12       0.37      0.17      0.23       564
          13       0.42      0.28      0.33       599
          14       0.58      0.34      0.43       627
          15       0.46      0.74      0.56       638
          16       0.16      0.66      0.26       707
          17       0.62    

# SVM

In [89]:
from sklearn.svm import SVC

In [90]:
svc_model=SVC()
#svc_line.set_params(classifier__kernel='linear',classifier__C=1,classifier__random_state=123)



In [91]:
svc_model.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [92]:
train_pred=svc_model.predict(X_train)
val_pred=svc_model.predict(X_val)

In [79]:
from sklearn.metrics import classification_report

In [93]:
print(classification_report(y_train,train_pred))
print(classification_report(y_val,val_pred))

              precision    recall  f1-score   support

           0       0.98      0.81      0.88       730
           1       1.00      0.95      0.97       468
           2       0.98      1.00      0.99       570
           3       0.97      0.99      0.98       549
           4       0.99      0.99      0.99       530
           5       0.98      0.99      0.99       526
           6       0.99      0.97      0.98       337
           7       0.99      0.99      0.99       572
           8       1.00      1.00      1.00       610
           9       1.00      1.00      1.00       539
          10       1.00      1.00      1.00       589
          11       1.00      1.00      1.00       697
          12       0.99      0.99      0.99       564
          13       0.99      1.00      1.00       599
          14       1.00      1.00      1.00       627
          15       1.00      1.00      1.00       638
          16       0.94      0.98      0.96       707
          17       0.95    

# Random search CV

In [94]:
from sklearn.pipeline import Pipeline

In [95]:
clf_svc=Pipeline(steps=[('classifier',SVC())])

In [96]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [97]:

svc_param_random={'classifier__C':[0.001,0.01,0.1,1,10,100],'classifier__gamma':[0,0.0001,0.01,0.1,1,10,100],
               "classifier__kernel":['linear','rbf','poly']}

In [98]:
svc_random=RandomizedSearchCV(clf_svc,param_distributions=svc_param_random,cv=5)

In [99]:
%%time
svc_random.fit(X_train,y_train)

Wall time: 3h 13min 10s


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('classifier',
                                              SVC(C=1.0, break_ties=False,
                                                  cache_size=200,
                                                  class_weight=None, coef0=0.0,
                                                  decision_function_shape='ovr',
                                                  degree=3, gamma='scale',
                                                  kernel='rbf', max_iter=-1,
                                                  probability=False,
                                                  random_state=None,
                                                  shrinking=True, tol=0.001,
                                                  verbose=False))],
                                      verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=N

In [100]:
# save the model to disk
import pickle
filename = 'svc_random.pkl'
pickle.dump(svc_random, open(filename, 'wb'))

In [101]:
train_pred=svc_random.predict(X_train)
val_pred=svc_random.predict(X_val)

In [102]:
print(classification_report(y_train,train_pred))
print(classification_report(y_val,val_pred))

              precision    recall  f1-score   support

           0       0.93      0.90      0.91       730
           1       1.00      0.98      0.99       468
           2       0.99      1.00      1.00       570
           3       1.00      0.99      0.99       549
           4       0.99      1.00      1.00       530
           5       0.99      1.00      1.00       526
           6       0.99      0.98      0.99       337
           7       1.00      1.00      1.00       572
           8       1.00      1.00      1.00       610
           9       1.00      1.00      1.00       539
          10       1.00      1.00      1.00       589
          11       1.00      1.00      1.00       697
          12       0.99      1.00      0.99       564
          13       1.00      1.00      1.00       599
          14       1.00      1.00      1.00       627
          15       1.00      1.00      1.00       638
          16       0.96      0.97      0.96       707
          17       0.95    