**Developing Text Classifier (Removing Correlated Features)**

In [None]:
# Import the libraries used by this notebook and fetch the NLTK data it needs.
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import scipy
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
# NOTE(review): wildcard import; it is not obvious from this cell which names
# it adds to (or overrides in) the notebook namespace.
from pylab import *
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Fetch the NLTK data packages named below (stop-word corpus, punkt tokenizer
# data, WordNet corpus).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import seaborn as sns


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


**Build the stop-word list and create the lemmatizer**

In [None]:
# Stop-word list: NLTK's English stop words followed by every single
# printable character from string.printable.
stop_words = [*stopwords.words('english'), *string.printable]
# Lemmatizer instance used by the text-cleaning step.
lemmatizer = WordNetLemmatizer()


**specify the categories of the article**

In [None]:
#specify the categories of the article
# Newsgroup categories passed to fetch_20newsgroups.
categories = [
    'misc.forsale',
    'sci.electronics',
    'talk.religion.misc',
]


**Load the news data and the corresponding categories into a pandas DataFrame**

In [None]:
#News data and corresponding categories in a Pandas dataframe
# Fetch the training subset of the selected newsgroups (shuffled with a fixed
# seed) and pair each raw post with its integer category label.
news_data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
    download_if_missing=True,
)
news_data_df = pd.DataFrame(
    {
        'text': news_data['data'],
        'category': news_data.target,
    }
)
news_data_df.head()


Unnamed: 0,text,category
0,From: Steve@Busop.cit.wayne.edu (Steve Teolis)...,0
1,From: jks2x@holmes.acc.Virginia.EDU (Jason K. ...,0
2,From: wayne@uva386.schools.virginia.edu (Tony ...,1
3,From: lihan@ccwf.cc.utexas.edu (Bruce G. Bostw...,1
4,From: myoakam@cis.ohio-state.edu (micah r yoak...,0


**Clean the text: strip punctuation, tokenize, remove stop words, and lemmatize**

In [None]:
#cleaning the text such as tokenization, lemmatization etc
def clean_text(text):
    """Return ``text`` as a space-separated string of cleaned tokens.

    Steps: remove punctuation and underscores, tokenize, lower-case each
    token, drop tokens found in ``stop_words`` and lemmatize the rest.

    Parameters
    ----------
    text : object
        Raw document; it is converted with ``str()`` before processing.

    Returns
    -------
    str
        The surviving lemmatized tokens separated by single spaces
        (an empty string when no token survives).
    """
    text_without_punctuation = re.sub(r'([^\s\w]|_)+', '', str(text))
    lowered_tokens = (token.lower() for token in word_tokenize(text_without_punctuation))
    # Join with a space. Joining with '' glues the whole document into one
    # giant token, so the TF-IDF vectorizer would see one "word" per post.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in lowered_tokens
        if token not in stop_words
    )


news_data_df['cleaned_text'] = news_data_df['text'].apply(clean_text)


**create a TF-IDF matrix**

In [None]:
#create a TF-IDF matrix
# Vectorize the cleaned text with TF-IDF (max_features=20) and wrap the dense
# matrix in a DataFrame whose columns are the vocabulary terms in sorted order.
tfidf_model = TfidfVectorizer(max_features=20)
tfidf_df = pd.DataFrame(
    # Evaluated first, so the vocabulary exists when the column labels are read.
    tfidf_model.fit_transform(news_data_df['cleaned_text']).todense(),
    columns=sorted(tfidf_model.vocabulary_),
)
tfidf_df.head()


Unnamed: 0,02106raveludeledusamuelrosssubjectbooksalecheapnntppostinghostraveludeleduorganizationuniversitydelawaredistributionusaline28someonepleasebuybookaskingmuchmakeofferprobablytakecalculusanalyticgeometryauthursimoncopyrightdate1982avgconditionstillreadablewritinggoodsoftwarefortrangrahamsmithholthandbookkirsznermandellcopyright1986720pagewritingguidealgebratrigonometryproblemsolvingapproach3rdeditionflemmingvarberggoodconditiongeneralchemistryprinciplemodernapplicationpetruccifourtheditionbigbookgoodconditionsolutionmanualchemistrybookpaperbackstudyguidechemistrybookpaperbacksendofferviaemail02106chopinudeledusam02106chopinudeledu,er1eridanchuvashiasuyarabayevaalbinanikolayevnasubjectsalehighgualityconiferoilrussia450ton400tonreplytoer1eridanchuvashiasudistributioneunetorganizationfirmeridanlineinguiryaddresser1eridanchuvashiasu,jlleeacsubuffaloedujohnnyleesubjectmovingsalesummarymovingsaleorganizationubline44nntppostinghostlictoracsubuffaloedureducedpricelistthingforsalebehalfbrotherwhomovingmovedalreadyofferblackdeckerdusterplusportablehandvaccumpurchased3212sr1000dualcassetteportableplayeramfm5bandgraphicequalizerhighspeeddubingduotapetapedeckseemslosttreblesoundbetfixablepurchased80253monoluxzoommicroscope1200xmagnificationmadejapanincludescaseaccessorypurchased50204sunbeam1400hairdryerdryerputheadunderintoknowoneseesalondontaskbropurchased60245everylastspeedbagleatherbrandnewneverused106osterizerpuslematicblender10speedcookbookyearold10purchased508binoluxbinoculars7x35extrawideangle525ft1000ydscasenew209proctorsilexspraysteamdryironnew10questioncontactthruemailreplyexpeditouslyalwaysshincludedpleaseconsiderlastlyimreasonableveryreasonablethanksjohn,paynecrldeccomandrewpaynesubjectwantedtcm3105chipsmallquantityorganizationdeccambridgeresearchlabline16anyoneknowsourcetcm3105modemchipusedbaycompmpmodemideallysomethinggearedtowardhobbyistsmallquantitymailorderetcyearwevebuyingdistributormarshallhundredpmpkitorderdroppedpointlongeraffordofferservicedistribut
orivecheckedcrazyminimumorder100idlikefindsourcestillinterestedbuildingpmpkitsuggestionandrewpaynedeccambridgeresearchlab,pchangicsunysbedupongchangsubjectcomputerpartscamcordernntppostinghostlibws4icsunysbeduorganizationstateuniversitynewyorkstonybrookline21articlec4z2cf2n6cscnscomzardoxcscnscomrandieonealwrites52512mb35144drivesnewplannedbuildingmachineranfund3000drivecarlbranddrivedocumentationillgo4000shippingdocumentationpay40floppydrive40newalsoneeddocumentationfloppyinstallationthingidiotproofadvicecommonpchangicsunysbedusensestateuniversitynewyorkstonybrookengineer,pchangicsunysbedupongchangsubjectmicrosoftdo60upgradesalenntppostinghostlibws4icsunysbeduorganizationstateuniversitynewyorkstonybrookline36articlehatton733706165cglucsfeduhattonsocratesucsfedutomhattonwritesadn6285ritvaxiscriteduwritesarticle1pctnfinn6dpeveusceduyuanchieeveusceduyuanchiehhsuwritesmdo60upgradesalebestoffer45openedunregisteredanyonecareenlightenuwhetherdos60worthupgradinggoodcompressionturnedonoffnicenastyfeatureaccordingreportdontdoyetdontutilityqemmstackerpctoolsnortondos6mayworthpeopledos5sortutilitydos6doesntoffermuchyoudneverknowusualhypemarketingablecreatehoweverinstalleddolastweeknothingtroubleafterwardswindowappshittingprotectionfaultkidpinataseemlikeworkndosnortondo70probsincludesetpcplusdpcplusprocommpluslongerworkmanylittleutilitywrittendolongerworkeithermostlysharewareuninstalleddodoworkfineappsdoablerundowontcommonpchangicsunysbedusensestateuniversitynewyorkstonybrookengineer,pchangicsunysbedupongchangsubjectsalec128systemwprinter130obonntppostinghostlibws4icsunysbeduorganizationstateuniversitynewyorkstonybrookline20commodore128epsonhomewriter10pinprinter1571ddiskdrivejoystickmouselotsasoftwaregameappsrapidfirejoystickadapteryearold130obocommonpchangicsunysbedusensestateuniversitynewyorkstonybrookengineer,pchangicsunysbedupongchangsubjectsalequicken30pckeywordsaccountingcheckingquickennntppostinghostlibws4icsunysbeduorganizationstateuniversitynewyorkstonybrookline18arti
cle1pma84hpksuntanecusfedubonehamsunburnecusfedukevinbonehamchwritesarticle1pgvp1inn5ejphaktuscedukhohusceduolivermuotowritessalequicken30pcversionallowsbalanceassumegetnewreleaseearlierlastsawversion20latestprobablyreferringdoversiondoversionlikeversionthinkwindowversioncamerecentlylikeversionsomethingcommonpchangicsunysbedusensestateuniversitynewyorkstonybrookengineer,pchangicsunysbedupongchangsubjectvideotitlemakersalenntppostinghostlibws4icsunysbeduorganizationstateuniversitynewyorkstonybrookline37videonicstitlemakersystemmonthesoldusedincludescharactergeneratormodeltm1rez720x4808000availablechar12fontstereosoundmilliondifferentcoloravailable20specialeffectfullkeyboarddesignmaildetailthumsvideoeditormodeltu1markdiffernentsectiontapethumbthumbskipbadparttapebuiltinvideoenhancercopyingtapeviewingautomaticfaderswitchableusecombinationunitunitexcellentconditioncomedocunregisteredwarrantycardjrmusicworldsell399229respectivelyasking500unitemailpchangicsunysbeduinterestedcommonpchangicsunysbedusensestateuniversitynewyorkstonybrookengineer,pcwoodastroocistempleedupaulwoodsubjectforsalegenesisgameorganizationtempleuniversityline20nntppostinghostastroocistempleeduxnewsreadertinversion11pl8kelvinwilliamskwilllunatixuucpwrotegameforsaletradesonichedgehogiitwocopymanualcase25brandnewhellointerestedsoniciisendaddressgetmailbouncedbackhostunknowerrorpleasereplykwilllunatixuucpsubjectgamepaulwoodpcwoodastroocistempleedu,peavlerfingalplkafmilninjagourmetsubjectscarlethorsebabylondaemonorganizationuniversitynewmexicoalbuquerquenmline20distributionworldnntppostinghostfingalplkafmilkeywordsdeadhorsehorsebabylonarticle1qilgninnrkolynxunmedublowfishleounmeduronwriteseasy667neighborbeast666beastlifeendculdesacnoticeddeadhorsekeywordslinefamousscarlethorsebabylonbeastthats666illuminattiridewonderfulmediaevalmanuscriptfearannouncementoldgirldeadmayprematurebet20place6thracedownlastsundayslidbadfifthdeadcomatoselikegodwaysupposeninjagourmetfightfoodjimpeavleropinionexistpeavlerplkafmilca
lledalbuquerquenmopinion,pegasusaaauoregonedupegasussubjectmerlinmithramagickorganizationpolyhedrongroupline21distributionworldnntppostinghostfp1dialin1uoregoneduarticlejoshua93apr19183833baileycpacwashingtonedujoshuacpacwashingtonedujoshuagellerwrotearticlepegasus150493132018fp1dialin4uoregonedupegasusaaauoregonedulaurieewbrandtwriteslewbletaddpercentage1315orphaicdocteriansbroughtlewbtogrouppaulsaulhighrankinginitiatelewbdevelopmentorphaicmysteryseejaneharrisonprolegomenonlewbstudygreekreligioncambridgepress1922easlydrawlewbyourconclusionjoshperhapsquotebitargumentlovetobutmustbitlatercopyharrisonpackedlastchapterbestremberdealorphicmysteryviewwomanthoughcomesaystronglyimplyedchristianviewdrawnheavlyorphicmajorculttimepegasus,pepkediracscrifsueduericpepkesubjectsocietalbasismoralityorganizationfloridastateuniversitydontspeakline13articlemerlyn735422443digibdmerlyndigibddigibdcommerlynleroywritesprayerschoollegalillegaltellingchildpraypraymanypeopleconfusecanttellkidoughtpraykidarentallowedpraypossiblykidwithouttoldperhapsthinkgovernmentalbodybusinesssuppressingbeliefelsetheyresortsatanichumanistconspiracyoldyoureuyoureubitemp,peterminsaneapanaorgaupetertryndochsubjectdmmadviceneededline28allmartinemdedmmadviceneededmefrommce5921bcsteccaboeingcommartinemdemeorganizationboeingmeicurrelymarketdmmrecentlysawaddmeforkelvin94199anyoneonemeotherbrandextremelyhappymesmallnamebrandcompareflukebeckmanbrandmeiwillingspend200onemeanyhelpgreatlyappreciatedpleaseemailmemartingoinguseonecountegaviationspacescuttleetcsuggestgobuyflukeneverseenbeckmanhowevereveryusebuycheapiemetexmadenameseendmmbrandnameboughtyragoaus12500convertuseedefinetlycheapiefarprovedaccuratetakenmoderateabusemanyfeaturecapfreqtransistorchecketchappywoulddefinetlybuyflukenamehopehelpcheerpeter,peterminsaneapanaorgaupetertryndochsubjectswrmetercbradioline28allthedevilreincarnateswrmetercbradiotdfromssaveolecdaccomdevilreincarnatetdorganizationcdacwatdwhattdisgoodchoicecb1418wavetdreadinstallationinstruction1
4waveantennatdandsuggesteduseswrtunechannel12tdandchannel32minimumreadingquestionchanneltd1232bestantennaoneletwaveprobablybestexplanationrestmakesenseonewavecancellbtwbeastiebest12waveantennafollowed1418etcswringactuallytrimantennacorrectlengthspecificwavelengthtransmittingsincewavelengthvarieschanneluserecommendedswrusingmiddlechannelgoinguseanywaybeginningcbsnewantennaswrednowdaysmanufacturetrimantennaalmostspottheremuchpointswringmayfanaticwhishanywaycheerpeter,peterminsaneapanaorgaupetertryndochsubjecttelephonehookoffhokline17allmichaelcovingtontelephonehookoffhokmcfrommcovingtaisun3aiugaedumichaelcovingtonmcorganizationaiprogramuniversitygeorgiaathensmcayethererubdrawenoughcurrentlightledthmcequipmentphonecompanythinkyouvegonehookmcinonhookstateyouresupposeddrawcurrentokletcalculationgoingaustralianstandardpresumemightsimilarcountryletinputphoneus600ohmloop48vline80mastandardleddrain20maactualloopcurrentrequiredhookindicationknowcheerpeter,peterminsaneapanaorgaupetertryndochsubjectwhatsexactlyflourline32allmartinmccormickwhatsexactlyflourmmfrommartindatacommuccokstateedumartinmccormickmmorganizationoklahomastateuniversitystillwaterokmmsortlamplittleglassbulbfoundinsidestartermmitsortremindsne2neonlampstarterappearopenmmwithohmmeterlittlelampeitherneoncapacitmminseriesmmmmseenthinglifeneverreadreallygoodmmdescriptionhappeninginsidelittleknowbimetallicstripcasetwostripdifferentmetalbondedtogetherheatedbendonesidecheckblinkerglobechristmastreelightturnpowercausebulbworklikeneonheatingshortingthusprovidinglooppowerheatermaintubetubefireinsufficientcurrentrunstarterkeepheatbimetalicstripstraightensocbtwthoughtnothingsmallneononedayneonsisterdigitalflipmetalsquaretypeclockbrokeflimsyleadreplacedonestarterwellpoweringmadebitmessclockcheerpeter,peterminsaneapanaorgaupetertryndochsubjectwhatsgoodicrs23line13alltallcoolonewhatsgoodicrs23tcfromrky57514uxacsouiucedutallcoolonetcorganizationuniversityillinoisurbanatcimlookingicconvertrs232voltagelevelttlvotclevelssomething
relativelyinexpensivewouldniceanyonetcasuggestionthankstrymaximmax232cpepindilconvert5v12v232commmscleverlittlegizmopeter,petertoddchanpc1oandrewcmuedusubjectklipschfortespkrssaleorganizationfifthyrseniorelectricalcomputerengineeringcarnegiemellonpittsburghpaline14nntppostinghostpo3andrewcmueduitemklipschfortespeakerconditionmintagemontholdprice1000pairretail1400pairspeakerperfectconditionusedaudiophilesystemfloorstandingcomeoriginalpackagaingliteraturealsostillwarrantyinterestedquestionpleasefeelfreeemailpc1oandrewcmueducallhomethanksjon4128826425,petertoddchanpc1oandrewcmuedusubjectreducedsonycdplayersaleorganizationfifthyrseniorelectricalcomputerengineeringcarnegiemellonpittsburghpaline21nntppostinghostpo5andrewcmueduitemsonyescdpx229conditionexcellentageyearoldprice300includestoslinkitemsonycdp770conditionexcellentage25yearoldprice250everythingcomeoriginalpackagingmanualitemplayedaudiophilesystemexcellentshapeinterestedneedadditionalinformationpleaseemailpc1oandrewcmueducallhomethanksjon4128826425pyessale
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**calculate the correlation matrix for TF-IDF matrix**

In [None]:
#calculate the correlation matrix for TF-IDF matrix
# Pairwise Pearson correlation between the TF-IDF term columns
# ('pearson' is the default method, spelled out here for clarity).
correlation_matrix = tfidf_df.corr(method='pearson')
correlation_matrix.head(5)



**plot the correlation matrix using seaborn’s heatmap function**

In [None]:
#plot the correlation matrix using seaborn’s heatmap function
# Draw the correlation matrix as an annotated heatmap on a 20x20 inch figure,
# targeting the freshly created axes explicitly.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=True, ax=ax)




**To identify pairs of highly correlated terms, create an upper triangular matrix from the correlation matrix.**

In [None]:
#identify the pair of terms with high correlation, we created an upper triangular matrix from the correlation matrix.
# List the pairs of distinct terms whose correlation exceeds the threshold.
# Only the upper triangle of the correlation matrix is kept, so each pair is
# reported once rather than twice.
CORRELATION_THRESHOLD = .7

# Use the built-in bool: the np.bool alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24, where it raises AttributeError.
upper_triangle_mask = np.triu(np.ones(correlation_matrix.shape)).astype(bool)
correlation_matrix_ut = correlation_matrix.where(upper_triangle_mask)

# Long format: one row per (word1, word2) cell that survived the mask.
correlation_matrix_melted = correlation_matrix_ut.stack().reset_index()
correlation_matrix_melted.columns = ['word1', 'word2', 'correlation']

# The mask keeps the diagonal, so self-pairs are filtered out explicitly.
is_distinct_pair = correlation_matrix_melted['word1'] != correlation_matrix_melted['word2']
is_highly_correlated = correlation_matrix_melted['correlation'] > CORRELATION_THRESHOLD
correlation_matrix_melted[is_distinct_pair & is_highly_correlated]


Unnamed: 0,word1,word2,correlation


**Remove the terms whose correlation coefficient is > 0.7 and create a separate DataFrame with the remaining terms.**

In [None]:
#we will remove terms for which the coefficient of correlation is >0.7 and create a separate Dataframe with the remaining terms.
# Drop one term from every pair whose correlation is above the threshold and
# keep the remaining terms in a separate DataFrame.
# The terms to drop are derived from the correlation matrix instead of being
# hard-coded: dropping a fixed label such as '02' raises KeyError whenever
# that label is not one of the TF-IDF columns.
CORRELATION_THRESHOLD = 0.7

# k=1 also masks the diagonal, so a term is never compared with itself.
strict_upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
strict_upper_correlations = correlation_matrix.where(strict_upper_mask)

# A column is flagged when it correlates strongly with any earlier column,
# i.e. the second term of each correlated pair is the one removed.
has_correlated_partner = (strict_upper_correlations > CORRELATION_THRESHOLD).any(axis=0)
correlated_words = list(strict_upper_correlations.columns[has_correlated_partner])

# With no correlated pair the list is empty and every column is kept.
tfidf_tf_without_correlated_word = tfidf_df.drop(columns=correlated_words)
tfidf_tf_without_correlated_word.head()


**Developing Text Classifier (Dimensionality Reduction using PCA)**

In [None]:
# Import the libraries used by this notebook and fetch the NLTK data it needs.
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import scipy
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
# NOTE(review): wildcard import; it is not obvious from this cell which names
# it adds to (or overrides in) the notebook namespace.
from pylab import *
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Fetch the NLTK data packages named below (stop-word corpus, punkt tokenizer
# data, WordNet corpus).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import seaborn as sns


**Build the stop-word list and create the lemmatizer**

In [None]:
# Stop-word list: NLTK's English stop words followed by every single
# printable character from string.printable.
stop_words = [*stopwords.words('english'), *string.printable]
# Lemmatizer instance used by the text-cleaning step.
lemmatizer = WordNetLemmatizer()
# Show the resulting list as the cell output.
stop_words

**specify the categories of the article**

In [None]:
#specify the categories of the article
# Newsgroup categories passed to fetch_20newsgroups.
categories = [
    'misc.forsale',
    'sci.electronics',
    'talk.religion.misc',
]


**Load the news data and the corresponding categories into a pandas DataFrame**

In [None]:
#News data and corresponding categories in a Pandas dataframe
# Fetch the training subset of the selected newsgroups (shuffled with a fixed
# seed) and pair each raw post with its category label.
news_data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
    download_if_missing=True,
)
news_data_df = pd.DataFrame(
    {
        'text': news_data['data'],
        'category': news_data.target,
    }
)
news_data_df.head()


**Clean the text: strip punctuation, tokenize, remove stop words, and lemmatize**

In [None]:
#cleaning the text such as tokenization, lemmatization etc
def clean_text(text):
    """Return ``text`` as a space-separated string of cleaned tokens.

    Steps: remove punctuation and underscores, tokenize, lower-case each
    token, drop tokens found in ``stop_words`` and lemmatize the rest.

    Parameters
    ----------
    text : object
        Raw document; it is converted with ``str()`` before processing.

    Returns
    -------
    str
        The surviving lemmatized tokens separated by single spaces
        (an empty string when no token survives).
    """
    text_without_punctuation = re.sub(r'([^\s\w]|_)+', '', str(text))
    lowered_tokens = (token.lower() for token in word_tokenize(text_without_punctuation))
    # Join with a space. Joining with '' glues the whole document into one
    # giant token, so the TF-IDF vectorizer would see one "word" per post.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in lowered_tokens
        if token not in stop_words
    )


news_data_df['cleaned_text'] = news_data_df['text'].apply(clean_text)
news_data_df['cleaned_text']


**create a TF-IDF matrix**

In [None]:
#create a TF-IDF matrix
# Vectorize the cleaned text with TF-IDF (max_features=20) and wrap the dense
# matrix in a DataFrame whose columns are the vocabulary terms in sorted order.
tfidf_model = TfidfVectorizer(max_features=20)
tfidf_df = pd.DataFrame(
    # Evaluated first, so the vocabulary exists when the column labels are read.
    tfidf_model.fit_transform(news_data_df['cleaned_text']).todense(),
    columns=sorted(tfidf_model.vocabulary_),
)
tfidf_df.head()


**Use sklearn’s PCA class to extract two principal components from the TF-IDF data.**

In [None]:
#sklearn’s PCA function to extract two principal components from the earlier data.
# Fit a two-component PCA on the TF-IDF features, then project the same
# features onto those two components.
from sklearn.decomposition import PCA
pca = PCA(n_components=2).fit(tfidf_df)
reduced_tfidf = pca.transform(tfidf_df)
reduced_tfidf


**create a scatter plot along these principal components and represent each category with a separate color.**

In [None]:
#create a scatter plot along these principal components and represent each category with a separate color.
# Scatter the documents along the two principal components, colouring each
# point by its category label.
fig, ax = plt.subplots()
ax.scatter(
    reduced_tfidf[:, 0],
    reduced_tfidf[:, 1],
    c=news_data_df['category'],
    cmap='viridis',
)
ax.set_xlabel('dimension_1')
ax.set_ylabel('dimension_2')
ax.set_title('Representation of NEWS documents in 2D')
plt.show()


**Saving and Loading Models**

In [None]:
#Saving and Loading Models

#import necessary packages

import pickle
from joblib import dump, load
from sklearn.feature_extraction.text import TfidfVectorizer


**defining a corpus consisting of four sentences**

In [None]:
#defining a corpus consisting of four sentences
# Four-sentence toy corpus used to fit the TF-IDF model below.
corpus = [
    'Data Science is an overlap between Arts and Science',
    'Generally, Arts graduates are right brained and Science graduates are left-brained',
    'Excelling in both Arts and Science at a time becomes difficult',
    'Natural Language Processing is a part of Data Science',
]


**fit a tf-idf model**

In [None]:
#fit a tf-idf model
# Fit a TF-IDF vectorizer on the corpus and print the dense document-term
# matrix it produces.
tfidf_model = TfidfVectorizer()
tfidf_vectors = tfidf_model.fit_transform(corpus)
print(tfidf_vectors.todense())


**save this tf-idf model on disk using joblib.**

In [None]:
#save this tf-idf model on disk using joblib.
# Persist the fitted vectorizer with joblib; the call's return value is the
# cell output.
dump(value=tfidf_model, filename='tfidf_model.joblib')


['tfidf_model.joblib']

**load this model from the disk to the memory and use it**

In [None]:
#load this model from the disk to the memory and use it
# Load the fitted vectorizer back from disk and reuse it.
# transform() applies the vocabulary and IDF weights restored from the file;
# fit_transform() would re-learn them from `corpus` and throw away exactly
# what was persisted, defeating the purpose of saving the model.
tfidf_model_loaded = load('tfidf_model.joblib')
print(tfidf_model_loaded.transform(corpus).todense())


**save this tf-idf model on disk using pickle**

In [None]:
#save this tf-idf model on disk using pickle
# Persist the fitted vectorizer with pickle.
# The context manager closes (and therefore flushes) the file even if
# pickling fails; a bare open() inside the call leaves the handle dangling.
with open("tfidf_model.pickle.dat", "wb") as model_file:
    pickle.dump(tfidf_model, model_file)


**load this model from the disk to the memory and use it**

In [None]:
#load this model from the disk to the memory and use it
# Load the pickled vectorizer back from disk and reuse it.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you
# created yourself or otherwise trust.
with open("tfidf_model.pickle.dat", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# transform() applies the restored vocabulary and IDF weights;
# fit_transform() would re-learn them and discard what was loaded.
print(loaded_model.transform(corpus).todense())
