In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
text= """US President Joe Biden says his administration is looking into what happened at a food distribution site in Gaza — where local health officials say more than 100 people were killed and hundreds more injured — and he admitted the incident is going to complicate negotiations in the region.

“We’re checking that out right now; there are two competing versions of what happened. I don’t have an answer yet,” the president told CNN’s Arlette Saenz at the White House on Thursday.

Asked by Saenz if he worried the deaths would complicate negotiations, he responded: “Oh, I know it will.”

But Biden still expressed optimism that a deal on the hostages and a potential ceasefire could be reached soon. """

In [3]:
text="""President-elect Joe Biden and his transition team are preparing for an early, all-out push to pass an ambitious new stimulus bill, while also drawing up plans for a flurry of executive actions aimed at delivering on campaign promises and undoing the Trump administration's efforts to undermine key government agencies.
Biden will be inaugurated in January with a pressing mandate to confront simultaneous and interwoven public health, economic and racial crises. At the same time, his team will take over the work of spearheading one of the most complicated, politically fraught mass vaccination campaigns in American history.
Biden's agenda for his first 100 days in office will, according to both those close to him and outside groups in contact with his top aides, center on two key avenues of action: the passage of a broad economic aid package and, where legislation is not necessary, a series of executive actions aimed at advancing his priorities. Containing the Covid-19 pandemic, launching an economic recovery and tackling racial inequality are his most urgent priorities, transition officials say."""

In [4]:
# this performs word tokenization -> this is used in simple models where each word is a feature
from nltk.tokenize import word_tokenize
nltk.download('punkt')

tokens = word_tokenize(text)
tokens[-30:]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\UTILISATEUR\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


['actions',
 'aimed',
 'at',
 'advancing',
 'his',
 'priorities',
 '.',
 'Containing',
 'the',
 'Covid-19',
 'pandemic',
 ',',
 'launching',
 'an',
 'economic',
 'recovery',
 'and',
 'tackling',
 'racial',
 'inequality',
 'are',
 'his',
 'most',
 'urgent',
 'priorities',
 ',',
 'transition',
 'officials',
 'say',
 '.']

In [5]:
# we often remove punctuation after tokenization since punctuation is unlikely to be a good predictive feature
tokens = [word for word in tokens if word.isalnum()]
tokens[:15]

['Joe',
 'Biden',
 'and',
 'his',
 'transition',
 'team',
 'are',
 'preparing',
 'for',
 'an',
 'early',
 'push',
 'to',
 'pass',
 'an']

In [6]:
# Alternative
# this performs sentence tokenizations -> can be used if you want to treat each sentence as a "feature"
from nltk.tokenize import sent_tokenize
sent_tokenize(text)

["President-elect Joe Biden and his transition team are preparing for an early, all-out push to pass an ambitious new stimulus bill, while also drawing up plans for a flurry of executive actions aimed at delivering on campaign promises and undoing the Trump administration's efforts to undermine key government agencies.",
 'Biden will be inaugurated in January with a pressing mandate to confront simultaneous and interwoven public health, economic and racial crises.',
 'At the same time, his team will take over the work of spearheading one of the most complicated, politically fraught mass vaccination campaigns in American history.',
 "Biden's agenda for his first 100 days in office will, according to both those close to him and outside groups in contact with his top aides, center on two key avenues of action: the passage of a broad economic aid package and, where legislation is not necessary, a series of executive actions aimed at advancing his priorities.",
 'Containing the Covid-19 pan

In [7]:
# Part of speech can be a useful feature in itself, but is also heavily used in making lemmatization and stemming more effective
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(tokens,lang='eng')[:15]
#explanation of all these codes can be found here: https://medium.com/@gianpaul.r/tokenization-and-parts-of-speech-pos-tagging-in-pythons-nltk-library-2d30f70af13b

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\UTILISATEUR\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('Joe', 'NNP'),
 ('Biden', 'NNP'),
 ('and', 'CC'),
 ('his', 'PRP$'),
 ('transition', 'NN'),
 ('team', 'NN'),
 ('are', 'VBP'),
 ('preparing', 'VBG'),
 ('for', 'IN'),
 ('an', 'DT'),
 ('early', 'JJ'),
 ('push', 'NN'),
 ('to', 'TO'),
 ('pass', 'VB'),
 ('an', 'DT')]

In [8]:
# stemming can be done as cleaning technique -> treats prefixes and suffixes.
from nltk.stem import PorterStemmer

ps = PorterStemmer()
stemmed = [ps.stem(w) for w in tokens]
stemmed[:15]

['joe',
 'biden',
 'and',
 'hi',
 'transit',
 'team',
 'are',
 'prepar',
 'for',
 'an',
 'earli',
 'push',
 'to',
 'pass',
 'an']

In [9]:
# lemmatization is a more context aware version of stemming, where we take the actual roots of individual words
# the problem is that such a dictionary may not exist for all languages and that it does not know what to do with new words
nltk.download('wordnet') # wordnet is the most well known lemmatizer for english
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
lemmatized[:15]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\UTILISATEUR\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\UTILISATEUR\AppData\Roaming\nltk_data...


['Joe',
 'Biden',
 'and',
 'his',
 'transition',
 'team',
 'are',
 'preparing',
 'for',
 'an',
 'early',
 'push',
 'to',
 'pas',
 'an']

In [10]:
# lemmatization may still be a bit weak, mostly because the lemmatizer would like a bit more information about context to make decisions
display(lemmatizer.lemmatize("was"))
display(lemmatizer.lemmatize("was",wordnet.VERB))
display(lemmatizer.lemmatize("better"))
display(lemmatizer.lemmatize("better",wordnet.ADJ))
display(lemmatizer.lemmatize("canning"))
display(lemmatizer.lemmatize("canning",wordnet.NOUN))
display(lemmatizer.lemmatize("canning",wordnet.VERB))

'wa'

'be'

'better'

'good'

'canning'

'canning'

'can'

In [11]:
# let's apply this to the all the newsfeed
nltk.download('averaged_perceptron_tagger')

# unfortunately pos_tag and lemmatize use different codes for parts of speech
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper() # gets first letter of POS categorization
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN) # get returns second argument if first key does not exist

lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word,get_wordnet_pos(word)) for word in tokens]
lemmatized[:15]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\UTILISATEUR\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


['Joe',
 'Biden',
 'and',
 'his',
 'transition',
 'team',
 'be',
 'prepare',
 'for',
 'an',
 'early',
 'push',
 'to',
 'pas',
 'an']

In [12]:
#removal of stopwords allows us to reduce the noise in the data to focus on the signal
from nltk.corpus import stopwords
nltk.download('stopwords')

without_sw = [word for word in lemmatized if not word in stopwords.words()]
without_sw[:15]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\UTILISATEUR\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['Joe',
 'Biden',
 'transition',
 'team',
 'prepare',
 'early',
 'push',
 'ambitious',
 'stimulus',
 'bill',
 'draw',
 'plan',
 'flurry',
 'executive',
 'action']

In [13]:
" ".join(without_sw)

'Joe Biden transition team prepare early push ambitious stimulus bill draw plan flurry executive action aim deliver campaign promise undo Trump administration effort undermine key government agency Biden inaugurate January press mandate confront simultaneous interwoven public health economic racial crisis At time team work spearhead complicate politically fraught mass vaccination campaign American history Biden agenda 100 day office accord close group contact top aide center key avenue action passage broad economic aid package legislation series executive action aim advance priority Containing pandemic launch economic recovery tackle racial inequality urgent priority transition official'

In [14]:
text.split('.')[1]

'\nBiden will be inaugurated in January with a pressing mandate to confront simultaneous and interwoven public health, economic and racial crises'

In [15]:
without_sw

['Joe',
 'Biden',
 'transition',
 'team',
 'prepare',
 'early',
 'push',
 'ambitious',
 'stimulus',
 'bill',
 'draw',
 'plan',
 'flurry',
 'executive',
 'action',
 'aim',
 'deliver',
 'campaign',
 'promise',
 'undo',
 'Trump',
 'administration',
 'effort',
 'undermine',
 'key',
 'government',
 'agency',
 'Biden',
 'inaugurate',
 'January',
 'press',
 'mandate',
 'confront',
 'simultaneous',
 'interwoven',
 'public',
 'health',
 'economic',
 'racial',
 'crisis',
 'At',
 'time',
 'team',
 'work',
 'spearhead',
 'complicate',
 'politically',
 'fraught',
 'mass',
 'vaccination',
 'campaign',
 'American',
 'history',
 'Biden',
 'agenda',
 '100',
 'day',
 'office',
 'accord',
 'close',
 'group',
 'contact',
 'top',
 'aide',
 'center',
 'key',
 'avenue',
 'action',
 'passage',
 'broad',
 'economic',
 'aid',
 'package',
 'legislation',
 'series',
 'executive',
 'action',
 'aim',
 'advance',
 'priority',
 'Containing',
 'pandemic',
 'launch',
 'economic',
 'recovery',
 'tackle',
 'racial',
 'in

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vect = CountVectorizer()
# fit creates one entry for each different word seen
bow_vect.fit([" ".join(without_sw)])

In [17]:
set(without_sw)

{'100',
 'American',
 'At',
 'Biden',
 'Containing',
 'January',
 'Joe',
 'Trump',
 'accord',
 'action',
 'administration',
 'advance',
 'agency',
 'agenda',
 'aid',
 'aide',
 'aim',
 'ambitious',
 'avenue',
 'bill',
 'broad',
 'campaign',
 'center',
 'close',
 'complicate',
 'confront',
 'contact',
 'crisis',
 'day',
 'deliver',
 'draw',
 'early',
 'economic',
 'effort',
 'executive',
 'flurry',
 'fraught',
 'government',
 'group',
 'health',
 'history',
 'inaugurate',
 'inequality',
 'interwoven',
 'key',
 'launch',
 'legislation',
 'mandate',
 'mass',
 'office',
 'official',
 'package',
 'pandemic',
 'passage',
 'plan',
 'politically',
 'prepare',
 'press',
 'priority',
 'promise',
 'public',
 'push',
 'racial',
 'recovery',
 'series',
 'simultaneous',
 'spearhead',
 'stimulus',
 'tackle',
 'team',
 'time',
 'top',
 'transition',
 'undermine',
 'undo',
 'urgent',
 'vaccination',
 'work'}

In [18]:
bow_vect.transform(['Joe Biden transition team prepare early push ambitious stimulus bill draw plan flurry executive action aim deliver campaign promise undo Trump administration effort undermine key government agency']).toarray()

array([[0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0]], dtype=int64)

In [19]:
bow_vect.transform(['economic economic']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [20]:
bow_vect.transform(['Joe work ambitious ambitious ambitoud economic rabbit']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [21]:
bow_vect.transform(['100']).toarray()

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [22]:
bow_vect.transform(['goncalo']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [23]:
#transform only considers the words that have been seen in fit
bow_vect.transform(['accord stimulus bill bill goncalo']).toarray()

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

# News clustering example

In [29]:
# corpus of 120k news headlines, here shortened to 10k

url = "https://raw.githubusercontent.com/GoncaloJardim/ironhack-v4-data-lessons/main/data/news.csv"


all_news = pd.read_csv(url)

all_news.head()

Unnamed: 0,news
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...
1,American Phil Mickelson registered a 59 to win...
2,French President Jacques Chirac and British Pr...
3,"As many as 15,000 New Zealanders will be force..."
4,The group led by al Qaeda ally Abu Musab al-Z...


In [30]:
all_news.shape

(10000, 1)

In [31]:
all_news.iloc[3]['news']

'As many as 15,000 New Zealanders will be forced to find an alternative form of pain relief after the worldwide recall of the drug Vioxx, which has been found to double the risk of heart attacks and strokes.'

In [32]:
# same process as before, but for all lines
#tokenize, lowercase, remove punctuation

def tokenizer_and_remove_punctuation(row):
  tokens = word_tokenize(row['news'])
  return [word.lower() for word in tokens if word.isalpha()]

all_news['tokenized'] = all_news.apply(tokenizer_and_remove_punctuation,axis=1)
all_news.head()

Unnamed: 0,news,tokenized
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...,"[san, francisco, dell, said, thursday, its, pr..."
1,American Phil Mickelson registered a 59 to win...,"[american, phil, mickelson, registered, a, to,..."
2,French President Jacques Chirac and British Pr...,"[french, president, jacques, chirac, and, brit..."
3,"As many as 15,000 New Zealanders will be force...","[as, many, as, new, zealanders, will, be, forc..."
4,The group led by al Qaeda ally Abu Musab al-Z...,"[the, group, led, by, al, qaeda, ally, abu, mu..."


In [33]:
# lemmatize with part of speech helpers

lemmatizer = WordNetLemmatizer()

def lemmatizer_with_pos(row):
  return [lemmatizer.lemmatize(word,get_wordnet_pos(word)) for word in row['tokenized']]

all_news['lemmatized'] = all_news.apply(lemmatizer_with_pos,axis=1)
all_news.head()

Unnamed: 0,news,tokenized,lemmatized
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...,"[san, francisco, dell, said, thursday, its, pr...","[san, francisco, dell, say, thursday, it, prof..."
1,American Phil Mickelson registered a 59 to win...,"[american, phil, mickelson, registered, a, to,...","[american, phil, mickelson, register, a, to, w..."
2,French President Jacques Chirac and British Pr...,"[french, president, jacques, chirac, and, brit...","[french, president, jacques, chirac, and, brit..."
3,"As many as 15,000 New Zealanders will be force...","[as, many, as, new, zealanders, will, be, forc...","[a, many, a, new, zealander, will, be, force, ..."
4,The group led by al Qaeda ally Abu Musab al-Z...,"[the, group, led, by, al, qaeda, ally, abu, mu...","[the, group, lead, by, al, qaeda, ally, abu, m..."


In [34]:
# remove stopwords

def remove_sw(row):
  return list(set(row['lemmatized']).difference(stopwords.words()))

all_news['no_stopwords'] = all_news.apply(remove_sw,axis=1)
all_news.head()

Unnamed: 0,news,tokenized,lemmatized,no_stopwords
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...,"[san, francisco, dell, said, thursday, its, pr...","[san, francisco, dell, say, thursday, it, prof...","[thursday, profit, francisco, maker, boost, la..."
1,American Phil Mickelson registered a 59 to win...,"[american, phil, mickelson, registered, a, to,...","[american, phil, mickelson, register, a, to, w...","[champion, hawaii, score, putt, history, maste..."
2,French President Jacques Chirac and British Pr...,"[french, president, jacques, chirac, and, brit...","[french, president, jacques, chirac, and, brit...","[thursday, iraq, tony, evidence, chirac, briti..."
3,"As many as 15,000 New Zealanders will be force...","[as, many, as, new, zealanders, will, be, forc...","[a, many, a, new, zealander, will, be, force, ...","[relief, vioxx, attack, pain, risk, double, wo..."
4,The group led by al Qaeda ally Abu Musab al-Z...,"[the, group, led, by, al, qaeda, ally, abu, mu...","[the, group, lead, by, al, qaeda, ally, abu, m...","[daylight, group, mosul, broad, abu, friday, l..."


In [35]:
# put all this cleaning together

def re_blob(row):
  return " ".join(row['no_stopwords'])

all_news['clean_blob'] = all_news.apply(re_blob,axis=1)
all_news.head()

Unnamed: 0,news,tokenized,lemmatized,no_stopwords,clean_blob
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...,"[san, francisco, dell, said, thursday, its, pr...","[san, francisco, dell, say, thursday, it, prof...","[thursday, profit, francisco, maker, boost, la...",thursday profit francisco maker boost laptop r...
1,American Phil Mickelson registered a 59 to win...,"[american, phil, mickelson, registered, a, to,...","[american, phil, mickelson, register, a, to, w...","[champion, hawaii, score, putt, history, maste...",champion hawaii score putt history master tapp...
2,French President Jacques Chirac and British Pr...,"[french, president, jacques, chirac, and, brit...","[french, president, jacques, chirac, and, brit...","[thursday, iraq, tony, evidence, chirac, briti...",thursday iraq tony evidence chirac british cou...
3,"As many as 15,000 New Zealanders will be force...","[as, many, as, new, zealanders, will, be, forc...","[a, many, a, new, zealander, will, be, force, ...","[relief, vioxx, attack, pain, risk, double, wo...",relief vioxx attack pain risk double worldwide...
4,The group led by al Qaeda ally Abu Musab al-Z...,"[the, group, led, by, al, qaeda, ally, abu, mu...","[the, group, lead, by, al, qaeda, ally, abu, m...","[daylight, group, mosul, broad, abu, friday, l...",daylight group mosul broad abu friday lead sit...


In [36]:
#let's take only the most common 1000 words
bow_vect = CountVectorizer(max_features=1000)
# fit creates one entry for each different word seen
X = bow_vect.fit_transform(all_news['clean_blob']).toarray()

In [37]:
all_news['clean_blob'].iloc[0]

'thursday profit francisco maker boost laptop rise gear percent pc year san earlier'

In [38]:
as_df = pd.DataFrame(X,columns=bow_vect.get_feature_names_out())
as_df.head()

Unnamed: 0,abu,abuse,access,accord,account,accounting,accuse,acquire,acquisition,act,...,worth,wound,yahoo,yankee,yard,yasser,year,yesterday,york,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=6,random_state=100)
kmeans.fit(X)
pred = kmeans.predict(X)

In [40]:
predict_df = pd.concat([all_news['news'],pd.DataFrame(pred,columns=['class'])],axis=1)
predict_df.head()

Unnamed: 0,news,class
0,SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thurs...,5
1,American Phil Mickelson registered a 59 to win...,4
2,French President Jacques Chirac and British Pr...,3
3,"As many as 15,000 New Zealanders will be force...",4
4,The group led by al Qaeda ally Abu Musab al-Z...,4


In [41]:
pd.set_option('display.max_colwidth', None)

In [42]:
#sports
predict_df[predict_df['class']==0]

Unnamed: 0,news,class
14,Everton striker Wayne Rooney says he is quot;disappointed quot; with the way the club have handled his transfer request. Newcastle and Manchester United want to sign the 18-year-old who requested a transfer on Friday,0
41,"Barnstable had the upperhand once again. The Red Raiders are one win away from defending their MIAA Division 1 state volleyball championship after blanking Chelmsford, 3-0, in the semifinals last night in Mansfield. It was the second time this fall Chelmsford had been frustrated by the powerful Barnstable lineup.",0
69,European governments on Monday spoke out against a French proposal that the European Union restrict development aid to poorer member states that seek to lure foreign investment with low corporate tax rates.,0
87,PHILADELPHIA -- The burst was back. Displaying some of the giddy-up he showed against Ball State -- before suffering a left knee injury on his 21st carry of a 129-yard performance -- L.V. Whitworth returned to his season-opening form yesterday. The redshirt freshman tailback rushed 17 times for a career-high 151 yards and touchdowns of 29 and 39 yards to ...,0
95,The United Nations has warned its staff\in Thailand to be careful following threats by a separatist\group to stage attacks in Bangkok in revenge for the deaths of\85 Muslim protesters in the south last week.,0
...,...,...
9898,"New York -- On a rainy Wednesday at the US Open, Serena Williams got a phone call from Arlen Kantarian, chief executive of the United States Tennis Association.",0
9920,"The United States would deal with\Iran as part of a group of Iraq's neighbors that will meet in\Egypt this month, although Washington has no diplomatic\relations with Tehran, Secretary of State Colin Powell said on\Tuesday.",0
9984,"McDonald's Corp. &lt;A HREF=""http://www.investor.reuters.com/FullQuote.aspx?ticker=MCD.N target=/stocks/quickinfo/fullquote""&gt;MCD.N&lt;/A&gt; said Monday that sales at its namesake hamburger restaurants open at least 13 months rose 6.1 percent in October, helped by a popular Monopoly game promotion in the United States.",0
9995,A business columnist at the Seattle Times in the United States has resigned after admitting he copied the work of other journalists.,0


In [43]:
#financial
predict_df[predict_df['class']==1]

Unnamed: 0,news,class
23,"Indonesian police on Saturday released security camera images of a truck bombing outside the Australian Embassy, and investigators found traces of explosives in a room rented by two Malaysian militants wanted in the blast. Also Saturday, around 1,000 members of a hardline Muslim group rallied in downtown Jakarta against Thursday's attack, which killed nine people, two of them suspected suicide bombers...",1
31,"JERUSALEM: Syria is directly involved in terrorism and will not be granted immunity by Israel, the deputy defence minister said on Monday, but stopped short of claiming formal responsibility for the killing of a Hamas leader in Damascus.",1
53,"Turkish television stations broadcast a video Friday that claimed that Habib Akdas, suspected leader of the Turkish al-Qaida cell blamed for November suicide bombings in Istanbul, was killed this week in a US bombing raid in Iraq.",1
115,"India News: Srinagar, Dec 5 : Twelve persons, including ten army troopers, were killed in a landmine explosion around midnight Saturday night in south Kashmir #39;s Pulwama district.",1
116,"US-LED forces attacked two Iraqi rebel strongholds yesterday, killing nearly two dozen insurgents in a town near the Syrian border and bombing targets in Fallujah for a third day.",1
...,...,...
9879,Hezbollah sent a reconnaissance drone into Israeli territory over northern Jewish settlements Sunday in the first hostile aerial incursion from Lebanon since a hang glider attack 17 years ago killed six soldiers.,1
9931,Iraqi environment minister Mishkat\Moumin said she survived a suicide car bomb attack in Baghdad\on Tuesday that killed four of her bodyguards.,1
9946,": Suspected Muslim insurgents attacked an army unit protecting Buddhist monks at a monastery early Friday, killing one of the soldiers as sectarian violence continued in southern Thailand, police said.",1
9949,Several workers are believed to have been killed and others injured after a contruction site collapsed at Dubai airport. The workers were trapped under rubble at the site of a \$4.,1


In [44]:
#political news
predict_df[predict_df['class']==3]

Unnamed: 0,news,class
2,"French President Jacques Chirac and British Prime Minister Tony Blair maintained Thursday that relations between their countries were not strained by their disagreements over the Iraq war, as evidenced by their cooperation on a number of international",3
18,"French Finance Minister Nicolas Sarkozy is resigning when he meets today with Prime Minister Jean- Pierre Raffarin, freeing him to use his new position as head of the ruling party to prepare for the 2007 presidential elections.",3
62,Malaysia #39;s Prime Minister Abdullah Ahmad Badawi denies he struck a deal with Anwar Ibrahim before the rebel politician was freed last week after nearly six years in jail.,3
80,PRIME Minister John Howard yesterday said his personal staff witnessed and backed his version of a sensitive phone call with former ministerial adviser Mike Scrafton over the children overboard affair.,3
131,"French Interior Minister Dominique deVillepin announced Friday that France is to reinforce security in sensible places, especially around embassies after the explosion near the Indonesian embassy in Paris.",3
...,...,...
9849,German police Friday arrested three Iraqis suspected of hastily putting together a failed plot to assassinate interim Iraqi Prime Minister Ayad Allawi during his visit for talks with Chancellor Gerhard Schroeder.,3
9852,"Britain will provide more debt relief for the world #39;s poorest countries and challenge other rich governments to do the same, Prime Minister Tony Blair #39;s government said Sunday.",3
9919,"JERUSALEM : Beleaguered Israeli Prime Minister Ariel Sharon persisted with plans to form a new government including the main opposition Labour party, bolstered by Washington #39;s backing for the expansion of West Bank settlements.",3
9943,"BANGKOK: Thai Prime Minister Thaksin Shinawatra fired ministers responsible for dealing with a bird flu crisis and violence in the largely Muslim south yesterday, just months ahead of a general election.",3


In [45]:
#global sports
predict_df[predict_df['class']==5]

Unnamed: 0,news,class
0,"SAN FRANCISCO (CBS.MW) -- Dell Inc. said Thursday its third-quarter profit rose 25 percent from a year earlier as the No. 1 personal-computer maker boosted sales of its PCs, laptops and other gear by 18 percent.",5
12,"Ballistic missile defence, a mere political hiccup south of the border even in an American presidential year, is showing surprising strength as a topic of public debate in Canada.",5
50,"Investment by businesses in foreign markets fell 18 percent in 2003 to \$560 billion as the global economy continued to struggle, but should improve this year as growth speeds up, the United Nations said Wednesday.",5
57,"Arsenal #39;s Thierry Henry, AC Milan #39;s Andriy Shevchenko, and Barcelona #39;s Ronaldinho are the three finalists for FIFA #39;s 2004 World Player of the Year award.",5
63,"Avon Products Inc., the world #39;s largest direct seller of cosmetics, reported its first US sales decline in five years, sending the company #39;s shares down the most since 2000.",5
...,...,...
9975,"Worldwide sales of computer chips are expected to set a record this year, according to the San Jose, Calif.-based trade group Semiconductor Industry Association.",5
9983,"TiVo Inc., maker of digital\television video recorders, will next year add ways for viewers\to see advertising and corporate logos even as they try to skip\commercials, the company said on Wednesday.",5
9987,"Semiconductor Manufacturing International Corp., a leading Chinese computer chip maker, plans to begin supplying chips using more advanced technology to major clients such as Texas Instruments Inc. beginning next year, the companies said Friday.",5
9988,The militant group is poised to sponsor candidates in legislative and municipal elections expected later next year.,5
