In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
# Widen text columns in DataFrame previews to 150 characters so article bodies
# are readable. The fully qualified option name is used because abbreviated
# keys are resolved by pattern matching and can become ambiguous if pandas
# adds another option with a similar name.
pd.set_option("display.max_colwidth", 150)

In [None]:
# Load the raw articles and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column next to category/title/body;
# it looks like a row index that was written into the CSV - confirm, and consider
# reading with index_col=0 if nothing downstream relies on that column.
df = pd.read_csv("data/preprocessed/raw.csv")
df.head()

Unnamed: 0,category,title,body
0,business,Ad sales boost Time Warner profit\n,"\n Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.\n \n T..."
1,business,Dollar gains on Greenspan speech\n,\n The dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to...
2,business,Yukos unit buyer faces loan claim\n,\n The owners of embattled Russian oil giant Yukos are to ask the buyer of its former production unit to pay back a $900m (Â£479m) loan.\n \n Stat...
3,business,High fuel prices hit BA's profits\n,"\n British Airways has blamed high fuel prices for a 40% drop in profits.\n \n Reporting its results for the three months to 31 December 2004, the..."
4,business,Pernod takeover talk lifts Domecq\n,\n Shares in UK drinks and food firm Allied Domecq have risen on speculation that it could be the target of a takeover by France's Pernod Ricard.\...


In [None]:
remove_line_break = lambda column: re.sub(r"[\n]", "", column)
def convert_to_lowercase(column):
    """Return the result of calling ``.lower()`` on ``column``."""
    lowered = column.lower()
    return lowered
def remove_punctuations(column):
    """Return ``column`` without the punctuation characters listed in the pattern.

    The characters removed are: ``.`` ``!`` ``,`` ``?`` ``"`` ``\\`` ``/`` ``'``
    ``:`` and ``-``. Any other symbol (for example ``;`` or brackets) is kept.
    """
    punctuation_pattern = re.compile(r"[.!\,?\"\\/':-]")
    return punctuation_pattern.sub("", column)
def remove_words_inside_parentheses(column):
    """Return ``column`` with each bracketed group ``(...)``, ``{...}`` or ``[...]`` removed.

    The previous pattern used a greedy ``.*`` between the opening and closing
    bracket, so one match ran from the FIRST opening bracket to the LAST
    closing bracket and silently deleted all the ordinary text in between.
    This version removes only the bracketed groups themselves.

    Innermost groups are stripped first and the substitution is repeated until
    nothing more matches, so nested groups such as ``(a (b) c)`` are removed
    entirely. As before, an opening bracket may be closed by any of the three
    closing bracket characters, and a group does not extend across a newline.
    """
    # Opening bracket, then any run of non-bracket, non-newline characters,
    # then a closing bracket: this can only match an innermost group.
    innermost_group = r"[({\[][^(){}\[\]\n]*[)}\]]"
    while True:
        column, removed = re.subn(innermost_group, "", column)
        if removed == 0:
            return column
def remove_numbers(column):
    """Return ``column`` with every digit character deleted.

    Surrounding whitespace is left untouched, so removing a standalone number
    leaves the spaces that were around it.
    """
    digit_pattern = re.compile(r"\d")
    return digit_pattern.sub("", column)
def remove_other_signs(column):
    """Return ``column`` without ``%``/``$`` signs and the token glued to them.

    Each ``%`` or ``$`` is removed together with any word characters, dots and
    commas that immediately follow it (for example ``$bn`` or ``$1.13bn,``).
    """
    sign_pattern = re.compile(r"[%$][\w\.\,]*")
    return sign_pattern.sub("", column)
def remove_extra_whitespaces(column):
    """Return ``column`` with each run of two or more spaces collapsed to one space.

    Only the space character is affected; tabs and single leading or trailing
    spaces are left as they are.
    """
    space_run_pattern = re.compile(r"  +")
    return space_run_pattern.sub(" ", column)

In [None]:
def clean_string(column):
    """Run every text-cleaning step over ``column`` and return the result.

    ``column`` must provide ``.apply`` (it is called with DataFrame columns in
    this notebook). Each step is applied element-wise, in this order: drop line
    breaks, lowercase, strip punctuation, drop bracketed text, drop digits,
    drop ``%``/``$`` tokens, collapse repeated spaces.
    """
    # Order matters: each step operates on the output of the previous one.
    pipeline = (
        remove_line_break,
        convert_to_lowercase,
        remove_punctuations,
        remove_words_inside_parentheses,
        remove_numbers,
        remove_other_signs,
        remove_extra_whitespaces,
    )
    for step in pipeline:
        column = column.apply(step)
    return column

In [None]:
# Clean both free-text columns with the same pipeline, body first and then title.
for text_column in ('body', 'title'):
    df[text_column] = clean_string(df[text_column])

In [None]:
# Preview the first rows to check the cleaned title and body text.
df.head()

Unnamed: 0,category,title,body
0,business,ad sales boost time warner profit,quarterly profits at us media giant timewarner jumped to which is close to concluding time warners fourth quarter profits were slightly better th...
1,business,dollar gains on greenspan speech,the dollar has hit its highest level against the euro in almost three months after the federal reserve head said the us trade deficit is set to s...
2,business,yukos unit buyer faces loan claim,the owners of embattled russian oil giant yukos are to ask the buyer of its former production unit to pay back a loan stateowned rosneft bought t...
3,business,high fuel prices hit bas profits,british airways has blamed high fuel prices for a drop in profits reporting its results for the three months to december the airline made a preta...
4,business,pernod takeover talk lifts domecq,shares in uk drinks and food firm allied domecq have risen on speculation that it could be the target of a takeover by frances pernod ricard repo...


In [None]:
# Save the cleaned corpus; index=False keeps the DataFrame's row index out of the file.
df.to_csv("data/preprocessed/corpus.csv", index=False)

## Document-Term Matrix

In [None]:
# Work on a copy of the frame without the title column, then preview it.
df_body = df.drop(columns=["title"])
df_body.head()

Unnamed: 0,category,body
0,business,quarterly profits at us media giant timewarner jumped to which is close to concluding time warners fourth quarter profits were slightly better th...
1,business,the dollar has hit its highest level against the euro in almost three months after the federal reserve head said the us trade deficit is set to s...
2,business,the owners of embattled russian oil giant yukos are to ask the buyer of its former production unit to pay back a loan stateowned rosneft bought t...
3,business,british airways has blamed high fuel prices for a drop in profits reporting its results for the three months to december the airline made a preta...
4,business,shares in uk drinks and food firm allied domecq have risen on speculation that it could be the target of a takeover by frances pernod ricard repo...


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Build the document-term matrix from the article bodies: one row per
# document, one column per vocabulary term, with English stop words excluded.
cv = CountVectorizer(stop_words="english")
data_cv = cv.fit_transform(df_body.body)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations
# that do not provide it yet.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# toarray() densifies the sparse count matrix so it can be shown as a DataFrame.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
# Label each row with the category of the article it was built from.
data_dtm.index = df_body.category

In [None]:
# Display the document-term matrix (pandas abbreviates the rendering of large frames).
data_dtm

Unnamed: 0_level_0,aa,aaa,aaas,aac,aadc,aaliyah,aaliyahs,aaltra,aamir,aaron,...,zones,zoom,zooropa,zornotza,zorro,zurich,zurichs,zutons,zvonareva,zvyagintsev
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
business,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
business,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
business,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
business,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
business,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tech,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tech,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tech,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tech,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
import pickle

# Persist the document-term matrix and the fitted vectorizer for later use.
data_dtm.to_pickle("data/preprocessed/data_dtm.pkl")
# Use a context manager so the file is flushed and closed deterministically,
# instead of leaving an anonymous open() handle for the garbage collector.
with open("data/preprocessed/cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)