# <center>Data Mining Project 2 Spring semester 2019-2020</center>
## <center>Παναγιώτης Ευαγγελίου &emsp; 1115201500039</center>
## <center>Γεώργιος Μαραγκοζάκης &emsp; 1115201500089</center>

___

### Do all the necessary imports for this notebook

In [1]:
# data processing
import pandas as pd
from sklearn.model_selection import train_test_split

# visualization
from wordcloud import WordCloud
from IPython.display import Image

# classification
from sklearn.model_selection import KFold
from IPython.display import display

# for data exploration
import os
import numpy as np


## __Dataset Preprocessing__

- ### *Make tsv files from all the txt files*

In [2]:
myCategoriesFolder = ['business','entertainment','politics', 'sport', 'tech']
dataPathDir = './fulltext/data/'

myDataSetDf = pd.DataFrame(columns=['ID', 'TITLE',  'CONTENT',  'CATEGORY'])
id_count = 0

for category in myCategoriesFolder:
    specificPath = dataPathDir + category + '/'

    # find the column's names of each csv
    for fileName in os.listdir(specificPath):
        # we need to check only .txt files
        if fileName.endswith(".txt"):
            
            thisTxt = open(os.path.join(specificPath, fileName),"r")
            thisTxtTitle = thisTxt.readline()

            # get rid of '\n' on the end of title line
            thisTxtTitle = thisTxtTitle.replace('\n', '')

            thisTxtContent = thisTxt.readlines()

            # get rid of empty lines '\n'
            thisTxtContent = list(filter(lambda a: a != '\n', thisTxtContent))

            # get rid of '\n' on the end of each line 
            thisTxtContent = [period.replace('\n', '') for period in thisTxtContent]

            # convert list of lines into a single string line
            thisTxtContent = ' '.join(thisTxtContent)

            myDataSetDf = myDataSetDf.append({'ID': id_count, 'TITLE': thisTxtTitle, 'CONTENT': thisTxtContent, 'CATEGORY': category.upper()}, ignore_index=True)
            thisTxt.close()

            id_count += 1

display(myDataSetDf)

Unnamed: 0,ID,TITLE,CONTENT,CATEGORY
0,0,Giant waves damage S Asia economy,"Governments, aid agencies, insurers and travel...",BUSINESS
1,1,EMI shares hit by profit warning,Shares in music giant EMI have sunk by more th...,BUSINESS
2,2,Barclays shares up on merger talk,Shares in UK banking group Barclays have risen...,BUSINESS
3,3,Trial begins of Spain's top banker,"The trial of Emilio Botin, the chairman of Spa...",BUSINESS
4,4,MG Rover China tie-up 'delayed',MG Rover's proposed tie-up with China's top ca...,BUSINESS
...,...,...,...,...
2219,2219,DVD copy protection strengthened,DVDs will be harder to copy thanks to new anti...,TECH
2220,2220,Slimmer PlayStation triple sales,Sony PlayStation 2's slimmer shape has proved ...,TECH
2221,2221,Format wars could 'confuse users',"Technology firms Sony, Philips, Matsushita and...",TECH
2222,2222,Mobile TV tipped as one to watch,"Scandinavians and Koreans, two of the most adv...",TECH


## __Make wordcloud for each category__

In [3]:
def makeWordCloud(myText, saveLocationPath, myMaxWords=100, myMask=None, myStopWords=None):
    '''Default function for generating wordcloud'''

    wc = WordCloud(background_color="white", mask=myMask, max_words=myMaxWords,
                   stopwords=myStopWords, contour_width=3, contour_color='steelblue')

    # generate word cloud
    wc.generate(myText)

    # store to file

    wc.to_file(saveLocationPath)

    return saveLocationPath

- ### *Business Wordcloud*

In [4]:

# to fill


- ### *Entertainment Wordcloud*

In [5]:

# to fill


- ### *Politics Wordcloud*

In [6]:

# to fill


- ### *Sport Wordcloud*

In [7]:

# to fill


- ### *Tech Wordcloud*

In [8]:

# to fill


## __Classification__

- ### *Split DataSet into TrainData and TestData*

In [9]:
trainDataSet, testDataSet = train_test_split(myDataSetDf, test_size=0.2, stratify=myDataSetDf['CATEGORY'])

# reset index
trainDataSet.reset_index(drop=True, inplace=True)
testDataSet.reset_index(drop=True, inplace=True)

# save to tsv files
trainDataSet.to_csv('train_set.tsv', sep = '\t')

# save test_set categories
testDataSetCategories = testDataSet[['CATEGORY']].copy()

testDataSetCategories.to_csv('test_set_categories.tsv', sep = '\t')

testDataSet = testDataSet.drop('CATEGORY', axis=1)
testDataSet.to_csv('test_set.tsv', sep = '\t')

In [10]:
myDataSetDf

Unnamed: 0,ID,TITLE,CONTENT,CATEGORY
0,0,Giant waves damage S Asia economy,"Governments, aid agencies, insurers and travel...",BUSINESS
1,1,EMI shares hit by profit warning,Shares in music giant EMI have sunk by more th...,BUSINESS
2,2,Barclays shares up on merger talk,Shares in UK banking group Barclays have risen...,BUSINESS
3,3,Trial begins of Spain's top banker,"The trial of Emilio Botin, the chairman of Spa...",BUSINESS
4,4,MG Rover China tie-up 'delayed',MG Rover's proposed tie-up with China's top ca...,BUSINESS
...,...,...,...,...
2219,2219,DVD copy protection strengthened,DVDs will be harder to copy thanks to new anti...,TECH
2220,2220,Slimmer PlayStation triple sales,Sony PlayStation 2's slimmer shape has proved ...,TECH
2221,2221,Format wars could 'confuse users',"Technology firms Sony, Philips, Matsushita and...",TECH
2222,2222,Mobile TV tipped as one to watch,"Scandinavians and Koreans, two of the most adv...",TECH


In [11]:
trainDataSet

Unnamed: 0,ID,TITLE,CONTENT,CATEGORY
0,113,Lufthansa flies back to profit,German airline Lufthansa has returned to profi...,BUSINESS
1,499,German economy rebounds,"Germany's economy, the biggest among the 12 co...",BUSINESS
2,589,Sir Paul rocks Super Bowl crowds,Sir Paul McCartney wowed fans with a live mini...,ENTERTAINMENT
3,2033,Mobiles 'not media players yet',"Mobiles are not yet ready to be all-singing, a...",TECH
4,592,US composer recreates Bach score,A US musicologist has recreated a lost musical...,ENTERTAINMENT
...,...,...,...,...
1774,467,Malaysia lifts Islamic bank limit,Malaysia's central bank is to relax restrictio...,BUSINESS
1775,1577,Federer joins all-time greats,The last year has seen one player dominate men...,SPORT
1776,1533,Melzer shocks Agassi in San Jose,Second seed Andre Agassi suffered a comprehens...,SPORT
1777,723,Housewives lift Channel 4 ratings,The debut of US television hit Desperate House...,ENTERTAINMENT


In [12]:
testDataSet

Unnamed: 0,ID,TITLE,CONTENT
0,1773,Lewis-Francis eyeing world gold,Mark Lewis-Francis says his Olympic success ha...
1,2038,Xbox power cable 'fire fear',Microsoft has said it will replace more than 1...
2,2110,Apple sues 'Tiger' file sharers,Apple has taken more legal action to stop onli...
3,856,Bets off after Big Brother 'leak',A bookmaker has stopped taking bets on Celebri...
4,2106,Blogger grounded by her airline,A US airline attendant is fighting for her job...
...,...,...,...
440,478,Saudi ministry to employ women,Women will be employed in Saudi Arabia's forei...
441,791,German music in a 'zombie' state,The German music business - the third largest ...
442,1051,What really divides the parties,So what is the gap between Labour and the Tori...
443,700,Bangkok film festival battles on,Organisers of the third Bangkok International ...


In [13]:
testDataSetCategories

Unnamed: 0,CATEGORY
0,SPORT
1,TECH
2,TECH
3,ENTERTAINMENT
4,TECH
...,...
440,BUSINESS
441,ENTERTAINMENT
442,POLITICS
443,ENTERTAINMENT


In [14]:
# use 10-fold cross validation
kf = KFold(n_splits=10)

In [15]:
kf

KFold(n_splits=10, random_state=None, shuffle=False)