In [8]:
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.collocations import BigramCollocationFinder
import collections

## 1. Process train data

In [15]:
# Read the whole training corpus; the encoding is given explicitly so the
# result does not depend on the platform default.
with open('training_docs.txt', 'r', encoding='utf-8') as handle:
    raw_train = handle.read()

# Records are separated by 'EOD' followed by a blank line; each record is
# stripped and split into its individual lines.
train_list = [chunk.strip().split("\n") for chunk in raw_train.split('EOD\n\n')]

# Only records with exactly two lines are kept. The ID is line 0 minus its
# first 3 characters and the text is line 1 minus its first 5 characters
# (presumably 'ID ' / 'TEXT ' prefixes -- confirm against the raw file).
two_line_records = [record for record in train_list if len(record) == 2]
IDs = [record[0][3:] for record in two_line_records]
Texts = [record[1][5:] for record in two_line_records]

# Assemble the training frame: one row per document.
data = pd.DataFrame({'ID': IDs, 'Text': Texts})

In [16]:
# Read the label file; each non-empty line is split on a single space into
# [doc_id, label].
with open('training_labels_final.txt', 'r', encoding='utf-8') as f:
    raw_labels = f.read()

labels = [line.split(' ') for line in raw_labels.split('\n')]

# Match labels to documents by ID instead of by position. A positional
# comparison silently drops any row whose ID does not line up, which leaves
# a list that is shorter than the frame and an opaque length error later.
label_by_id = {fields[0]: fields[1] for fields in labels if len(fields) >= 2}

# Use the IDs stored in the frame itself so this cell does not depend on
# whatever the global `IDs` list currently holds.
doc_ids = list(data['ID'])
missing_ids = [doc_id for doc_id in doc_ids if doc_id not in label_by_id]
if missing_ids:
    raise ValueError(
        "No label found for %d training document(s), e.g. %r"
        % (len(missing_ids), missing_ids[:5])
    )

# there are 23 classes in total (original author's note)
label = [label_by_id[doc_id] for doc_id in doc_ids]

# combine into train dataframe
data['label'] = label
data.to_csv("train.csv", index=False)

In [17]:
# Display the training frame built above (columns: ID, Text, label).
data

Unnamed: 0,ID,Text,label
0,tr_doc_1,Two German tourists have been found safe and w...,C1
1,tr_doc_2,ACT police have seized a rare drug during a ra...,C1
2,tr_doc_3,A 50-year-old Brisbane man has been charged wi...,C1
3,tr_doc_4,In-depth discussions are continuing to resolve...,C1
4,tr_doc_5,Homicide detectives are still questioning a ma...,C1
5,tr_doc_6,The Cole Royal Commission has recommended crim...,C1
6,tr_doc_7,The dramatic hijacking of a Turkish airliner t...,C1
7,tr_doc_8,The hijacker of a Turkish plane has surrendere...,C1
8,tr_doc_9,The judge overseeing the trial in the United S...,C1
9,tr_doc_10,At least 68 people have been killed and 34 inj...,C1


## 2. Process test data

In [18]:
# Read the whole test corpus; the encoding is given explicitly so the result
# does not depend on the platform default.
with open('testing_docs.txt', 'r', encoding='utf-8') as f:
    raw_test = f.read()

# Records are separated by 'EOD' followed by a blank line; each record is
# stripped and split into its individual lines.
test_list = [chunk.strip().split("\n") for chunk in raw_test.split('EOD\n\n')]

# Extract document id and text content from two-line records only.
# Test-specific names are used on purpose: rebinding the global `IDs` /
# `Texts` here would make every later cell that reads them operate on the
# test documents instead of the training documents when the notebook is run
# top to bottom.
test_records = [record for record in test_list if len(record) == 2]
test_IDs = [record[0][3:] for record in test_records]
test_Texts = [record[1][5:] for record in test_records]

# Create the data frame of test data and persist it.
test = pd.DataFrame({'ID': test_IDs, 'Text': test_Texts})

test.to_csv("test.csv", index=False)

## 3. Tokenizing, lemmatizing and stemming

In [11]:
# Tokenizer: runs of word characters, optionally joined by '-', "'" or '.'
# (keeps tokens such as "29-year-old" or "d'amato" in one piece).
tokenizer = RegexpTokenizer(r"(?:\w+)+(?:[-'.](?:\w+)+)*")

# Tokenize the lower-cased training text. The text is read from the training
# frame rather than the global `Texts` list, so the result stays aligned with
# the rows of `data` no matter which other cells have run.
tokenized = [tokenizer.tokenize(text.lower()) for text in data['Text']]

# NLTK returns the stopwords as str, so tokens can be compared directly.
english_stopwords = set(stopwords.words('english'))
# Kept only for later cells that may still expect the original
# UTF-8-encoded (bytes) stopword set; it is not used below.
stopwords_list = set([stopword.encode('utf-8') for stopword in english_stopwords])

# Tokens are already lower-case at this point; drop the stopwords.
filtered_list = [
    [word for word in sent if word not in english_stopwords]
    for sent in tokenized
]

# Reduce every remaining token to its Snowball (English) stem.
stemmer = SnowballStemmer('english')
stemmed = [[stemmer.stem(w) for w in para] for para in filtered_list]


[['two',
  'german',
  'tourists',
  'have',
  'been',
  'found',
  'safe',
  'and',
  'well',
  'after',
  'spending',
  'almost',
  'six',
  'hours',
  'lost',
  'in',
  'rugged',
  'rainforest',
  'at',
  'finch',
  'hatton',
  'gorge',
  'west',
  'of',
  'mackay',
  'last',
  'night',
  'it',
  'is',
  'the',
  'same',
  'area',
  'a',
  'young',
  'mackay',
  'man',
  'fell',
  'or',
  'jumped',
  'to',
  'his',
  'death',
  'last',
  'week',
  'sergeant',
  'jon',
  'purcell',
  'says',
  'rescuers',
  'located',
  'the',
  'missing',
  'pair',
  'just',
  'before',
  'midnight',
  'aest'],
 ['act',
  'police',
  'have',
  'seized',
  'a',
  'rare',
  'drug',
  'during',
  'a',
  'raid',
  'of',
  'a',
  'florey',
  'home',
  'police',
  'found',
  'a',
  'number',
  'of',
  'syringes',
  'filled',
  'with',
  'the',
  'drug',
  'ox-blood',
  'which',
  'is',
  'a',
  'form',
  'of',
  'amphetamine',
  'they',
  'also',
  'found',
  'a',
  'number',
  'of',
  'bags',
  'believed

In [25]:
# NOTE(review): the lemmatizer is applied to already-stemmed tokens; stems are
# often not dictionary words, so confirm this ordering is intended.
lemmatizer = WordNetLemmatizer()
lemmatized = []
for doc_tokens in stemmed:
    lemmatized.append([lemmatizer.lemmatize(token) for token in doc_tokens])

# Document frequency: every token contributes at most once per document.
wordlist = []
for doc_tokens in lemmatized:
    wordlist.extend(set(doc_tokens))
word_counter = collections.Counter(wordlist)

# The 20 tokens that occur in the most documents ...
most_freq = word_counter.most_common(20)

# ... and the tokens that occur in exactly one document.
not_common = [token for token, doc_count in word_counter.items() if doc_count == 1]

# Remove both the most frequent and the least frequent words.
remove_words = {token for token, _ in most_freq} | set(not_common)
remove_words

In [32]:
# Per document, keep only the tokens that are not in `remove_words`;
# token order within each document is preserved.
removed = [
    [token for token in doc_tokens if token not in remove_words]
    for doc_tokens in lemmatized
]
removed

[['german',
  'tourist',
  'found',
  'safe',
  'well',
  'spend',
  'almost',
  'six',
  'hour',
  'lost',
  'rug',
  'rainforest',
  'finch',
  'hatton',
  'gorg',
  'west',
  'mackay',
  'night',
  'area',
  'young',
  'mackay',
  'man',
  'fell',
  'jump',
  'death',
  'sergeant',
  'jon',
  'purcel',
  'rescuer',
  'locat',
  'miss',
  'pair',
  'midnight',
  'aest'],
 ['act',
  'polic',
  'seiz',
  'rare',
  'drug',
  'raid',
  'florey',
  'home',
  'polic',
  'found',
  'number',
  'syring',
  'fill',
  'drug',
  'form',
  'amphetamin',
  'found',
  'number',
  'bag',
  'believ',
  'contain',
  'crystal',
  'methamphetamin',
  '29-year-old',
  'woman',
  'charg',
  'number',
  'offenc',
  'face',
  'court',
  'morn',
  'act',
  'sergeant',
  'matt',
  'varley',
  'third',
  'drug',
  'found',
  'territori',
  'actual',
  'amphetamin',
  'manufactur',
  'process',
  'wherebi',
  'normal',
  'powder',
  'crystal',
  'produc',
  'liquid',
  'methamphetamin',
  'contain',
  'red',
 

In [33]:
# Flat token stream over all documents (kept for later cells that read it).
wordlist = [word for para in removed for word in para]

# Build the finder per document. Feeding one flattened stream to
# `from_words` would also count the last token of one document together with
# the first token of the next as a bigram, which is not a real collocation.
finder = BigramCollocationFinder.from_documents(removed)

# Ignore bigrams seen fewer than 10 times, then keep the 200 with highest PMI.
finder.apply_freq_filter(10)
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigrams = finder.nbest(bigram_measures.pmi, 200)
bigrams

[('vennegoor', 'hesselink'),
 ('dal', 'broi'),
 ('hizb', 'ut-tahrir'),
 ('osteiti', 'pubi'),
 ('polona', 'hercog'),
 ('red-ear', 'slider'),
 ('machu', 'picchu'),
 ('steed', 'malbranqu'),
 ('boko', 'haram'),
 ('silvija', 'talaja'),
 ('ukuma', "ta'ai"),
 ('week-in', 'week-out'),
 ('golgol', 'mebrahtu'),
 ('bnp', 'pariba'),
 ('manas', 'manuokafoa'),
 ('marsel', 'ilhan'),
 ('rhett', 'lockyear'),
 ('timea', 'bacsinszki'),
 ('luk', 'chai'),
 ('eidur', 'gudjohnsen'),
 ('qiangba', 'puncog'),
 ('burkina', 'faso'),
 ('elio', "d'amato"),
 ('rudra', 'pratap'),
 ('isaka', 'cernak'),
 ('nue', 'dae'),
 ('atal', 'behari'),
 ('bilbo', 'baggin'),
 ('danai', 'udomchok'),
 ('servet', 'uzunlar'),
 ('edinson', 'cavani'),
 ('bran', 'nue'),
 ('wairangi', 'koopu'),
 ('tenz', 'norgay'),
 ('taulima', 'tautai'),
 ('klaas-jan', 'huntelaar'),
 ('daja', 'bedanova'),
 ('iva', 'majoli'),
 ('kohlberg', 'kravi'),
 ('sharm', 'el-sheikh'),
 ('vassallo', 'arguello'),
 ('mirko', 'vucin'),
 ('romina', 'oprandi'),
 ('piyush',

In [35]:
# Merge each selected bigram into a single 'first_second' token, in place.
# The second slot is set to None as a placeholder; the following cell strips
# those placeholders out.
# Membership is tested against a set: looking up every adjacent pair in the
# `bigrams` list would scan the whole list once per token.
bigram_set = set(bigrams)
for para in removed:
    for j in range(len(para) - 1):
        if (para[j], para[j + 1]) in bigram_set:
            para[j] = para[j] + '_' + para[j + 1]
            # A None here can never start a new bigram on the next iteration.
            para[j + 1] = None

[<filter at 0x7f1aecbae9b0>,
 <filter at 0x7f1aecbae978>,
 <filter at 0x7f1aecbaea20>,
 <filter at 0x7f1aecbaeac8>,
 <filter at 0x7f1aecbaeb38>,
 <filter at 0x7f1aecbaeba8>,
 <filter at 0x7f1aecbaec18>,
 <filter at 0x7f1aecbaec88>,
 <filter at 0x7f1aecbaecf8>,
 <filter at 0x7f1aecbaed68>,
 <filter at 0x7f1aecbaedd8>,
 <filter at 0x7f1aecbaee48>,
 <filter at 0x7f1aecbaeeb8>,
 <filter at 0x7f1aecbaef28>,
 <filter at 0x7f1aecbaef98>,
 <filter at 0x7f1aecbae518>,
 <filter at 0x7f1aecbae588>,
 <filter at 0x7f1aecbae6a0>,
 <filter at 0x7f1aecbae710>,
 <filter at 0x7f1afb2f8048>,
 <filter at 0x7f1afb2f80b8>,
 <filter at 0x7f1afb2f8128>,
 <filter at 0x7f1afb2f8198>,
 <filter at 0x7f1afb2f8208>,
 <filter at 0x7f1afb2f8278>,
 <filter at 0x7f1afb2f82e8>,
 <filter at 0x7f1afb2f8358>,
 <filter at 0x7f1afb2f83c8>,
 <filter at 0x7f1afb2f8438>,
 <filter at 0x7f1afb2f84a8>,
 <filter at 0x7f1afb2f8518>,
 <filter at 0x7f1afb2f8588>,
 <filter at 0x7f1afb2f85f8>,
 <filter at 0x7f1afb2f8668>,
 <filter at 0x

In [42]:
# Drop the None placeholders left behind by the bigram merge, document by
# document, keeping the remaining tokens in order.
final = [
    [token for token in doc_tokens if token is not None]
    for doc_tokens in removed
]
final

[['german',
  'tourist',
  'found',
  'safe',
  'well',
  'spend',
  'almost',
  'six',
  'hour',
  'lost',
  'rug',
  'rainforest',
  'finch',
  'hatton',
  'gorg',
  'west',
  'mackay',
  'night',
  'area',
  'young',
  'mackay',
  'man',
  'fell',
  'jump',
  'death',
  'sergeant',
  'jon',
  'purcel',
  'rescuer',
  'locat',
  'miss',
  'pair',
  'midnight',
  'aest'],
 ['act',
  'polic',
  'seiz',
  'rare',
  'drug',
  'raid',
  'florey',
  'home',
  'polic',
  'found',
  'number',
  'syring',
  'fill',
  'drug',
  'form',
  'amphetamin',
  'found',
  'number',
  'bag',
  'believ',
  'contain',
  'crystal',
  'methamphetamin',
  '29-year-old',
  'woman',
  'charg',
  'number',
  'offenc',
  'face',
  'court',
  'morn',
  'act',
  'sergeant',
  'matt',
  'varley',
  'third',
  'drug',
  'found',
  'territori',
  'actual',
  'amphetamin',
  'manufactur',
  'process',
  'wherebi',
  'normal',
  'powder',
  'crystal',
  'produc',
  'liquid',
  'methamphetamin',
  'contain',
  'red',
 

In [44]:
# Store the processed token lists in a new lower-case 'text' column; the raw
# document string stays in the existing 'Text' column.
# NOTE(review): this relies on `final` having one entry per row of `data`, in
# the same order -- confirm the token lists were built from the training texts.
data['text'] = final
data.head()

Unnamed: 0,ID,Text,label,text
0,tr_doc_1,Two German tourists have been found safe and w...,C1,"[german, tourist, found, safe, well, spend, al..."
1,tr_doc_2,ACT police have seized a rare drug during a ra...,C1,"[act, polic, seiz, rare, drug, raid, florey, h..."
2,tr_doc_3,A 50-year-old Brisbane man has been charged wi...,C1,"[50-year-old, brisban, man, charg, fraud, alle..."
3,tr_doc_4,In-depth discussions are continuing to resolve...,C1,"[in-depth, discus, continu, resolv, safeti, co..."
4,tr_doc_5,Homicide detectives are still questioning a ma...,C1,"[homicid, detect, still, question, man, fatal,..."
