# Tools

In [638]:
import pandas as pd
import numpy as np
from collections import defaultdict as dd
from collections import Counter
from random import randint
from copy import deepcopy, copy

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import mutual_info_classif

from nltk.corpus import stopwords
from scipy import sparse


# Helper functions

In [639]:
sample = '2019S1-proj2-data/dev-raw.tsv'

def readData(filename):
    with open(filename,'r') as tsv:
        data = [line.strip().split('\t') for line in tsv]

    dataDict = {"labels": [], "tweets": [], "Id": []}
    for line in data:
        dataDict["Id"].append(line[0])
        dataDict["labels"].append(line[1])
        dataDict["tweets"].append(line[2])
    
    return dataDict
    

In [640]:
def preprocess(data):
    
    preprocessedData = []
    
    for e in data:
        ePreprocessed = e.strip().split(" ")
        for term in e:
            if term[0] == "@":
                ePreprocessed += [term]*3
        ePreprocessed = [f.lower() for f in ePreprocessed]
        preprocessedData.append(ePreprocessed)
    
    return preprocessedData

In [641]:
def WLH(data, labels):
    
    assert(len(data)==len(labels))
    
    termLocFrequencies = dd(Counter)
    totalTermsLoc = Counter()
    totalTermsAll = 0
    termCounts = Counter()
    
    for i in range(0, len(data)):
        terms = data[i]
        loc = labels[i]
        
        for term in terms:
            termLocFrequencies["".join(filter(str.isalnum,term))][loc] += 1
            totalTermsLoc[loc] += 1
            totalTermsAll += 1
            termCounts[term] += 1
    
    wlh = {}
    wlhCountRestricted = {}
    
    for term in termLocFrequencies.keys():
        
        denom = sum(termLocFrequencies[term].values())/totalTermsAll
        num = max([(termLocFrequencies[term][city]/totalTermsLoc[city], city) if totalTermsLoc[city] != 0 else (0, city) for city in ("Melbourne", "Sydney", "Perth", "Brisbane")])
        
        wlh[term] = num[0]/denom
        if termCounts[term] > 4:
            wlhCountRestricted[term] = num[0]/denom
    
    return wlh, wlhCountRestricted

In [642]:
def featureSelect(wlh):
    
#     wlhSorted = sorted([(y,x) for (x,y) in wlh.items()], reverse=True)
#     wlhBest = wlhSorted[0:int(len(wlh.keys())/3)]
    wlhBest = [(y,x) for (x,y) in wlh.items() if y>=2.5]
    wlhBestSet = set([a[1] for a in wlhBest])
    indexes = {}
    for i in range(0, len(wlhBest)):
        indexes[wlhBest[i][1]] = i
    
    return wlhBestSet, indexes

In [643]:
def encode(data, wlhBestSet):
    finalData = []
    
    for e in data:
        encoded = np.zeros(len(wlhBestSet))
        for term in e:
            if term in wlhBestSet:
                encoded[indexes[term]] += 1
                
        finalData.append(encoded)
    
    return finalData

In [644]:
import re
def getLocation(tweet):
    m = re.search("@\s(([a-zA-Z]+(\s|,|\))))+", tweet)
    if m:
        loc = re.sub("([^\w]|[\d_])+", " ",  m.group(0).lower()).strip()
        return loc
    else:
        return None

In [645]:
def buildLocationDict(data, labels):
    assert(len(data) == len(labels))
    locDict = {}
    for i in range(0, len(data)):
        locTag = getLocation(data[i])
        if locTag:
            locDict[locTag] = labels[i]
    return locDict           

In [646]:
def getUsers(tweet):
    return re.findall("(@[\w\d]+)", tweet)

In [647]:
def tweetAugment(data, labels):
    
    assert(len(data)==len(labels))
    
    augmentedDataDict = {"tweets": [], "labels": copy(labels)}
    
    for tweet in data:
        new = copy(tweet)
        
        location = getLocation(tweet)
        users = getUsers(tweet)
        
        for user in users:
            new += (" " + user)*4
        
        if location:
            new += (" " + location)*4
        
        augmentedDataDict["tweets"].append(new)

    return augmentedDataDict

In [648]:
def readyDataSet(filename, wlhSet=None, augment=False):
    dataDict = readData(filename)
    originalDict = deepcopy(dataDict)
    locDict = buildLocationDict(dataDict["tweets"], dataDict["labels"])
    
    if augment:
        dataDict = tweetAugment(dataDict["tweets"], dataDict["labels"])
    dataDict["tweets"] = preprocess(dataDict["tweets"])
    
    if not wlhSet:
        wlh = WLH(dataDict["tweets"], dataDict["labels"])
        wlhBestSet, indexes = featureSelect(wlh)
    else:
        wlhBestSet = wlhSet
        wlh = None

    for i in range(0, len(dataDict["tweets"])):
        tweet = dataDict["tweets"][i]
        new = []
        for term in tweet:
            if "".join(filter(str.isalnum, term.lower())) in wlhBestSet:
                new.append("".join(filter(str.isalnum, term.lower())))

        dataDict["tweets"][i] = " ".join(new)
        
    return dataDict, wlhBestSet, locDict, originalDict

# Our principle model: Naive Bayes and data enhancement

In [649]:
dataDict = readData('2019S1-proj2-data/train-raw.tsv')
augmentedDataDict = tweetAugment(dataDict["tweets"], dataDict["labels"])

In [650]:
devDataDict = readData('2019S1-proj2-data/dev-raw.tsv')
augmentedDevDataDict = tweetAugment(devDataDict["tweets"], devDataDict["labels"])

In [651]:
wlh, wlhCountRestricted = WLH([x.split(" ") for x in dataDict["tweets"]], dataDict["labels"])

In [652]:
topWlh = [x.lower() for (x,y) in wlhCountRestricted.items() if y > 2.75]

In [653]:
def augmentTestData(data, topWlh):
    new = []
    for i in range(0, len(data)):
        tweet = data[i]
        newTweet = copy(data[i])
        for j in topWlh:
            if j in tweet.lower():
                newTweet += (" " + j)
        new.append(newTweet)
    return new

In [654]:
geotaggerNBMultinomial5 = Pipeline([('count', CountVectorizer()), ('classifier', MultinomialNB()),])
geotaggerNBMultinomial5.fit(augmentTestData(augmentedDataDict["tweets"], topWlh), augmentedDataDict["labels"])

print("accuracy with dev set augmentation: {}".format(geotaggerNBMultinomial5.score(augmentedDevDataDict["tweets"], augmentedDevDataDict["labels"])))
print("accuracy without dev set augmentation: {}".format(geotaggerNBMultinomial5.score(devDataDict["tweets"], devDataDict["labels"])))

print("accuracy with new dev set augmentation: {}".format(geotaggerNBMultinomial5.score(augmentTestData(augmentedDevDataDict["tweets"], topWlh), devDataDict["labels"])))



accuracy with dev set augmentation: 0.34939436166791726
accuracy without dev set augmentation: 0.3484296280415907
accuracy with new dev set augmentation: 0.3624986600921857


In [636]:
predictions = geotaggerNBMultinomial5.predict(augmentTestData(augmentedTestDataDict["tweets"], topWlhComb))
output = pd.DataFrame({"Id": augmentedTestDataDict["Id"], "Class": predictions})
output.to_csv("primary-model.tsv",index=False)

### Error analysis

In [633]:
a = pd.Series(geotaggerNBMultinomial5.predict(augmentTestData(augmentedDevDataDict["tweets"], topWlh))).eq(pd.Series(devDataDict["labels"]))
[devDataDict["tweets"][i] for i in a.index[a==False].tolist()[0:100]]

['"@KellyFrye @girlscouts when life gives you cookies... \\ud83c\\udf6a\\ud83c\\udf6a\\ud83c\\udf6a\\ud83c\\udf6a"',
 '"@noviarezki naaaah itu dia hahaha"',
 '"#LouisWhyAreYouAnEGG  bro whyyy??? http://t.co/i4AhvJHPbj"',
 '"\\"Twt: 20 creepy kids\' drawings that will haunt your nightmares... http://t.co/4WUa8JHAdy http://t.co/CwXSFRtuDL\\""',
 '"Temp: 21.5\\u00b0C. Wind:11.5km/h. Pressure: 1019.7 hPa, Falling slowly. Humidity 55%. Rain Today 0.0mm. #HammondPark #Weather"',
 '"In situ... http://t.co/X96m46OuOL"',
 '"An out-of-sessions court hearing will be held very shortly in relation to stabbing murder of 17 year old Masa Vukotic.@TheTodayShow"',
 '"T-MINUS 24 HOURS #igsg #\\u8981graduate\\u54af @ Surfers Paradise, Gold Coast http://t.co/q1Hs8MOOc1"',
 '"S P O T L I G H T \\ud83d\\udd06 we feel it\'s a public service announcement to let you all know that the amazing\\u2026 https://t.co/i2PrPPqXXH"',
 '"@LTUcareers Trying to sign up for Career Ready Courses. I get redirected to LMS wit

### Kaggle submit code
Combine test and dev sets for training for optimal ranking

In [555]:
wlhComb, wlhCountRestrictedComb = WLH([x.split(" ") for x in (dataDict["tweets"]+devDataDict["tweets"])], dataDict["labels"]+devDataDict["labels"])


In [556]:
topWlhComb = [x.lower() for (x,y) in wlhCountRestrictedComb.items() if y > 3]

In [568]:
dataDictCombined = {"tweets": dataDict["tweets"]+devDataDict["tweets"], "labels": dataDict["labels"]+devDataDict["labels"]}
dataDictCombinedAugmented = tweetAugment(dataDictCombined["tweets"], dataDictCombined["labels"])
geotaggerNBMultinomial6 = Pipeline([("tfidf", TfidfVectorizer()), ('classifier', MultinomialNB()),])
geotaggerNBMultinomial6.fit(augmentTestData(dataDictCombinedAugmented["tweets"], topWlhComb), dataDictCombined["labels"])

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [569]:
testDataDict = readData('2019S1-proj2-data/test-raw.tsv')
augmentedTestDataDict = tweetAugment(testDataDict["tweets"], testDataDict["labels"])
augmentedTestDataDict["Id"] = testDataDict["Id"]

In [571]:
predictions = geotaggerNBMultinomial6.predict(augmentTestData(augmentedTestDataDict["tweets"], topWlhComb))
output = pd.DataFrame({"Id": augmentedTestDataDict["Id"], "Class": predictions})
output.to_csv("kagglePredictionsNew.csv",index=False)

# Testing for report

In [614]:
trainDataDict = readData('2019S1-proj2-data/train-raw.tsv')
devDataDict = readData('2019S1-proj2-data/dev-raw.tsv')

#### Baseline naive bayes

In [615]:
baselineClassifier = Pipeline([("count", CountVectorizer()), ('classifier', MultinomialNB()),])
baselineClassifier.fit(trainDataDict["tweets"], trainDataDict["labels"])
baselineClassifier.score(devDataDict["tweets"], devDataDict["labels"])

0.34229285025190265

#### Tfidf models

In [637]:
tfidfNB = Pipeline([("tfidf", TfidfVectorizer()), ('classifier', MultinomialNB()),])
tfidfNB.fit(trainDataDict["tweets"], trainDataDict["labels"])
tfidfNB.score(devDataDict["tweets"], devDataDict["labels"])
predictions = tfidfNB.predict(testDataDict["tweets"])
output = pd.DataFrame({"Id": augmentedTestDataDict["Id"], "Class": predictions})
output.to_csv("simple-model.tsv",index=False)

In [617]:
tfidfLinearSVC = Pipeline([("tfidf", TfidfVectorizer()), ('classifier', LinearSVC()),])
tfidfLinearSVC.fit(trainDataDict["tweets"], trainDataDict["labels"])
tfidfLinearSVC.score(devDataDict["tweets"], devDataDict["labels"])

0.3408189516561261

In [618]:
tfidfLogit = Pipeline([("tfidf", TfidfVectorizer()), ('classifier', LogisticRegression()),])
tfidfLogit.fit(trainDataDict["tweets"], trainDataDict["labels"])
tfidfLogit.score(devDataDict["tweets"], devDataDict["labels"])



0.33537892592989604

In [620]:
tfidfDT = Pipeline([("tfidf", TfidfVectorizer()), ('classifier', tree.DecisionTreeClassifier()),])
tfidfDT.fit(trainDataDict["tweets"], trainDataDict["labels"])
tfidfDT.score(devDataDict["tweets"], devDataDict["labels"])

0.31262729124236255

#### Better preprocessing techniques

In [623]:
preprocessLinearSVC = Pipeline([('count', CountVectorizer()), ('classifier', LinearSVC(max_iter=2000)),])
preprocessLinearSVC.fit(augmentTestData(augmentedDataDict["tweets"], topWlh), augmentedDataDict["labels"])

print("accuracy with new dev set augmentation: {}".format(preprocessLinearSVC.score(augmentTestData(augmentedDevDataDict["tweets"], topWlh), devDataDict["labels"])))

accuracy with new dev set augmentation: 0.33570050380533817


In [622]:
preprocessLogit = Pipeline([('count', CountVectorizer()), ('classifier', LogisticRegression()),])
preprocessLogit.fit(augmentTestData(augmentedDataDict["tweets"], topWlh), augmentedDataDict["labels"])

print("accuracy with new dev set augmentation: {}".format(preprocessLogit.score(augmentTestData(augmentedDevDataDict["tweets"], topWlh), devDataDict["labels"])))



accuracy with new dev set augmentation: 0.3534408832672312


In [624]:
preprocessDT = Pipeline([('count', CountVectorizer()), ('classifier', tree.DecisionTreeClassifier()),])
preprocessDT.fit(augmentTestData(augmentedDataDict["tweets"], topWlh), augmentedDataDict["labels"])

print("accuracy with new dev set augmentation: {}".format(preprocessDT.score(augmentTestData(augmentedDevDataDict["tweets"], topWlh), devDataDict["labels"])))

accuracy with new dev set augmentation: 0.31905884875120594


# Voting

In [672]:
def voting(dataDict):
    finalOutput = []
    
    svcPredict = geotaggerLinearSVC.predict(dataDict["tweets"])
    nbPredict = geotaggerNBMultinomial.predict(dataDict["tweets"])
    logitPredict = geotaggerLogit.predict(dataDict["tweets"])
    rfPredict = geotaggerRF.predict(dataDict["tweets"])
    dtPredict = geotaggerDT.predict(dataDict["tweets"])
    
    for i in range(0, len(dataDict["tweets"])):
        votes = Counter()
        
        votes[svcPredict[i]] += 1*0.75
        votes[nbPredict[i]] += 1
        votes[logitPredict[i]] += 1*0.55
        votes[rfPredict[i]] += 1*0.4
        votes[dtPredict[i]] += 1*0.3
        
        optimal = []
        maxVotes = 0
        
        for label in votes.keys():
            if votes[label] > maxVotes:
                maxVotes = votes[label]
                optimal = [label]
            elif votes[label] == maxVotes:
                optimal.append(label)
        
        if len(optimal) > 1:
            finalOutput.append(optimal[randint(0,len(optimal)-1)])
        else:
            finalOutput.append(optimal[0])

    return finalOutput

In [655]:
geotaggerNBMultinomial = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', MultinomialNB()),])
geotaggerNBMultinomial.fit(dataDict["tweets"], dataDict["labels"])

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [656]:
geotaggerLinearSVC = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', LinearSVC()),])
geotaggerLinearSVC.fit(dataDict["tweets"], dataDict["labels"])

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [657]:
geotaggerLogit = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', LogisticRegression()),])
geotaggerLogit.fit(dataDict["tweets"], dataDict["labels"])



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [660]:
geotaggerRF = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', RandomForestClassifier()),])
geotaggerRF.fit(dataDict["tweets"], dataDict["labels"])



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [668]:
geotaggerDT = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', tree.DecisionTreeClassifier()),])
geotaggerDT.fit(dataDict["tweets"], dataDict["labels"])

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [674]:
votePredictions = voting(devDataDict)
print("vote accuracy: {}".format(pd.Series(votePredictions).eq(pd.Series(dataDict["labels"])).sum()/len(votePredictions)))

vote accuracy: 0.2490352663736735


In [676]:
predictions = voting(testDataDict)
output = pd.DataFrame({"Id": augmentedTestDataDict["Id"], "Class": predictions})
output.to_csv("voting-model.tsv",index=False)

# Bonus Kaggle stuff

In [682]:
def WLHNew(data, labels):
    
    assert(len(data)==len(labels))
    
    termLocFrequencies = dd(Counter)
    totalTermsLoc = Counter()
    totalTermsAll = 0
    termCounts = Counter()
    vocabulary = set()
    
    for i in range(0, len(data)):
        terms = data[i]
        loc = labels[i]
        
        for term in terms:
            termLocFrequencies["".join(filter(str.isalnum,term))][loc] += 1
            totalTermsLoc[loc] += 1
            totalTermsAll += 1
            termCounts[term] += 1
            vocabulary.add(term.lower())
    
    wlh = {}
    wlhCountRestricted = {}
    
    for term in termLocFrequencies.keys():
        
        denom = sum(termLocFrequencies[term].values())/totalTermsAll
        num = max([(termLocFrequencies[term][city]/totalTermsLoc[city], city) if totalTermsLoc[city] != 0 else (0, city) for city in ("Melbourne", "Sydney", "Perth", "Brisbane")])
        
        wlh[term] = num[0]/denom
        if termCounts[term] > 4:
            wlhCountRestricted[term] = num[0]/denom
    
    return wlh, wlhCountRestricted, vocabulary

In [678]:
dataDict = readData('2019S1-proj2-data/train-raw.tsv')
augmentedDataDict = tweetAugment(dataDict["tweets"], dataDict["labels"])

In [679]:
devDataDict = readData('2019S1-proj2-data/dev-raw.tsv')
augmentedDevDataDict = tweetAugment(devDataDict["tweets"], devDataDict["labels"])

In [683]:
wlh, wlhCountRestricted, vocabulary = WLHNew([x.split(" ") for x in dataDict["tweets"]], dataDict["labels"])

In [685]:
topWlh = [x.lower() for (x,y) in wlhCountRestricted.items() if y > 2.75]

In [699]:
def augmentTestDataNew(data, topWlh, vocabulary, wlh):
    new = []
    wlhTerms = set(wlh.keys())
    for i in range(0, len(data)):
        tweet = data[i]
        newTweet = copy(data[i])
        for j in topWlh:
            if j in tweet.lower():
                newTweet += (" " + j)
            for term in newTweet.split(" "):
                if term in wlhTerms and wlh[term] > 2.5 and term in j:
                    newTweet += (" " + j)
        new.append(newTweet)
    return new

In [702]:
geotaggerNBMultinomial5 = Pipeline([('count', CountVectorizer()), ('classifier', MultinomialNB()),])
geotaggerNBMultinomial5.fit(augmentTestDataNew(augmentedDataDict["tweets"], topWlh, vocabulary, wlh), augmentedDataDict["labels"])

print("accuracy with dev set augmentation: {}".format(geotaggerNBMultinomial5.score(augmentedDevDataDict["tweets"], augmentedDevDataDict["labels"])))
print("accuracy without dev set augmentation: {}".format(geotaggerNBMultinomial5.score(devDataDict["tweets"], devDataDict["labels"])))

print("accuracy with new dev set augmentation: {}".format(geotaggerNBMultinomial5.score(augmentTestData(augmentedDevDataDict["tweets"], topWlh), devDataDict["labels"])))


accuracy with dev set augmentation: 0.34459749169257153
accuracy without dev set augmentation: 0.3444635009111373
accuracy with new dev set augmentation: 0.36014042233894306


In [700]:
print("accuracy with new new dev set augmentation: {}".format(geotaggerNBMultinomial5.score(augmentTestDataNew(augmentedDevDataDict["tweets"], topWlh, vocabulary, wlh), devDataDict["labels"])))


accuracy with new new dev set augmentation: 0.32211383856790654
