In [1]:
import csv
import collections
import itertools
import numpy as np
import pandas as pd
import string
import re
import os
from sklearn.model_selection import train_test_split  

In [3]:
# Names of every entry in Stats/csv; each one is read below as a per-event CSV.
allevents = os.listdir('Stats/csv')

In [18]:
# For each per-event stats file, print the class row whose negative and
# positive counts are closest to each other (smallest absolute difference).
for eachevent in allevents:
    print("Reading:", eachevent)
    df = pd.read_csv(os.path.join('Stats/csv', eachevent))
    df['difference'] = abs(df['Negative(0) class'] - df['Positive(1) class'])
    df['total'] = abs(df['Negative(0) class'] + df['Positive(1) class'])
    # idxmin() returns an index *label*, so the row must be fetched with .loc;
    # .iloc only agrees with it while the index is the default 0..n-1 range.
    print(df.loc[df['difference'].idxmin()])
    print("------------------------------------------------------------------")
    

Reading: chileEarthquake2014.csv
Class Name           ThirdPartyObservation
Negative(0) class                      157
Positive(1) class                      154
difference                               3
total                                  311
Name: 1, dtype: object
------------------------------------------------------------------
Reading: typhoonYolanda2013.csv
Class Name           Hashtags
Negative(0) class         261
Positive(1) class         303
difference                 42
total                     564
Name: 13, dtype: object
------------------------------------------------------------------
Reading: manilaFloods2013.csv
Class Name           Irrelevant
Negative(0) class           260
Positive(1) class           151
difference                  109
total                       411
Name: 6, dtype: object
------------------------------------------------------------------
Reading: albertaFloods2013.csv
Class Name           ContinuingNews
Negative(0) class               369
Positi

### Unpack GloVe

In [2]:
# Path to the GloVe Twitter embeddings text file (the name indicates 100-d vectors).
filename = 'glove.twitter.27B/glove.twitter.27B.100d.txt'

In [3]:
def loadGloveModel(gloveFile):
    """Load word embeddings from a whitespace-separated text file.

    Every non-blank line is split on whitespace: the first field is the
    token and the remaining fields are parsed as the vector components.

    Parameters
    ----------
    gloveFile : str
        Path to the embeddings text file. It is decoded as UTF-8.

    Returns
    -------
    dict
        Maps each token to a 1-D numpy array of floats. If a token occurs
        on several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager guarantees the handle is closed even if parsing fails;
    # the explicit encoding keeps the result independent of the platform default.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. trailing newline at end of file): nothing to load.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

In [4]:
# Load the embeddings once; getwordvec() reads this dict as a global.
twittervocab = loadGloveModel(filename)

Loading Glove Model
Done. 1193515  words loaded!


In [5]:
# Seed NumPy's global RNG so the vector drawn on the next line is the same on every run.
# NOTE(review): this also fixes the state that any later unseeded NumPy-based
# randomness starts from, so moving this cell changes downstream results.
np.random.seed(120)
# Single 100-element random vector; getwordvec() substitutes it for any word
# that is missing from twittervocab.
randvec = np.random.rand(100)

## Get features

In [6]:
def ptext(text):
    """Lower-case ``text`` and return its runs of ASCII letters/apostrophes, in order."""
    lowered = text.lower()
    token_pattern = "[a-zA-Z']+"
    return re.findall(token_pattern, lowered)

def getwordvec(dataset, maximum_len, vocab=None, unknown=None):
    """Turn tokenised sentences into a zero-padded 3-D array of word vectors.

    Parameters
    ----------
    dataset : sequence of sequences of str
        One token list per sentence.
    maximum_len : int
        Number of token slots per sentence. Shorter sentences are padded
        with zero vectors; longer ones are truncated so the result is
        always rectangular.
    vocab : mapping, optional
        Token -> vector lookup. Defaults to the notebook-level
        ``twittervocab``.
    unknown : array-like, optional
        Vector used for tokens missing from ``vocab``. Defaults to the
        notebook-level ``randvec``. Its length sets the embedding size.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(dataset), maximum_len, len(unknown))``.
    """
    if vocab is None:
        vocab = twittervocab
    if unknown is None:
        unknown = randvec
    # Take the embedding size from the fallback vector instead of hard-coding 100.
    dim = len(unknown)
    # Pre-filling with zeros provides the padding and keeps the output 3-D
    # even when ``dataset`` is empty.
    data = np.zeros((len(dataset), maximum_len, dim))
    for row, eachsent in enumerate(dataset):
        for col, word in enumerate(eachsent[:maximum_len]):
            data[row, col] = vocab[word] if word in vocab else unknown
    return data

def _write_libsvm(path, labels, features):
    """Write one ``label idx:value idx:value ...`` line per sample to ``path``."""
    with open(path, 'w') as fp:
        for left, right in zip(labels, features):
            # Feature indices are 1-based; every value is written, zeros included.
            new_right = ' '.join([str(index)+':'+str(value) for index, value in enumerate(right, start=1)])
            # The 0 class is written as -1; any other label is written unchanged.
            left = -1 if left == 0 else left
            fp.write("%s %s\n" % (left, new_right))


def process(filename, random_state=None):
    """Build per-label train/test feature files for one CSV in ``Features/``.

    The ``cleanedtweets`` column is tokenised with ``ptext``. Every column
    other than the fixed metadata columns is treated as a label. For each
    label the tweets are split 80/20, embedded with ``getwordvec``,
    flattened to one row per tweet, and written to
    ``datasets/<folder>/<label>_{train,test}_<maximum_len>00.txt``.

    Parameters
    ----------
    filename : str
        File name inside ``Features/``. The part before the first ``.``
        becomes the output folder name.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. ``None`` (the default) keeps
        the original unseeded behaviour.

    Raises
    ------
    ValueError
        If the file contains no tweets (``max`` of an empty list).
    """
    rule = "--------------------------------------------------------------"
    # errors='ignore' lets files saved without the pandas index column load too.
    df = pd.read_csv('Features/'+filename).drop('Unnamed: 0', axis=1, errors='ignore')
    common = ['tweetids', 'topics', 'alltopics', 'categories','cleanedtweets']
    columns = [name for name in df.columns if name not in common]
    tweetdata = [ptext(eachtweet) for eachtweet in df['cleanedtweets'].tolist()]
    folder = filename.split('.')[0]
    print("Reading each labels from file: ", filename)
    print(rule)
    # makedirs(exist_ok=True) creates 'datasets' when missing and does not
    # fail when the notebook is re-run over an existing output folder.
    os.makedirs(os.path.join('datasets', folder), exist_ok=True)

    maximum_len = max([len(i) for i in tweetdata])
    print("Maximum length: ", maximum_len)
    print(rule)
    # Kept exactly as before (maximum_len followed by the literal '00') so
    # existing consumers of these file names keep working.
    suffix = str(maximum_len)+'00.txt'
    for each_label in columns:
        labels = df[each_label]
        X_train, X_test, y_train, y_test = train_test_split(
            tweetdata, labels, test_size = 0.20, random_state=random_state)
        # Train and test go through the same embed -> flatten -> write steps.
        for split_name, sentences, targets in (('train', X_train, y_train),
                                               ('test', X_test, y_test)):
            vectors = getwordvec(sentences, maximum_len)
            flat = np.reshape(vectors, (-1, vectors.shape[1]*vectors.shape[2]))
            _write_libsvm('datasets/'+folder+'/'+each_label+'_'+split_name+'_'+suffix,
                          targets, flat)
        print("Finished processing label: ", each_label)
        print(rule)

In [7]:
# Every entry in Features/ is handed to process() below.
files = os.listdir('Features/')

In [8]:
# Run process() on each listed feature file, printing a separator after each one.
for file in files: 
    process(file)
    print("********************************************************************")

Reading each labels from file:  Shooting_features.csv
--------------------------------------------------------------
Maximum length:  29
--------------------------------------------------------------
Finished processing label:  MultimediaShare
--------------------------------------------------------------
Finished processing label:  Hashtags
--------------------------------------------------------------
Finished processing label:  PastNews
--------------------------------------------------------------
Finished processing label:  Sentiment
--------------------------------------------------------------
Finished processing label:  Discussion
--------------------------------------------------------------
Finished processing label:  KnownAlready
--------------------------------------------------------------
Finished processing label:  Factoid
--------------------------------------------------------------
Finished processing label:  EmergingThreats
-------------------------------------------

Finished processing label:  FirstPartyObservation
--------------------------------------------------------------
Finished processing label:  ContinuingNews
--------------------------------------------------------------
Finished processing label:  EmergingThreats
--------------------------------------------------------------
Finished processing label:  Official
--------------------------------------------------------------
Finished processing label:  Hashtags
--------------------------------------------------------------
Finished processing label:  KnownAlready
--------------------------------------------------------------
Finished processing label:  ServiceAvailable
--------------------------------------------------------------
Finished processing label:  Irrelevant
--------------------------------------------------------------
Finished processing label:  MultimediaShare
--------------------------------------------------------------
Finished processing label:  Discussion
--------------

Finished processing label:  KnownAlready
--------------------------------------------------------------
Finished processing label:  EmergingThreats
--------------------------------------------------------------
Finished processing label:  MultimediaShare
--------------------------------------------------------------
Finished processing label:  ServiceAvailable
--------------------------------------------------------------
Finished processing label:  PastNews
--------------------------------------------------------------
Finished processing label:  Sentiment
--------------------------------------------------------------
Finished processing label:  Donations
--------------------------------------------------------------
Finished processing label:  Volunteer
--------------------------------------------------------------
Finished processing label:  Discussion
--------------------------------------------------------------
Finished processing label:  Weather
---------------------------------

Finished processing label:  EmergingThreats
--------------------------------------------------------------
Finished processing label:  InformationWanted
--------------------------------------------------------------
Finished processing label:  ServiceAvailable
--------------------------------------------------------------
Finished processing label:  Donations
--------------------------------------------------------------
Finished processing label:  CleanUp
--------------------------------------------------------------
Finished processing label:  Volunteer
--------------------------------------------------------------
Finished processing label:  GoodsServices
--------------------------------------------------------------
Finished processing label:  PastNews
--------------------------------------------------------------
Finished processing label:  SearchAndRescue
--------------------------------------------------------------
Finished processing label:  Unknown
---------------------------

Finished processing label:  Donations
--------------------------------------------------------------
Finished processing label:  MultimediaShare
--------------------------------------------------------------
Finished processing label:  Volunteer
--------------------------------------------------------------
Finished processing label:  GoodsServices
--------------------------------------------------------------
Finished processing label:  MovePeople
--------------------------------------------------------------
Finished processing label:  Official
--------------------------------------------------------------
Finished processing label:  ServiceAvailable
--------------------------------------------------------------
Finished processing label:  CleanUp
--------------------------------------------------------------
Finished processing label:  InformationWanted
--------------------------------------------------------------
Finished processing label:  SignificantEventChange
-----------------

### Separating events

In [175]:
# Combined feature files (inside Features/) that are split by topic below.
featuredata_name = ['Earthquake_features.csv', 'typhoon_features.csv', 'Floods_features.csv']

In [177]:
# Split each combined feature file into one CSV per distinct value of 'topics'.
for f in featuredata_name:
    df = pd.read_csv('Features/'+f).drop('Unnamed: 0', axis=1)
    # Sorted unique values give the same order on every run (set() order is
    # arbitrary); dropna() avoids building a file name from a missing topic.
    topics = sorted(df['topics'].dropna().unique())
    for topic in topics:
        print("Reading topic: ", topic)
        # Written with the default index column, and into the same Features/ folder.
        df[df['topics'] == topic].to_csv('Features/'+topic+'.csv')
    print("-----------------------------------------------------------------------")

Reading topic:  guatemalaEarthquake2012
Reading topic:  chileEarthquake2014
Reading topic:  nepalEarthquake2015
Reading topic:  italyEarthquakes2012
-----------------------------------------------------------------------
Reading topic:  typhoonYolanda2013
Reading topic:  typhoonHagupit2014
-----------------------------------------------------------------------
Reading topic:  queenslandFloods2013
Reading topic:  philipinnesFloods2012
Reading topic:  albertaFloods2013
Reading topic:  manilaFloods2013
-----------------------------------------------------------------------


In [167]:
# Distinct 'topics' values of whichever frame `df` currently holds; set() makes the order arbitrary.
topics = list(set(df['topics']))

#### LIBSVM

In [51]:
from collections import Counter

In [87]:
# Read the whole file and keep one string per line (line terminators removed).
with open('GADGET/GADGET/data/mnist/mnist.scale.t','r') as fp:
    newcontent = fp.read().splitlines()

In [88]:
# Keep only the lines whose label (the first whitespace-separated field) is below 2.
# This previously read from `content`, a name no visible cell defines, so it
# only worked with leftover kernel state; it now filters the list read above.
# Blank lines are skipped instead of raising.
newcontent = [line for line in newcontent if line.strip() and int(line.split(None, 1)[0]) < 2]

In [89]:
# Number of lines currently held in newcontent.
len(newcontent)

2115

In [82]:
# Binarise the labels: a leading digit below 5 becomes -1, anything else becomes 1.
# Only the first character is inspected and replaced, so this assumes
# single-digit labels and cannot be re-run on its own output.
newcontent = [('-1' if int(each[0]) < 5 else '1') + each[1:] for each in newcontent]

In [71]:
# Write every entry of newcontent to testdata.txt, one per line.
with open('GADGET/GADGET/data/mnist/testdata.txt','w') as fp:
    for each in newcontent:
        print(each, file=fp)

In [72]:
# Tally the first whitespace-separated field (the label) of every line.
Counter([each.split()[0] for each in newcontent])

Counter({'1': 4861, '-1': 5139})

In [84]:
# Spot check of the element type held in newcontent.
type(newcontent[10])

str

In [90]:
# Preview the first ten lines.
newcontent[:10]

['1 129:0.14902 130:0.996078 131:0.427451 157:0.341176 158:0.988235 159:0.321569 185:0.529412 186:0.945098 212:0.176471 213:0.956863 214:0.588235 240:0.329412 241:0.996078 242:0.247059 268:0.792157 269:0.87451 270:0.0431373 295:0.12549 296:0.996078 297:0.847059 323:0.372549 324:0.996078 325:0.764706 351:0.54902 352:0.996078 353:0.301961 378:0.223529 379:0.929412 380:0.803922 381:0.0313725 406:0.486275 407:1 408:0.647059 434:0.670588 435:0.996078 436:0.317647 461:0.0941176 462:0.909804 463:0.843137 489:0.470588 490:0.996078 491:0.623529 517:0.592157 518:0.996078 519:0.556863 545:0.894118 546:0.996078 547:0.258824 572:0.239216 573:0.984314 574:0.996078 575:0.258824 600:0.552941 601:0.996078 602:0.803922 603:0.0117647 627:0.0392157 628:0.843137 629:0.996078 630:0.47451 655:0.0196078 656:0.776471 657:0.690196 658:0.0392157 ',
 '0 125:0.0431373 126:0.588235 127:0.992157 128:0.792157 129:0.121569 153:0.145098 154:0.984314 155:0.984314 156:0.992157 157:0.419608 180:0.0823529 181:0.772549 182: