In [10]:
import json
import csv
import nltk

# Lower-case two-letter abbreviations of the 50 US states, in alphabetical
# order. Unlike states_names / states_codes below, this list has no "dc" or
# "pr" entry.
states = (
    "ak al ar az ca co ct de fl ga hi ia id il "
    "in ks ky la ma md me mi mn mo ms mt nc nd ne nh "
    "nj nm nv ny oh ok or pa ri sc sd tn tx ut va vt "
    "wa wi wv wy"
).split()

# Upper-case postal abbreviation -> display name. Insertion order matters:
# the CSV writer iterates this dict to decide the row order of out.csv.
states_names = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "DC": "District of Columbia",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
    "PR": "Puerto Rico",
}

# Upper-case postal abbreviation -> numeric id written to the "id" column of
# out.csv. NOTE(review): the values look like FIPS state codes - confirm.
states_codes = {
    "AL": 1,  "AK": 2,  "AZ": 4,  "AR": 5,  "CA": 6,  "CO": 8,
    "CT": 9,  "DE": 10, "DC": 11, "FL": 12, "GA": 13, "HI": 15,
    "ID": 16, "IL": 17, "IN": 18, "IA": 19, "KS": 20, "KY": 21,
    "LA": 22, "ME": 23, "MD": 24, "MA": 25, "MI": 26, "MN": 27,
    "MS": 28, "MO": 29, "MT": 30, "NE": 31, "NV": 32, "NH": 33,
    "NJ": 34, "NM": 35, "NY": 36, "NC": 37, "ND": 38, "OH": 39,
    "OK": 40, "OR": 41, "PA": 42, "RI": 44, "SC": 45, "SD": 46,
    "TN": 47, "TX": 48, "UT": 49, "VT": 50, "VA": 51, "WA": 53,
    "WV": 54, "WI": 55, "WY": 56, "PR": 72,
}

# Sentiment lexicon: term -> integer score, loaded from AFINN-111.txt.
# Each non-blank line of that file is parsed as "<term>\t<score>".
afinnDict = {}

# The context manager closes the file handle as soon as loading finishes
# (the previous bare open() left it open for the life of the kernel).
with open("AFINN-111.txt") as afinnfile:
    for line in afinnfile:
        if not line.strip():
            # A blank or trailing empty line has no tab and would break the
            # two-value unpacking below.
            continue
        term, score = line.split("\t")
        afinnDict[term] = int(score)

def getState(data):
    """Return the lower-cased second part of a US tweet's place name.

    The tweet's ``place["full_name"]`` is lower-cased and split on ``", "``;
    the element after the first comma is returned (presumably a state
    abbreviation such as ``"tx"`` for ``"Austin, TX"`` - verify against the
    data).

    Args:
        data: Parsed tweet as a dict.

    Returns:
        The second comma-separated part of the place name, or ``None`` when
        the tweet has no place, the place is not in the US, or the name has
        no ``", "`` separator.
    """
    # .get() keeps records without a "place" key from raising KeyError.
    place = data.get("place")
    if place is None or place.get("country_code") != "US":
        return None

    parts = str(place.get("full_name")).lower().split(", ")
    if len(parts) > 1:
        return parts[1]
    return None

def isState(state):
    """Report whether ``state`` appears in the module-level ``states`` list.

    Returns ``True`` on a match and ``False`` otherwise (including for
    ``None``).
    """
    return state in states

def analyzeEmotions(tweet):
    """Score a tokenized tweet against the ``afinnDict`` lexicon.

    Each token is lower-cased, stripped of surrounding whitespace and has
    every ``.``, ``?`` and ``!`` removed before the lookup.

    Args:
        tweet: Iterable of string tokens.

    Returns:
        A ``(termsCount, score)`` tuple: how many tokens were found in the
        lexicon and the sum of their scores.
    """
    dropPunctuation = str.maketrans('', '', '.?!')
    hits = []

    for token in tweet:
        key = token.lower().strip().translate(dropPunctuation)
        if key in afinnDict:
            hits.append(afinnDict[key])

    return len(hits), sum(hits)

def analyzeUsa():
    """Score the tweets in ``inp.txt`` and write per-state means to ``out.csv``.

    Every non-empty line of ``inp.txt`` (UTF-16) is rewritten into JSON and
    parsed. A tweet is counted when it has ``created_at`` and ``text`` keys,
    ``getState`` yields a value accepted by ``isState``, and
    ``analyzeEmotions`` finds at least one lexicon term in its tokens.

    Side effects:
        * prints the ``{state: [scores]}`` dict;
        * writes ``out.csv`` with a ``state,id,meanSent`` header and one row
          per entry of ``states_names``; the mean is rounded to 2 decimals
          and is ``0`` for entries with no scored tweets.
    """
    file = "inp.txt"
    tweetsByStates = {}

    with open(file, "r", encoding='utf-16') as ins:
        for line in ins:
            # Rewrite the stored record into JSON text: trim wrapper
            # characters, turn single quotes into double quotes, drop the
            # b"..." wrapper around the object and un-double backslashes.
            # NOTE(review): presumably each line is the repr of a bytes
            # payload - confirm against whatever produced inp.txt.
            line = line.strip("'<>() ").replace('\'', '\"')
            line = line.replace("b\"{", "{")
            line = line.replace("}\"", "}")
            line = line.replace("\\\\", "\\")

            if len(line) <= 1:  # to avoid empty lines
                continue

            # `line` is already a decoded str; the `encoding` keyword of
            # json.loads() was removed in Python 3.9 and raises TypeError.
            tweet = json.loads(line)

            if "created_at" not in tweet or "text" not in tweet:
                continue

            state = getState(tweet)
            if not isState(state):
                continue

            tokenizedTweet = nltk.word_tokenize(tweet["text"])
            numberTerms, score = analyzeEmotions(tokenizedTweet)

            if numberTerms > 0:
                tweetsByStates.setdefault(state, []).append(score)

    print(tweetsByStates)

    with open('out.csv', mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["state", "id", "meanSent"])

        for state in states_names:
            # tweetsByStates is keyed by the lower-case abbreviation.
            scores = tweetsByStates.get(state.lower())
            mean = round(sum(scores) / len(scores), 2) if scores else 0
            writer.writerow([states_names[state], states_codes[state], mean])

In [11]:
# Run the US analysis: prints the per-state score lists and writes out.csv.
analyzeUsa()

{'ca': [2, 2, 3, 3], 'tx': [6, 7, -6, -3, -1, 2, 2, 3, 2, 4, -2], 'al': [1], 'oh': [-10, 2, 3, 1, 0, -3, -7], 'id': [2], 'ga': [-5, -4, -4, 2, 3], 'or': [2, -9, -4], 'fl': [1, -3, 3, 2, 1, 2, 3], 'ny': [3, -2, 3, -4, 1], 'mn': [1, -5, 1], 'pa': [3], 'nj': [2], 'mi': [-2], 'nd': [-3], 'ok': [4], 'md': [1], 'il': [1], 'tn': [-6], 'ma': [2], 'wa': [-8, 1], 'va': [-1, 4], 'in': [1], 'mo': [-1, -2], 'wv': [5], 'la': [1]}


In [12]:
import json
import csv

def isTweetInSapin(tweet):
    """Report whether a tweet's place has country code ``"ES"``.

    Args:
        tweet: Parsed tweet as a dict.

    Returns:
        ``True`` when ``tweet["place"]`` is present, not ``None`` and its
        ``country_code`` equals ``"ES"``; ``False`` otherwise.
    """
    # .get() keeps records without a "place" / "country_code" key from
    # raising KeyError; they are simply not in Spain.
    place = tweet.get("place")
    return place is not None and place.get("country_code") == "ES"
    
def analyzeSpain():
    """Group Spanish tweets in ``inpSpain.txt`` by language and export them.

    Every non-empty line of ``inpSpain.txt`` (UTF-16) is rewritten into JSON
    and parsed. For each tweet that has a ``created_at`` key and passes
    ``isTweetInSapin``, the first point of the place's bounding box is
    recorded under the tweet's ``lang`` value.

    Side effects:
        * prints the ``{lang: [point, ...]}`` dict;
        * writes ``outSpain.csv`` with a ``lang,latitude,longitude`` header
          and one row per recorded point.
    """
    file = "inpSpain.txt"
    tweetsInSpainByLanguage = {}

    with open(file, "r", encoding='utf-16') as ins:
        for line in ins:
            # Rewrite the stored record into JSON text: trim wrapper
            # characters, turn single quotes into double quotes, drop the
            # b"..." wrapper around the object and un-double backslashes.
            line = line.strip("'<>() ").replace('\'', '\"')
            line = line.replace("b\"{", "{")
            line = line.replace("}\"", "}")
            line = line.replace("\\\\", "\\")

            if len(line) <= 1:  # to avoid empty lines
                continue

            # `line` is already a decoded str; the `encoding` keyword of
            # json.loads() was removed in Python 3.9 and raises TypeError.
            tweet = json.loads(line)

            if "created_at" not in tweet or not isTweetInSapin(tweet):
                continue

            # First point of the first ring of the place's bounding box.
            coordinates = tweet["place"]["bounding_box"]["coordinates"][0][0]
            # .get() so a tweet without "lang" is skipped by the None check
            # below instead of raising KeyError.
            lang = tweet.get("lang")

            if coordinates is not None and lang is not None:
                tweetsInSpainByLanguage.setdefault(lang, []).append(coordinates)

    print(tweetsInSpainByLanguage)

    with open('outSpain.csv', mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["lang", "latitude", "longitude"])

        for lang, points in tweetsInSpainByLanguage.items():
            for coord in points:
                # Element 1 goes to the latitude column, element 0 to the
                # longitude column.
                writer.writerow([lang, coord[1], coord[0]])

In [13]:
# Run the Spain analysis: prints the per-language points and writes outSpain.csv.
analyzeSpain()

{'und': [[2.563782, 39.12507], [-7.005353, 37.107832], [-7.005353, 37.107832]], 'it': [[4.202715, 39.799157]], 'es': [[1.943749, 41.51714], [-6.0201, 43.278867], [-6.306111, 36.516968], [-8.471284, 43.301377], [-15.525504, 28.024813], [-3.889005, 40.312071], [-3.810999, 37.137275], [-3.037149, 43.319954], [2.086323, 41.336062], [-6.02843, 37.313613], [-3.889005, 40.312071], [-6.06368, 37.1481]], 'en': [[0.295386, 40.274335], [2.052477, 41.317048]], 'ca': [[2.446113, 42.150116]], 'tl': [[2.211092, 41.42594]]}


In [6]:
def cleanNotUsefulTweets():
    """Rewrite ``inp.txt`` in place, keeping only the scorable US tweets.

    A line is kept (verbatim, as originally read) when its parsed tweet has
    ``created_at`` and ``text`` keys, ``getState`` yields a value accepted by
    ``isState``, and ``analyzeEmotions`` finds at least one lexicon term.

    WARNING: this overwrites ``inp.txt``; all other lines are discarded. The
    file is only reopened for writing after the whole input has been read,
    so an exception while parsing leaves the original file untouched.
    """
    file = "inp.txt"
    linesToKeep = []

    with open(file, "r", encoding='utf-16') as ins:
        for line in ins:
            # Work on a cleaned copy so the original line can be written
            # back unchanged.
            cleanline = line.strip("'<>() ").replace('\'', '\"')
            cleanline = cleanline.replace("b\"{", "{")
            cleanline = cleanline.replace("}\"", "}")
            cleanline = cleanline.replace("\\\\", "\\")

            if len(cleanline) <= 1:  # skip empty lines
                continue

            # `cleanline` is already a decoded str; the `encoding` keyword
            # of json.loads() was removed in Python 3.9 and raises TypeError.
            tweet = json.loads(cleanline)

            if "created_at" not in tweet or "text" not in tweet:
                continue

            if not isState(getState(tweet)):
                continue

            tokenizedTweet = nltk.word_tokenize(tweet["text"])
            numberTerms, _ = analyzeEmotions(tokenizedTweet)

            if numberTerms > 0:
                linesToKeep.append(line)

    with open(file, "w", encoding='utf-16') as out:
        out.writelines(linesToKeep)

In [7]:
# Destructive: overwrites inp.txt, keeping only the lines that pass the filter.
cleanNotUsefulTweets()

In [8]:
def cleanNotUsefulTweetsSpain():
    """Rewrite ``inpSpain.txt`` in place, keeping only usable Spanish tweets.

    A line is kept (verbatim, as originally read) when its parsed tweet has
    a ``created_at`` key, passes ``isTweetInSapin``, and has both a first
    bounding-box point and a ``lang`` value that are not ``None``.

    WARNING: this overwrites ``inpSpain.txt``; all other lines are
    discarded. The file is only reopened for writing after the whole input
    has been read, so an exception while parsing leaves the original file
    untouched.
    """
    file = "inpSpain.txt"
    linesToKeep = []

    with open(file, "r", encoding='utf-16') as ins:
        for line in ins:
            # Work on a cleaned copy so the original line can be written
            # back unchanged.
            cleanline = line.strip("'<>() ").replace('\'', '\"')
            cleanline = cleanline.replace("b\"{", "{")
            cleanline = cleanline.replace("}\"", "}")
            cleanline = cleanline.replace("\\\\", "\\")

            if len(cleanline) <= 1:  # skip empty lines
                continue

            # `cleanline` is already a decoded str; the `encoding` keyword
            # of json.loads() was removed in Python 3.9 and raises TypeError.
            tweet = json.loads(cleanline)

            if "created_at" not in tweet or not isTweetInSapin(tweet):
                continue

            # First point of the first ring of the place's bounding box.
            coordinates = tweet["place"]["bounding_box"]["coordinates"][0][0]
            # .get() so a tweet without "lang" is dropped by the None check
            # below instead of raising KeyError.
            lang = tweet.get("lang")

            if coordinates is not None and lang is not None:
                linesToKeep.append(line)

    with open(file, "w", encoding='utf-16') as out:
        out.writelines(linesToKeep)

In [9]:
# Destructive: overwrites inpSpain.txt, keeping only the lines that pass the filter.
cleanNotUsefulTweetsSpain()