In [44]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [45]:
%matplotlib inline


In [46]:
def Open_File(file_name):
    '''Function takes one input argument, a JSON-file,corresponding to the saved file-name,for each park, 
    from the result of the request to Twitters API and converts the JSON-file to a dictionary.   
    IN: file_name(JSON-object).
    OUT: Tweet_Data (Dictionary), Dictionary of the JSON-file.'''
    
    with open(file_name) as file:
        Tweet_Data= json.load(file)

    return Tweet_Data

In [47]:
def ToDataframe_text(Tweet_Data):
    '''  Function takes a dictionary corresponding to the JSON-file for each park, Extracts the "Text" data and returns
    a Pandas DataFrame with one column,Tweet/Text-data, were each row represents one unique Tweet.
    IN: Tweet_Data(Dictionary).
    OUT: DataFrame, with one column, Tweet/text-data.'''
    
    Df_list=[]
    for item in Tweet_Data['statuses']:
        if item['truncated']==True:
            Df_list.append(item['extended_tweet']['full_text'])
        else:
            Df_list.append(item['text'])
    pd.set_option('display.max_colwidth',-1)
    Df= pd.DataFrame(data=Df_list,columns=['Tweet'])
    return Df

'uncomment below rows for quick visualization'

#Tweet_Data=Open_File('TwitterData_Json'+'/Hagaparken_Tweet15-19.json')
#df_text= ToDataframe_text(Tweet_Data)

'uncomment below rows for quick visualization'

In [48]:
#importing pythons regular expression library.
import re
def clean_text_round1(text):
    ''' Funtion takes one input argument,the tweet column in our Dataframe,and removes URL-references and special characters
    that doesnt influence the sentiment such att #,numbers and /(see below) using pythons regular expression library. 
    Function returns a cleaned DataFramed. 
    IN: DataFranme, Tweet(text)-data column.
    OUT: DataFrame, cleaned from URL-references and special characters.'''
    
    text=re.sub(r'((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)','',text)
    text=re.sub(r'@\S+ ', r'', text)
    text= re.sub(r'[@0-9.#/"?;():-]', '', text)
    return text


'uncomment below rows for quick visualization'
#round1=lambda tweet:clean_text_round1(tweet)
#df = pd.DataFrame(df_text['Tweet'].apply(round1))
#df.head(20)

'uncomment below rows for quick visualization'

In [49]:
import re
from nltk.corpus import stopwords
def clean_emoji(tweet):
    
    tweet=tweet.lower()
    tweet= re.sub(r'[^a-ö \s]','',tweet)
    tweet= re.sub(r'[\n]',' ',tweet)
     
    return tweet



In [50]:
# First filtering
from functools import reduce
# 'Stockholms',Kungsholmen,rålis,län,sthlm,mälarstrand

def remove(tweet):
    patternsToRemove = [
        r'[Ss]tockholm(s)?',
        r'[Kk]ungsholmen',
        r'[Rr]ålis?',
        r'[Ss]thlm',
        r'[Ll]än',
        r'[Mm]älarstrand',
        r'[Kk]om(mer)?'
    ]
    
    return reduce(lambda tweet, pattern: re.sub(pattern, '', tweet), patternsToRemove, tweet)
    
    
'uncomment below rows for quick visualization'  
#data_clean['Tweet']=data_clean['Tweet'].apply(lambda tweet: remove(tweet))
#data_clean.head(10)


'uncomment below rows for quick visualization'

In [51]:
# filtering common words
#data_clean=pd.read_pickle('filter_2Sk')

# filtering more words
filter_list=['hela','år','idag','norr','runt','mer','bakom','inför','få','ena','ska','fick','vill','lite','kör',
             'tack','går','vill']


def filter(tweet):
    tweet=tweet.lower()
    tweet_split= tweet.split(' ')
    
    for word in filter_list:
        if word in tweet_split:
            tweet_split.remove(word)

    
    if ' '.join(tweet_split):
        return ' '.join(tweet_split)
    
    else:
        return 'tom'
    
#data_clean['Tweet']= data_clean['Tweet'].apply(lambda tweet: filter(tweet))


In [52]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from gensim import matutils, models
import scipy.sparse
import logging

''' The function below runs all the previous steps and returns the variables "id2word","coprus" which are going to be 
input variables in the lda-model in next cell. The function also returns the variable "df_dtm" which represents the cleaned
dataframe.'''

def data_toDtm():

    # Reading Tweet-data
    Tweet_Data = Open_File('../TwitterData_Json/Rålambshovsparken_Tweet15-19.json')
    df_text = ToDataframe_text(Tweet_Data)
    
    # cleaning round 1
    round1 = lambda tweet:clean_text_round1(tweet)
    df = pd.DataFrame(df_text['Tweet'].apply(round1))

    # removing all characters that isnt a alphabetic letter.
    df['Tweet'] = df['Tweet'].apply(lambda tweet: clean_emoji(tweet))

    
    # Removing common expressed words.
    df['Tweet'] = df['Tweet'].apply(lambda tweet: remove(tweet))
    df['Tweet'] = df['Tweet'].apply(lambda tweet: filter(tweet))
 
    # Delete these row indexes from dataFrame
    del_rows= df[df['Tweet']=='tom'].index
    df.drop(del_rows , inplace=True)
    
    
    # In order to Implement the LDA-algoritm we Transform the DataFrame column Tweet-data to a Document Term Matrix
    cv = CountVectorizer(max_df=0.9,min_df=0.0025) 
    data_cv = cv.fit_transform(df['Tweet'])
    data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
    data_dtm.index = df.index

    # pickle the data
    pickle.dump(cv,open('cv.pkl_rå','wb'))

    #Required input for topic modeling is a term-document matrix
    #Which is the transpose of a DTM
    # columns=Tweets, rows= word 
    tdm=data_dtm.transpose()
    
    # putting the term-document matrix into a new gensim format, sparse matrix for efficency.
    sparse_counts = scipy.sparse.csr_matrix(tdm)
    corpus = matutils.Sparse2Corpus(sparse_counts)

    # Gensim also requires dictionary of all the terms and their respective location in the tdm
    cv = pickle.load(open('cv.pkl_rå','rb'))
    id2word = dict((v,k) for k, v in cv.vocabulary_.items())
    
    return id2word, corpus, df

# Topic Modeling

In [55]:
'Output from the function each lda-model takes the variables "id2word" and "corpus" as input.'
id2word,corpus,df_dtm = data_toDtm()
# 3 topics
#lda= models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3,passes=50)
#lda.print_topics()

  del sys.path[0]


## Identified Topics
### topic 1: Crime
### topic 2: Holidays celebration (National day),midsummer
### topic 3: Konserts

In [56]:
# 1. Wordcloud of Top N words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
import matplotlib.colors as mcolors
'''
stop_words= stopwords.words('swedish')
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=20,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda.show_topics(num_words=20,formatted=False)

fig, axes = plt.subplots(1, 3, figsize=(18,6), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()'''

#fig.savefig('Rålambshovsparken_wordcloud.pdf')

"\nstop_words= stopwords.words('swedish')\ncols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'\n\ncloud = WordCloud(stopwords=stop_words,\n                  background_color='white',\n                  width=2500,\n                  height=1800,\n                  max_words=20,\n                  colormap='tab10',\n                  color_func=lambda *args, **kwargs: cols[i],\n                  prefer_horizontal=1.0)\n\ntopics = lda.show_topics(num_words=20,formatted=False)\n\nfig, axes = plt.subplots(1, 3, figsize=(18,6), sharex=True, sharey=True)\n\nfor i, ax in enumerate(axes.flatten()):\n    fig.add_subplot(ax)\n    topic_words = dict(topics[i][1])\n    cloud.generate_from_frequencies(topic_words, max_font_size=300)\n    plt.gca().imshow(cloud)\n    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))\n    plt.gca().axis('off')\n\n\nplt.subplots_adjust(wspace=0, hspace=0)\nplt.axis('off')\nplt.margins(x=0, y=0)\nplt.tight

In [40]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=df_dtm['Tweet']):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda, corpus=corpus, texts=df_dtm['Tweet'])

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
#df_dominant_topic.head(10)

In [41]:

# Top 3 Keywords for each Topic
topic_top3words = [(i, topic) for i, topics in lda.show_topics(formatted=False) 
                                 for j, (topic, wt) in enumerate(topics) if j < 3]

df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
df_top3words = df_top3words_stacked.groupby('topic_id').agg(', \n'.join)
df_top3words.reset_index(level=0,inplace=True)


In [43]:
'''from matplotlib.ticker import FuncFormatter
sns.set(rc={'figure.figsize':(20,10)})
sns.set_palette('deep')
fig,ax_count= plt.subplots(figsize=(12,10))

#Countplot All Sentiment
ax_count=sns.countplot(x='Dominant_Topic',data=df_dominant_topic)
tick_formatter = FuncFormatter(lambda x, pos: 'Topic ' + str(x)+ '\n' + df_top3words.loc[df_top3words.topic_id==x, 'words'].values[0])
ax_count.xaxis.set_major_formatter(tick_formatter)

ax_count.set_xlabel('Topic',fontsize=18) 
ax_count.set_ylabel('Number of Tweets',fontsize=18)
ax_count.set_title('Number of Tweets by Dominant Topic',fontsize=20)

#fig.savefig('Rålambshovsparken_Dominanttopic.pdf')'''


"from matplotlib.ticker import FuncFormatter\nsns.set(rc={'figure.figsize':(20,10)})\nsns.set_palette('deep')\nfig,ax_count= plt.subplots(figsize=(12,10))\n\n#Countplot All Sentiment\nax_count=sns.countplot(x='Dominant_Topic',data=df_dominant_topic)\ntick_formatter = FuncFormatter(lambda x, pos: 'Topic ' + str(x)+ '\n' + df_top3words.loc[df_top3words.topic_id==x, 'words'].values[0])\nax_count.xaxis.set_major_formatter(tick_formatter)\n\nax_count.set_xlabel('Topic',fontsize=18) \nax_count.set_ylabel('Number of Tweets',fontsize=18)\nax_count.set_title('Number of Tweets by Dominant Topic',fontsize=20)\n\n#fig.savefig('Rålambshovsparken_Dominanttopic.pdf')"