Team: Tina Peng, Vanessa Zheng 

In [1]:
from plotly.offline import init_notebook_mode, iplot
from IPython.display import display, HTML
import pandas as pd
init_notebook_mode(connected=True)
import pickle
from itertools import product

# Load Data

In [2]:
topic = pd.read_csv('topics.csv', encoding='latin')

In [3]:
member = pd.read_csv('members.csv', encoding='latin')

In [4]:
member = member[['member_id', 'country','joined', 'group_id']]  # subset for memory issue

In [5]:
groups = pd.read_csv('groups.csv', encoding='latin')

In [6]:
groups = groups[['group_id', 'city_id', 'city', 'country', 'created']]

In [7]:
groups_topic = pd.read_csv('groups_topics.csv', encoding='latin')

In [8]:
len(topic.topic_id.unique())

2509

# Prepare data for {topic: groups}

In [9]:
data = pd.merge(groups_topic, groups, on='group_id')

In [10]:
data.head()

Unnamed: 0,topic_id,topic_key,topic_name,group_id,city_id,city,country,created
0,83,sportsfans,Sports Fan,241031,10001,New York,US,2006-05-04 20:06:50
1,10581,social,Social,241031,10001,New York,US,2006-05-04 20:06:50
2,25356,baseball-at-a-sports-bar,Baseball at a sports bar,241031,10001,New York,US,2006-05-04 20:06:50
3,83,sportsfans,Sports Fan,289172,60601,Chicago,US,2006-07-22 23:26:21
4,10451,sports,Sports and Recreation,289172,60601,Chicago,US,2006-07-22 23:26:21


In [11]:
data['created'] = pd.to_datetime(data['created'])  # convert column type for time operation 

In [12]:
data['year'] = data['created'].apply(lambda x: x.year)

get cumulative sum for each year and topic id:

In [13]:
group_count = data.groupby(['topic_id','year']).agg({'group_id':'count'}).reset_index()

group_count.columns = ['topic_id', 'year', 'number_of_join']

topic_group_count = group_count.groupby(by=['topic_id','year']).sum().groupby(level=[0]).cumsum().reset_index()

In [14]:
# pickle.dump(topic_group_count, open('meetups-data-from-meetupcom/data/topic_group_count.pkl', 'wb'))

In [15]:
topic_group_year = topic_group_count.pivot(index='year', columns='topic_id')  
group_count = topic_group_year.stack().reset_index()  # reshape for graphing

In [16]:
group_count[group_count.topic_id==182]

Unnamed: 0,year,topic_id,number_of_join
0,2002,182,1.0
63,2003,182,3.0
167,2005,182,4.0
531,2008,182,6.0
695,2009,182,7.0
904,2010,182,12.0
1133,2011,182,13.0
1394,2012,182,21.0
1695,2013,182,30.0
2047,2014,182,40.0


In [17]:
# pickle.dump(topic_group_year, open('meetups-data-from-meetupcom/data/topic_group_year.pkl', 'wb'))

# Prepare data for {topic: members}

In [18]:
member['joined'] = pd.to_datetime(member['joined'])

In [19]:
member['year'] = member['joined'].apply(lambda x: x.year)

In [20]:
member.head()

Unnamed: 0,member_id,country,joined,group_id,year
0,3,us,2007-05-01 22:04:37,490552,2007
1,3,us,2011-01-23 14:13:17,1474611,2011
2,3,us,2010-12-30 18:47:34,1490492,2010
3,3,us,2011-01-03 14:45:21,1515830,2011
4,3,us,2010-12-30 18:34:50,1574965,2010


Calculate the number of member for each group, then calculate the sum for each topic id:

In [21]:
member_count = member.groupby(['group_id', 'year']).size().reset_index()

In [22]:
member_count.columns = ['group_id', 'year','member_number']

In [23]:
member_group_count = member_count.groupby(by=['group_id','year']).sum().groupby(level=[0]).cumsum().reset_index()

In [24]:
member_count= pd.merge(data, member_group_count, on=['year', 'group_id'])

In [25]:
members = member_count[['topic_id', 'year','member_number']].groupby(['topic_id', 'year']).sum().groupby(level=[0]).cumsum().reset_index()

In [26]:
# pickle.dump(members, open('meetups-data-from-meetupcom/data/members_join_count.pkl', 'wb'))

In [27]:
member_year = members.pivot(index='year', columns='topic_id').fillna(method='ffill')

In [28]:
member_year = member_year.fillna(0)

In [29]:
member_year = member_year.stack().reset_index()

In [30]:
member_year[member_year.topic_id==184]

Unnamed: 0,year,topic_id,member_number
4,2002,184,8.0
860,2003,184,8.0
1716,2004,184,8.0
2572,2005,184,14.0
3428,2006,184,14.0
4284,2007,184,143.0
5140,2008,184,143.0
5996,2009,184,143.0
6852,2010,184,539.0
7708,2011,184,621.0


In [31]:
# pickle.dump(member_year, open('meetups-data-from-meetupcom/data/topic_member_year_filled.pkl', 'wb'))

# Prepare events data:

In [32]:
event = pd.read_csv('events.csv')
groups_topic = pd.read_csv('groups_topics.csv',encoding = "latin")

Join event and group topic on group id:

In [33]:
merged = pd.merge(event,groups_topic,how='left',on='group_id')

merged['created']=pd.to_datetime(merged['created'])

merged['created.yr']=merged['created'].dt.year

Get event_count for every topic_id in every year and cross join all year:

In [34]:
event_count = pd.DataFrame(pd.DataFrame(merged.groupby(['created.yr', 'topic_id'])['event_id'].count()).to_records())

In [35]:
l1 = event_count['created.yr'].unique()

In [36]:
l2 = event_count['topic_id'].unique()

In [37]:
crossjoin = pd.DataFrame(list(product(l1, l2)), columns=['created.yr', 'topic_id'])

In [38]:
crossjoin_event_count = pd.merge(crossjoin, event_count, how='left', on=['created.yr', 'topic_id'])
crossjoin_event_count = crossjoin_event_count.fillna(0)

In [39]:
crossjoin_event_count.head()

Unnamed: 0,created.yr,topic_id,event_id
0,2010,15405.0,12.0
1,2010,15720.0,12.0
2,2010,20301.0,12.0
3,2010,15401.0,0.0
4,2010,22203.0,0.0


In [40]:
events_year = pd.DataFrame(crossjoin_event_count.groupby(['created.yr','topic_id']).sum().groupby(level=[1]).cumsum().to_records())

In [41]:
events_year['topic_id'] = events_year['topic_id'].astype(str)
events_year['topic_id'] = events_year['topic_id'].str.replace('.0','')

In [42]:
events_year.columns = ['year', 'topic_id', 'event_count']
events_year['year'] = events_year.year.astype(int)
events_year['topic_id'] = events_year.topic_id.astype(int)

Merge everything and get final dataset:

In [43]:
final = pd.merge(events_year, group_count, on=['topic_id', 'year'])

In [45]:
topic = pd.read_csv('topics.csv', encoding='latin')
final = pd.merge(final, topic[['topic_id', 'main_topic_id']], on='topic_id')

In [47]:
final = pd.merge(final, member_year, on=['topic_id', 'year'])

In [48]:
final.head()

Unnamed: 0,year,topic_id,event_count,number_of_join,main_topic_id,member_number
0,2010,183,0.0,23.0,10454,596.0
1,2011,183,0.0,24.0,10454,596.0
2,2012,183,0.0,34.0,10454,1852.0
3,2013,183,0.0,40.0,10454,2310.0
4,2014,183,0.0,49.0,10454,2760.0


In [49]:
dataset = final.copy(deep=True)

In [50]:
topic = topic[['topic_id', 'topic_name']]

dataset = pd.merge(dataset, topic, on=['topic_id'])

In [51]:
dataset.head()

Unnamed: 0,year,topic_id,event_count,number_of_join,main_topic_id,member_number,topic_name
0,2010,183,0.0,23.0,10454,596.0,Spanish Language
1,2011,183,0.0,24.0,10454,596.0,Spanish Language
2,2012,183,0.0,34.0,10454,1852.0,Spanish Language
3,2013,183,0.0,40.0,10454,2310.0,Spanish Language
4,2014,183,0.0,49.0,10454,2760.0,Spanish Language


In [52]:
for i in dataset.main_topic_id.unique():
    print(i)
    print(dataset[dataset.main_topic_id==i]['topic_name'].unique())

10454
['Spanish Language' 'French Language' 'Italian Language'
 'Cultural Diversity' 'Cultural Activities' 'Japanese Language'
 'English Language' 'Mandarin Language' 'Culture Exchange' 'Latino Culture'
 'Chinese language' 'European Culture']
223
['Photography' 'Model Photography' 'Fashion Photography'
 'Photography Business']
15992
['Game Development' 'Games' 'Game Design' 'Improv games'
 'Storytelling Games']
19227
['Wine' 'Food and Drink']
10581
['Corporate Social Responsibility' 'Social Networking' 'Social Enterprise'
 'Social Media Marketing' "20's Social" 'Social Justice'
 'Social Entrepreneurship' 'Social, Social, Social' 'Social Marketing']
1924
['Fashion Industry' 'Fashion and Style' 'Restaurant Industry'
 'Fashion Entrepreneurs' 'Fashion Design' 'Clothing and Fashion Designers'
 'Theater Industry']
15018
['Musicals' 'Music Production']
8476
['Education' 'Financial Education' 'Real Estate Investment Education'
 'Higher Education' 'Business Education' 'Trader Education'
 'Inves

In [53]:
def main_topic(x):
    if x == 10454:
        return 'Language & Ethnic Identity'
    elif x == 223:
        return 'Photography'
    elif x == 1502:
        return 'Art & Culture'
    elif x == 1924:
        return 'Fashion & Beauty'
    elif x == 15992:
        return 'Games'
    elif x == 19227:
        return 'Food & Drink'
    elif x == 10581:
        return 'Socializing'
    elif x == 8476:
        return 'Education & Learning'
    elif x == 9696:
        return 'Tech'
    elif x == 15321:
        return 'Literature'
    elif x == 16728:
        return 'Writing'
    elif x == 10099:
        return 'Hobbies & Crafts'
    elif x == 491:
        return 'Dancing'
    elif x == 10050:
        return 'LGBT'
    elif x == 15236:
        return 'Socializing'
    elif x == 1201:
        return 'Book Club'
    elif x == 10451:
        return 'Sports & Recreation'

In [54]:
dataset['main_topic'] = dataset['main_topic_id'].apply(main_topic)

In [55]:
dataset.head()

Unnamed: 0,year,topic_id,event_count,number_of_join,main_topic_id,member_number,topic_name,main_topic
0,2010,183,0.0,23.0,10454,596.0,Spanish Language,Language & Ethnic Identity
1,2011,183,0.0,24.0,10454,596.0,Spanish Language,Language & Ethnic Identity
2,2012,183,0.0,34.0,10454,1852.0,Spanish Language,Language & Ethnic Identity
3,2013,183,0.0,40.0,10454,2310.0,Spanish Language,Language & Ethnic Identity
4,2014,183,0.0,49.0,10454,2760.0,Spanish Language,Language & Ethnic Identity


In [56]:
dataset[dataset.topic_name=='Social Networking']

Unnamed: 0,year,topic_id,event_count,number_of_join,main_topic_id,member_number,topic_name,main_topic
54,2010,4422,0.0,487.0,10581,14164.0,Social Networking,Socializing
55,2011,4422,0.0,612.0,10581,19343.0,Social Networking,Socializing
56,2012,4422,112.0,721.0,10581,25805.0,Social Networking,Socializing
57,2013,4422,112.0,783.0,10581,30240.0,Social Networking,Socializing
58,2014,4422,126.0,936.0,10581,40181.0,Social Networking,Socializing
59,2015,4422,151.0,1144.0,10581,73835.0,Social Networking,Socializing
60,2016,4422,361.0,1376.0,10581,113586.0,Social Networking,Socializing
61,2017,4422,702.0,1848.0,10581,152158.0,Social Networking,Socializing


# Ploting

In [57]:
years = list(dataset.year.unique())
# make list of main_topics
main_topics = []
for main_topic in dataset['main_topic']:
    if main_topic not in main_topics:
        main_topics.append(main_topic)
# make figure
figure = {
    'data': [],
    'layout': {},
    'frames': []
}

# fill in most of layout
figure['layout']['xaxis'] = {'range': [0, 160000.0], 'title': 'Number of Members'}
figure['layout']['yaxis'] = {'range':[0,1000],'title': 'Number of Events'}
figure['layout']['hovermode'] = 'closest'
figure['layout']['sliders'] = {
    'args': [
        'transition', {
            'duration': 400,
            'easing': 'cubic-in-out'
        }
    ],
    'initialValue': '1952',
    'plotlycommand': 'animate',
    'values': years,
    'visible': True
}
figure['layout']['updatemenus'] = [
    {
        'buttons': [
            {
                'args': [None, {'frame': {'duration': 500, 'redraw': False},
                         'fromcurrent': True, 'transition': {'duration': 300, 'easing': 'quadratic-in-out'}}],
                'label': 'Play',
                'method': 'animate'
            },
            {
                'args': [[None], {'frame': {'duration': 0, 'redraw': False}, 'mode': 'immediate',
                'transition': {'duration': 0}}],
                'label': 'Pause',
                'method': 'animate'
            }
        ],
        'direction': 'left',
        'pad': {'r': 10, 't': 87},
        'showactive': False,
        'type': 'buttons',
        'x': 0.1,
        'xanchor': 'right',
        'y': 0,
        'yanchor': 'top'
    }
]

sliders_dict = {
    'active': 0,
    'yanchor': 'top',
    'xanchor': 'left',
    'currentvalue': {
        'font': {'size': 20},
        'prefix': 'Year:',
        'visible': True,
        'xanchor': 'right'
    },
    'transition': {'duration': 300, 'easing': 'cubic-in-out'},
    'pad': {'b': 10, 't': 50},
    'len': 0.9,
    'x': 0.1,
    'y': 0,
    'steps': []
}

# make data
year = 2010
for main_topic in main_topics:
    dataset_by_year = dataset[dataset['year'] == year]
    dataset_by_year_and_cont = dataset_by_year[dataset_by_year['main_topic'] == main_topic]

    data_dict = {
        'x': list(dataset_by_year_and_cont['member_number']),
        'y': list(dataset_by_year_and_cont['event_count']),
        'mode': 'markers',
        'text': list(dataset_by_year_and_cont['topic_name']),
        'marker': {
            'sizemode': 'area',
            'sizeref': 5,
            'size': list(dataset_by_year_and_cont['number_of_join'])
        },
        'name': main_topic
    }
    figure['data'].append(data_dict)
    
# make frames
for year in years:
    frame = {'data': [], 'name': str(year)}
    for main_topic in main_topics:
        dataset_by_year = dataset[dataset['year'] == int(year)]
        dataset_by_year_and_cont = dataset_by_year[dataset_by_year['main_topic'] == main_topic]

        data_dict = {
            'x': list(dataset_by_year_and_cont['member_number']),
            'y': list(dataset_by_year_and_cont['event_count']),
            'mode': 'markers',
            'text': list(dataset_by_year_and_cont['topic_name']),
            'marker': {
                'sizemode': 'area',
                'sizeref': 5,
                'size': list(dataset_by_year_and_cont['number_of_join'])
            },
            'name': main_topic
        }
        frame['data'].append(data_dict)

    figure['frames'].append(frame)
    slider_step = {'args': [
        [year],
        {'frame': {'duration': 300, 'redraw': False},
         'mode': 'immediate',
       'transition': {'duration': 300}}
     ],
     'label': year,
     'method': 'animate'}
    sliders_dict['steps'].append(slider_step)

    
figure['layout']['sliders'] = [sliders_dict]

iplot(figure)