In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import powerlaw
import csv
import os
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from matplotlib.dates import date2num
import networkx as nx


In [2]:
data_path = r'/Users/Owner/Documents/Work_transfer/Data/GCconnex/'
keys_path = r'/Users/Owner/Documents/Work_transfer/Data/GCconnex/Profile Statistics/'

In [3]:
BP = pd.read_csv(data_path+'BP2020 data.csv')

In [4]:
#Making a 'groups' dataset isolates some cool info we can pull from them
BPgroups = BP[BP.department == 'group']

In [5]:
BPgroups.describe()

Unnamed: 0,Id,Weighted Degree,Weighted In-Degree,Weighted Out-Degree,In-Degree,Out-Degree,Degree,Eccentricity,Closeness Centrality,Betweenness Centrality,Eigenvector Centrality,Clustering Coefficient
count,3208.0,3208.0,3208.0,3208.0,3208.0,3208.0,3208.0,3208.0,3208.0,3208.0,3208.0,3208
mean,5109649.106297,35.829177,34.828242,1.000935,34.828242,1.000935,35.829177,26.769327,9.150913,239650.665809,0.006763,0
std,3708994.522046,152.300325,152.300431,0.030571,152.300431,0.030571,152.300325,7.445872,2.496518,1163880.241701,0.02944,0
min,122.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0
25%,2039915.25,5.0,4.0,1.0,4.0,1.0,5.0,27.0,8.914204,2109.601109,0.00077,0
50%,4785059.0,11.0,10.0,1.0,10.0,1.0,11.0,29.0,9.532337,29382.069259,0.001939,0
75%,7284828.5,25.0,24.0,1.0,24.0,1.0,25.0,29.0,10.262739,145394.263235,0.004702,0
max,13163685.0,5177.0,5176.0,2.0,5176.0,2.0,5177.0,35.0,16.279275,37181843.706094,1.0,0


In [6]:
#Next lines of Code are converting all emails into actual departments.
#It still doesn't mitigate the issue of the canada.ca domain, but it aggregates all the emails into their proper departments
BP['department'] = BP['department'].str.lower()

In [7]:
dept = BP.department
dept_sort = set(dept)
dept_dict = {}

In [8]:
with open(os.path.join(keys_path, "csv_keys.csv"), "r") as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)
    
    for row in reader:
        email, acronym = row
        dept_dict[email] = acronym

In [9]:
dept_dict['cadets.gc.ca'] = 'CADETS'
dept_dict['canada.gc.ca'] = 'CANADA'
dept_dict['canada.ca'] = 'CANADA'
dept_dict['tribunal.gc.ca'] = 'TRIBUNAL'
dept_dict['cannor.gc.ca'] = 'CED/DEC'
dept_dict['ci-oic.gc.ca'] = 'CI/OIC'
dept_dict['ccgs-ngcc.gc.ca'] = 'CCGS/NGCC'
dept_dict['god.ccgs-ngcc.gc.ca'] = 'CCGS/NGCC'
dept_dict['clo-ocol.gc.ca'] = 'OCOL/CLO'
dept_dict['csps.gc.ca'] = 'CSPS/EFPC'
dept_dict['interenational.gc.ca'] = 'DFAITD/MAECD'
dept_dict['cnb-ncw.gc.ca'] = 'CNB/NCW'
dept_dict['ncw-cnb.gc.ca'] = 'CNB/NCW'
dept_dict['nfb.gc.ca'] = 'NFB/ONF'
dept_dict['nrccan-rncan.gc.ca'] = 'NRCAN/RNCAN'
dept_dict['nserc-crsng.gc.ca'] = 'NSERC/CRSNG'
dept_dict['pbc-clcc.gc.ca'] = 'PBC/CLCC'
dept_dict['pco.bcp.gc.ca'] = 'PCO/BCP'
dept_dict['pipsc.ca'] = 'PIPSC/IPFPC'
dept_dict['ps.sp.gc.ca'] = 'PS/SP'
dept_dict['servicecanada.gc.ca.gc.ca'] = 'HRSDC/RHDSC'
dept_dict['fintrac-canafe.gc.ca'] = 'FINTRAC'



In [10]:
dep = pd.DataFrame.from_dict(dept_dict, orient='index')

In [11]:
BP = BP.replace({'department': dept_dict})

In [12]:
department = BP[BP.department != 'group']
depcount = department['department'].value_counts()
print (len(depcount))


96


In [13]:
#There are 96 different departments registered to BP2020
depcount.to_csv(data_path+'BP2020 Department Count.csv')

In [14]:
department.to_csv(data_path+'BP2020 Users.csv')

In [15]:
#Pulling Back the groups dataset, we can plot all the sorts of distributions of the groups to get some visualizations
#On who BP2020 is connecting to
BPgroups = BPgroups.reset_index()
BPgroups.drop('index', 1, inplace=True)

In [16]:
BPgroupsdegdist = BPgroups['In-Degree']
BPgroupsdegdist = BPgroupsdegdist.value_counts()

In [17]:
BPgroupsdegdist
print (sum(BPgroupsdegdist))
print (len(BPgroups))
if sum(BPgroupsdegdist) == len(BPgroups):
    print ("We're good to go!")
else:
    print ("You did something wrong here")

3208
3208
We're good to go!


In [18]:
#Network simulation of Barabasi Albert Graph
G = nx.barabasi_albert_graph(5500, 3)
G_histo = nx.degree_histogram(G)

In [19]:
#Run this, use the built-in editor to get the format I'm happy with, then export it
plt.scatter(np.arange(len(BPgroupsdegdist)), BPgroupsdegdist, color='r')
plt.scatter(np.arange(len(G_histo)), G_histo)
plt.show()



In [20]:
#Now to test whether or not the function is factually a power-law distribution, or just an exponential
#And now testing for lognormal distributions
groupsfit = powerlaw.Fit(BPgroupsdegdist, discrete=True, xmin=1)
print ("x-fit:")
print (groupsfit.xmin)
print ("Alpha (Distribution Coefficient:)")
print (groupsfit.power_law.alpha)
print ("Fit comparison")
print (groupsfit.distribution_compare('power_law', 'exponential', normalized_ratio=True))
print (groupsfit.distribution_compare('power_law', 'lognormal', normalized_ratio=True))

x-fit:
1.0
Alpha (Distribution Coefficient:)
1.55466228166
Fit comparison
(7.5190536767298175, 5.5174131624906334e-14)
(-2.5984096325114234, 0.0093656693006935795)


In [21]:
#Let's play with this lognormal distribution...
numbers = np.arange(1, len(BPgroupsdegdist) + 1)
lognorm = pd.DataFrame(data = BPgroupsdegdist)

In [22]:
#This is just to illustrate the fitting of the power-law distribution. It fits nicely, but apparently now as nice as lognormal
fig1 = groupsfit.plot_pdf(color='b', linewidth=2)
groupsfit.power_law.plot_pdf(color='b', linestyle='--', ax=fig1)
plt.show()

In [23]:
BPEC = BPgroups.sort('Eigenvector Centrality', ascending=False)

In [24]:
#Identifying the departments that join the most groups proportional to their size
#Group joining is being used as a proxy to involvement in this dataset.
depgrouping = department.groupby('department').mean()
depgrouping.drop('Id', 1, inplace=True)
depgrouping.sort('Degree', inplace=True, ascending=False)


In [25]:
#Identifying the most contributive departments in total
depgroupingsum = department.groupby('department').sum()
depgroupingsum.drop('Id', 1, inplace=True)
depgroupingsum.sort('Degree', inplace=True, ascending=False)



In [26]:
depdegdist = department['Out-Degree']
depdegdist = depdegdist.value_counts()

In [27]:
#Running the powerlaw distribution fit, except this time we see that the fit is not as neat as power-law fit for groups
#In fact, this distribution does not seem to lead to a power-law distribution at all
#More research is going to need to be done in order to gather significant meaning from this
degfit = powerlaw.Fit(depdegdist, discrete=True)
print ("x-fit:")
print (degfit.xmin)
print ("Alpha (Distribution Coefficient:)")
print (degfit.power_law.alpha)
print ("Fit comparison")
print (degfit.distribution_compare('power_law', 'exponential', normalized_ratio=True))
print (degfit.distribution_compare('power_law', 'lognormal', normalized_ratio=True))

x-fit:
3.0
Alpha (Distribution Coefficient:)
1.43082424306
Fit comparison
(2.9514588925473082, 0.0031627664217491426)
(-1.5270101716442854, 0.12675848207731819)


Calculating best minimal value for power law fit


In [28]:
plt.scatter(np.arange(len(depdegdist)), depdegdist)
plt.show()

In [29]:
#Who are the most involved in the network that are joined to Blueprint 2020
#Those who are in many groups have a lot of reach, as groups are the main forum for serious collaborative discussion
#The only issue is because this is a directed graph, tracing the number of shortest paths is not possible
involved = department.sort('Out-Degree', ascending=False)
involved.to_csv(data_path+'Involvedmembers.csv')

In [30]:
#The original graph was run as a directed graph, to differentiate between creating groups and joining groups
#This however does not work for a correlation analysis of out-degree and betweenness centrality, so a new graph must be imported
bc = pd.read_csv(data_path+'BP2020 BC.csv')
bc = bc[bc.department != 'group']

In [31]:
x = bc['Weighted Out-Degree']
y = bc['Betweenness Centrality']

In [32]:
#When we plot Betweenness Centrality and Out-Degree we get a confirmation that the best situated members are those who
#Have joined and been involved with many groups
plt.scatter(y,x)
plt.show()

In [33]:
#And because visually confirming relationships is a serious no-no, we have a regression that also says the same thing
#Intuitively, visually and empirically we can say that the best people to target are the ones who 
model = smf.ols(formula = 'y ~ x', data = bc)
results = model.fit()
print (results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.713
Model:                            OLS   Adj. R-squared:                  0.713
Method:                 Least Squares   F-statistic:                 1.288e+04
Date:                Mon, 02 Nov 2015   Prob (F-statistic):               0.00
Time:                        14:37:34   Log-Likelihood:                -69977.
No. Observations:                5176   AIC:                         1.400e+05
Df Residuals:                    5174   BIC:                         1.400e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      -7e+04   3326.411    -21.044      0.0

In [34]:
#With this, we can say that the people who join the most groups are very likely to be the most central ones in the network
#In the context of this analysis, this means we can say that people who join the most groups likely have the best reach
#Obviously this makes perfect sense 

In [35]:
#Comparing a distribution of groups joined with the rest of the network
comp = pd.read_csv('/Users/Owner/Documents/Work_transfer/Data/Report Card/Anonindstats.csv')
gjcomp = comp['Groups Joined'].value_counts()
gjcomp = gjcomp[1:]
plt.plot(gjcomp)
plt.show()

In [36]:
gjresult = powerlaw.Fit(gjcomp, discrete=True)
print ("x-fit:")
print (gjresult.xmin)
print ("Alpha (Distribution Coefficient:)")
print (gjresult.power_law.alpha)
print ("Fit comparison")
print (gjresult.distribution_compare('power_law', 'exponential', normalized_ratio=True))
print (gjresult.distribution_compare('power_law', 'lognormal', normalized_ratio=True))

x-fit:
4.0
Alpha (Distribution Coefficient:)
1.36853843016
Fit comparison
(4.847905698428022, 1.2477168315602152e-06)
(-0.91205953043798649, 0.36173738063069161)


Calculating best minimal value for power law fit


In [37]:
#Pulling data of all the comments made in the network, filtering out only for people in the Blueprint Group
commentcsv = pd.read_excel('/Users/Owner/Documents/Work_transfer/Data/Report Card/comments.xlsx')
comments = commentcsv[['owner_guid', 'string']]
comments.columns = ['Id', 'String']
commentcount = comments.groupby('Id').count()
commentcount.reset_index(inplace=True)
commentcount = commentcount.convert_objects(convert_numeric = True)

In [38]:

depdegree = department[['Id', 'In-Degree', 'Out-Degree']]
idbt = pd.merge(depdegree, commentcount, how='outer', on = 'Id')
print (len(idbt))

9976


In [39]:
#The logic behind this is that idbt is a table that consists of of the amount of comments every user in BP2020 hsa made
#and the amount of groups that each member joined, for the purposes of a regression analysis
idbt = idbt.dropna(subset=['Out-Degree'], how='all') #If the group member did had an out-degree of NaN it's because that 
#individual was not in the original BP2020 group member dataset, so they were not part of the group. So we drop them.
idbt = idbt.fillna(0)
idbt.columns = ['User GUID', 'In-Degree', 'Out-Degree', 'Comments']
print (len(idbt))
idbt.to_csv(data_path+'Commenters.csv')

5176


PermissionError: [Errno 13] Permission denied: '/Users/Owner/Documents/Work_transfer/Data/GCconnex/Commenters.csv'

In [None]:
commenters = idbt[idbt['Comments'] > 0]
print ("Of the 5176 members of Blueprint 2020, only", len(commenters), "Members have at least one comment. That is only",(len(commenters)/len(idbt))*100,"% of the group")
idbt['Comments'].sum()

In [None]:
x = idbt['Out-Degree']
y= idbt['Comments']
z = idbt['In-Degree']
model = smf.ols(formula = 'y ~ x + z', data = idbt)
results = model.fit()
print (results.summary())
#The amount of comments a person makes is correlated with group involvement, but not incredibly so
#The R-Squared value shows that this is not the best predictor there ever was

In [None]:
#But if we remove the 73.8% of people who don't comment
idbt1 = idbt[idbt['Comments'] > 0]
x = idbt1['Out-Degree']
y= idbt1['Comments']
z = idbt1['In-Degree']
model = smf.ols(formula = 'y ~ x + z', data = idbt1)
results = model.fit()
print (results.summary())
#If we only factor in those who HAVE commented, then we see that creating groups
# Group creating and the intercept are not statistically significant.

In [None]:
#This means that the biggest, most central players within BP2020 and the GCconnex itself are the ones with
#the most groups joined. It gives them the most exposure throughout the network, and generally can mean that they are
#involved

#But with this new-found knowledge, we can start saying for sure that the most important people throughout the network
#Are the ones who have many different groups joined.



In [None]:
plt.scatter(x, y)
plt.plot(x, np.poly1d(np.polyfit(x,y,1))(x), color='r')
plt.plot(y, np.poly1d(np.polyfit(y,x,1))(y), color='g')
plt.show()

In [40]:
idbt.sort('Comments', inplace=True, ascending=False)

In [41]:
#Let's not stop the party here though... We can go deeper..

groups = pd.read_csv(data_path+'Group Members GCconnex.csv')
colleagues = pd.read_csv(data_path+'Colleagues GCconnex.csv')
BP2020_members = groups[(groups['Group GUID'] == 272967)]
datetime = pd.date_range('2013-05-15', periods = 900, freq='D')
datetime

DatetimeIndex(['2013-05-15', '2013-05-16', '2013-05-17', '2013-05-18',
               '2013-05-19', '2013-05-20', '2013-05-21', '2013-05-22',
               '2013-05-23', '2013-05-24', 
               ...
               '2015-10-22', '2015-10-23', '2015-10-24', '2015-10-25',
               '2015-10-26', '2015-10-27', '2015-10-28', '2015-10-29',
               '2015-10-30', '2015-10-31'],
              dtype='datetime64[ns]', length=900, freq='D', tz=None)

In [42]:
memberlist = BP2020_members['User GUID'].tolist()
bpcolleaguesnw = colleagues[(colleagues['UID1'].isin(memberlist)) & (colleagues['UID2'].isin(memberlist))]

In [43]:
#Within the web of the 5300+ employees, there are 19574  colleague
print(len(bpcolleaguesnw))
bpcolleagues1 = colleagues[(colleagues['UID1'].isin(memberlist))]
bpcolleagues2 = colleagues[(colleagues['UID2'].isin(memberlist))]
collist = [bpcolleagues1, bpcolleagues2]
bpcolleagues = pd.concat(collist)
bpcolleagues.drop_duplicates(inplace = True)
print (len(bpcolleagues))

19574
83111


In [44]:
#We have to do a little more massaging with the data
#The issue with the colleagues dataset is that it tends to double count friend lists (because 1 -> 2, then to accept 2 -> 1)
#These next few cells runs through the datasets, and deletes the double counting in the dataset

#It's all blocked atm because the code is woefully inefficient and it takes forever to run!
#The results are printed in the code file

#swag = bpcolleaguesnw[['UID1', 'UID2']].values.tolist()
#print (swag[0:5])
#a = [[0, 1], [1, 0], [1, 2], [2, 1], [2, 3], [3, 2], [3, 4], [4, 3], [5, 6], [6, 7], [7, 8], [8, 7]]
#print (a)
#for x in a:
#    y = [x[1], x[0]]
#    if y in a:
#        a.remove(x)
#print (a)

#for x in swag:
#    y = [x[1], x[0]]
#    if y in swag:
#        swag.remove(x)

In [45]:
#print (len(swag))
#for x in swag:
#    y = [x[1], x[0]]
#    if y in swag:
#        swag.remove(x)
#print (len(swag))
#for x in swag:
#    y = [x[1], x[0]]
#    if y in swag:
#        swag.remove(x)

In [46]:
#This means that there are 13141 connections between blueprint 2020 members

In [47]:
#bpcolleaguesfixed = pd.DataFrame(swag)
#bpcolleaguesfixed
#bpcolleaguesfixed.columns = ['UID1', 'UID2']


In [48]:
#We have to do a little more massaging with the data
#swagger = colleagues[['UID1', 'UID2']].values.tolist()
#print (swag[0:5])


#for x in swagger:
#    y = [x[1], x[0]]
#    if y in swagger:
#        swagger.remove(x)

In [49]:
#swaggy = bpcolleagues[['UID1', 'UID2']].values.tolist()

#for x in swaggy:
#    y = [x[1], x[0]]
#    if y in swaggy:
#        swaggy.remove(x)


In [50]:
#print (len(swagger))
#print (len(swag)/len(swagger))
#print (len(swaggy)/len(swagger))

#Number of Colleague Matches is 127673 (instead of 187000)
#Percentage of BP2020 colleagues between bp2020 members only is 10.29%
#Percentage of colleague links with at least one BP2020 member is 45.60%

In [51]:
test = len(bpcolleagues[bpcolleagues.duplicated() == True])
print (test)
print (len(bpcolleagues))
print (len(colleagues))
print (len(bpcolleagues)/len(colleagues))
#If my coding is right, then BP2020 members make up 44 percent of all colleague requests
#I'll check this later

0
83111
187781
0.442595363748196


In [52]:
#Looking at growth of members over time
corruptentries = BP2020_members[BP2020_members['Date'] == '1969-12-31']
print (len(corruptentries))
if len(corruptentries) == 0:
    print ("Awww yis, clean data")
else:
    print ("Of course, we can't escape corruption :(")

0
Awww yis, clean data


In [53]:
BP2020_members['Date'] = pd.to_datetime(BP2020_members['Date'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [54]:
regdate = BP2020_members.groupby(BP2020_members.Date).count()
regdate = regdate.reindex(datetime, fill_value=0)


In [55]:
regdate['Member'] = regdate['Member'].cumsum()
regdate.drop('Group GUID', 1, inplace=True)

In [56]:
columns = ['Registrations', 'Cumulative Sum']
regdate.columns = columns


In [94]:
#Plotting the size of blueprint 2020 over time
line = plt.plot_date(datetime, regdate['Cumulative Sum'], linewidth=1, antialiased=False, linestyle='-')
plt.xlabel('Date')
plt.ylabel('Group Members')
plt.title('Blueprint 2020 Group Growth')
sns.plotting_context(context="notebook")
sns.set_style("darkgrid")
plt.show()


In [149]:
#Doing the same thing with colleagues would be inaccurate because of untimely database corruptions
#Plus I don't believe it would be that helpful in any case

In [74]:
colldate = bpcolleagues[bpcolleagues['Date'] != '1969-12-31']
colldate['Date'] = pd.to_datetime(bpcolleagues['Date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [75]:
colldate = colldate.groupby('Date').count()
colldate = colldate.reindex(datetime, fill_value=0)

In [76]:
colldate['Friend'] = colldate['Friend'].cumsum()

In [77]:
colldate.drop('UID2', 1, inplace=True)

In [90]:
Group_Size = plt.plot_date(datetime, regdate['Cumulative Sum'], antialiased=True)
Colleague_Requests = plt.plot_date(datetime, colldate['Friend'], antialiased=True)
plt.xlabel('Date')
plt.ylabel('Number of Actions')
plt.legend()
plt.show()



In [69]:
groupslist = groups.groupby('Group GUID').count()
print ("There are", len(groupslist),"groups in GCconnex")
groupslist.sort('Member', ascending=False)
bp2020grouplist = groups[groups['User GUID'].isin(memberlist)]

There are 5864 groups in GCconnex


In [70]:
bp2020grouplist = bp2020grouplist.groupby('Group GUID').count()


In [71]:
bp2020grouplist.sort('Member', ascending=False)
bp2020grouplist.columns = ['1','2','3']

In [72]:
bp2020grouplist = bp2020grouplist.join(groupslist, how='left', on=None)

In [73]:
bp2020grouplist = bp2020grouplist[['1', 'User GUID']]
bp2020grouplist.columns = ['BPMembers', 'Group Members']


In [74]:
#Yeah this doesn't give very good data.
bp2020grouplist = bp2020grouplist.sort('BPMembers', ascending=False)
bp2020grouplist['Percentage'] = bp2020grouplist['BPMembers']/bp2020grouplist['Group Members']*100
bp2020grouplist.sort('Percentage', ascending=False)
bplist = bp2020grouplist[bp2020grouplist['BPMembers'] > 50]
bplist = bplist.sort('Percentage', ascending=False)


In [77]:
#In the report, I make a reference to the percentage of users who have not made a comment in BP2020 and compare it
#to the network as a whole. The issue though, is that there is a sampling bias. If someone registered, and did not joing a group
#then they are much less likely to make a comment anywhere. To control for this, we are going to do a conditional
#probability analysis, where I am going to obtain the probability that a user makes a comment given that they have joined a group
# This is noted as P(Comment | Joined a group) which is equal to P(commented and joined a group)/P(joined a group)
#This is effectively just controlling for the fact in BP2020, people have joined a group,whereas in the network, that's not
#always the case

#The issue here is that for some weird reason the data is inconsistent.

In [125]:
#Making a table to start calculating probabilities
groups_joined = comp[['User GUID','Groups Joined']]
comments_ = commentcount 
comments_.columns = ['User GUID', 'Comments'] 
probdf = pd.merge(groups_joined, comments_, on='User GUID', how='outer')
probdf = probdf.dropna(axis=0, how='any', subset=['User GUID'])
probdf = probdf.fillna(0) #Those who didn't have a comment were not in the table of comments, so fillna makes them zero
print (len(probdf))

72815


In [126]:
#Probability they have joined a group
pg = len(probdf[probdf['Groups Joined'] >= 1])/len(probdf)
#probability that they have made a comment
pc = len(probdf[probdf['Comments'] >= 1])/len(probdf)
#Probability that they've done both
pgc = len(probdf[(probdf['Comments'] >= 1) & (probdf['Groups Joined'] >= 1)])/len(probdf)

pcgiveng = pgc/pg
print ("The probability that a random user has joined a group is",pg)
print ("The probability that a random user has made a comment is",pc)
print ("The probability that a random user has done both is",pgc)
print ("The probability that a user comments given that they've joined a group is",pcgiveng)

The probability that a random user has joined a group is 0.6327542401977615
The probability that a random user has made a comment is 0.08439195220764953
The probability that a random user has done both is 0.08025818856004945
The probability that a user comments given that they've joined a group is 0.1268394322177367


In [139]:
print (len(probdf[probdf['Groups Joined'] == 0]))
print (len(probdf[probdf['Comments'] == 0]))
print (len(probdf))

26741
66670
72815
