### Creating a Single Value for GCconnex usage

In [None]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Apply the ggplot look to every figure drawn in this notebook.
plt.style.use('ggplot')


In [None]:
# Location of the three GCconnex CSV exports (colleagues, groups, replies).
# Set the GCCONNEX_DATA_DIR environment variable to point somewhere else;
# when it is not set, the original local directory is used.
data_path = os.path.join(
    os.environ.get(
        "GCCONNEX_DATA_DIR",
        "/Users/Owner/Documents/Work_transfer/CLSep/Valuesys/Data/",
    ),
    "",  # guarantees a trailing separator, because file names are appended with `+`
)

# Column names for the header-less CSV files, one list per dataset.
collcols = ['GUID1', 'GUID2', 'Date']
groupscols = ['GUID1', 'Group', 'Date']
replcols = ['GUID1', 'Topic', 'Date']


In [None]:
# Load the three header-less CSV exports, naming their columns on the way in.
colleagues = pd.read_csv(data_path + 'Colleagues.csv', names=collcols, header=None)
groups = pd.read_csv(data_path + 'Groups.csv', names=groupscols, header=None)
replies = pd.read_csv(data_path + 'Replies.csv', names=replcols, header=None)


def _count_per_user(events, counted_col, label):
    """Count the non-null `counted_col` entries for each GUID1 in `events`.

    Returns a two-column DataFrame: 'GUID' (the former GUID1) and `label`
    (the count), with one row per distinct GUID1.
    """
    per_user = events.groupby('GUID1')[counted_col].count().reset_index()
    per_user.columns = ['GUID', label]
    return per_user


# Counting how much of what each user has done
colcount = _count_per_user(colleagues, 'GUID2', 'Colleagues')
grpcount = _count_per_user(groups, 'Group', 'Groups')
repcount = _count_per_user(replies, 'Topic', 'Comments')

# Summary of the raw replies table is the cell's displayed output.
replies.describe()

In [None]:

# Creating the activity matrix: one row per GUID with its Colleagues, Groups
# and Comments counts. Outer joins keep users that appear in only some of the
# count tables; their missing counts are filled with 0.
activity = (
    colcount
    .merge(grpcount, how="outer", on='GUID')
    .merge(repcount, how="outer", on='GUID')
    .fillna(0)
)
activity.describe()

In [None]:
# Spot-check the activity row of a single user.
spot_check_guid = 10242025
activity.loc[activity['GUID'] == spot_check_guid]



In [None]:
def _log_of_positive(counts):
    """Return the natural log of the strictly positive entries of `counts`.

    Zero entries are dropped first, and the surviving values are re-indexed
    from 0 before the log is taken.
    """
    positive = counts[counts > 0].reset_index(drop=True)
    return np.log(positive)


logcoll = _log_of_positive(activity['Colleagues'])
loggrp = _log_of_positive(activity['Groups'])
logcom = _log_of_positive(activity['Comments'])


In [None]:
# Inspect the log-transformed comment counts.
logcom

In [None]:
# Overlay the log-scale distributions of the three activity types on one set
# of axes so their shapes can be compared directly.
activity_logs = [
    (logcoll, 'Colleagues'),
    (loggrp, 'Groups'),
    (logcom, 'Comments'),
]

# linewidth is passed as a number, which is the type matplotlib documents for it.
# x, y and z hold the values returned by plt.hist for each series, in the
# order listed above.
x, y, z = [
    plt.hist(values, histtype='step', label=label, linewidth=2)
    for values, label in activity_logs
]

plt.legend()
plt.title('Distribution of Types of Activity on GCconnex')
plt.xlabel('Log(Value)')
plt.ylabel('Frequency')
plt.show()


In [None]:
#A really interesting thing about this plot is the intersection between colleagues and groups.
#The fact that comments is the smallest of the three is not shocking at all


In [None]:
# Summary statistics of the log-transformed group counts.
loggrp.describe()

In [None]:
# The three log series, kept as a list for convenience.
logs = [logcoll, loggrp, logcom]

# Build the per-user table of log counts straight from `activity`, so that the
# three values in a row always belong to the same user.
# The series in `logs` were each filtered and re-indexed from 0 independently;
# stacking them side by side would pair values by position, mixing different
# users in one row.
count_cols = ['Colleagues', 'Groups', 'Comments']
did_everything = (activity[count_cols] > 0).all(axis=1)  # at least one of each activity
logact = np.log(activity.loc[did_everything, count_cols])

In [None]:
# Keep only the rows of logact that have a value in every column.
logact = logact.dropna()

In [None]:
logact.describe() # The row count shown here is the number of rows left in logact after dropna().
# NOTE(review): an earlier run recorded 6890 here, described as the people who have done at least one of everything on the network -- confirm after re-running.

## The formula for the score is as follows:
$$\text{Where } c_i = \ln(colleagues_i),\ r_i = \ln(comments_i),\ g_i = \ln(groups_i),$$

$$sumscore_i = c_i + r_i + g_i$$

$$pdtsumscore_i = c_i + r_i + g_i + c_i*r_i + c_i*g_i + r_i*g_i$$



In [None]:
# Shorthand for the three log-count columns used in both score formulas.
log_c = logact['Colleagues']
log_r = logact['Comments']
log_g = logact['Groups']

# Additive score: the plain sum of the three log counts.
logact['Sum Score'] = log_c + log_r + log_g
# Product-sum score: the additive score plus every pairwise product.
logact['Pdt Sum Score'] = log_c + log_r + log_g + log_c*log_r + log_c*log_g + log_g*log_r

In [None]:
# Summary statistics of logact, now including the two score columns.
logact.describe()

In [None]:
# Pull each score out as a standalone Series, re-indexed from 0.
sumscore, psumscore = (
    logact[score_col].reset_index(drop=True)
    for score_col in ('Sum Score', 'Pdt Sum Score')
)

In [None]:
# Compare the distributions of the two composite scores on one histogram.
# linewidth is passed as a number, which is the type matplotlib documents for it.
ss = plt.hist(sumscore, linewidth=2, label='Sum Score', histtype='step')
pss = plt.hist(psumscore, linewidth=2, label='Pdt Sum Score', histtype='step')
plt.legend()
plt.xlabel('Values')
plt.ylabel('Frequency')
# Descriptive title so the figure can be read on its own.
plt.title('Distribution of Sum Score and Pdt Sum Score')
plt.show()

In [None]:
# Take the log of each score once more. Only strictly positive scores are
# kept, because the log of zero or a negative number is not a finite value.
lsumscore = np.log(sumscore[sumscore > 0]).reset_index(drop=True)
lpsumscore = np.log(psumscore[psumscore > 0]).reset_index(drop=True)

# linewidth is passed as a number, which is the type matplotlib documents for it.
lss = plt.hist(lsumscore, linewidth=2, label='Ln(Sum Score)', histtype='step')
lpss = plt.hist(lpsumscore, linewidth=2, label='Ln(Pdt Sum Score)', histtype='step')
plt.legend()
plt.xlabel('Values')
plt.ylabel('Frequency')
# Descriptive title so the figure can be read on its own.
plt.title('Distribution of Ln(Sum Score) and Ln(Pdt Sum Score)')
plt.show()

# Initial Discussion on Above

## Here we showed that if you take the logarithm of a power-law distribution enough times, it becomes similar to a normal distribution. Normal distributions are nice because they are mathematically convenient, but are they reflective of what's going on in the network?

### In this method, diminishing marginal returns are factored into the equation (which is a fair assumption to make), but each of the activities is equally weighted (1 colleague = 1 comment), which is not indicative of the effort each activity takes. This method is also very restrictive: we started with nearly 90000 users, and the final ranking only covers 9000 users. This means two things: only 10% of our users have done at least 2 each of commenting, adding a colleague and joining a group, and even among those, a large share of users have not done more than that.

### While the normal distribution is mathematically convenient, it seems that making the assumption of a normal distribution may be a little far fetched

In [None]:
# Summary statistics of the additive score.
sumscore.describe()

### From 90000 -> 9000. The advantage is that it removes the people who don't have more than two of each action, and who would therefore get a poor score anyway.


### An alternative measurement that would include more individuals in the ranking process is a simple weighting: express each user's count of an activity as a proportion of the total count of that activity across the whole system.
## For example:

# $$comment_i^w = \frac{comment_i}{\sum_{j=1}^{J} comment_j} $$


### This would result in very small values, but that's not the issue at the moment. The issue is getting appropriate rankings

In [None]:
#Calculating Sums: network-wide total of each activity type
colsum, comsum, grpsum = (
    activity[name].sum() for name in ('Colleagues', 'Comments', 'Groups')
)

#Also log the totals as an experiment
lcolsum, lcomsum, lgrpsum = (np.log(total) for total in (colsum, comsum, grpsum))

sumslist = [colsum, comsum, grpsum]
lsumslist = [lcolsum, lcomsum, lgrpsum]

# Print the raw totals, a separator line, then the logged totals.
for total in sumslist:
    print(total)
print(" ")
for log_total in lsumslist:
    print(log_total)

# First for the raw totals and then for the logged totals: print an empty
# line, then every pairwise ratio i/j, skipping the ratios that are <= 1.
for totals in (sumslist, lsumslist):
    print("")
    for i in totals:
        for j in totals:
            ratio = i / j
            if ratio <= 1:
                continue
            print(ratio)

#From the above calculations, the author concluded that taking logarithms of the totals is not useful
       

In [None]:
# Express each user's count as a share of the network-wide total for that activity.
wcol = activity['Colleagues'].div(colsum)
wcom = activity['Comments'].div(comsum)
wgrp = activity['Groups'].div(grpsum)

# One column per activity type, one row per row of `activity`.
weighted_columns = [wcol, wcom, wgrp]
weightedactivity = pd.DataFrame(weighted_columns).transpose()

In [None]:
# For brevity, wa = weightedactivity. This is a second name for the same
# DataFrame (no copy is made), so columns added through wa also appear on
# weightedactivity.

wa = weightedactivity


In [None]:
# Shorthand for the three weight columns used in both score formulas.
w_col, w_com, w_grp = (wa[name] for name in ('Colleagues', 'Comments', 'Groups'))

# Additive score: the plain sum of the three weights.
wa['Sum Score'] = w_col + w_com + w_grp
# Product-sum score: the additive score plus every pairwise product.
wa['Pdt Sum Score'] = (
    w_col + w_com + w_grp
    + w_col*w_com
    + w_col*w_grp
    + w_grp*w_com
)

In [None]:
# Negated gap between the two scores, i.e. how much the pairwise-product
# terms add on top of the additive score.
score_gap = wa['Sum Score'].sub(wa['Pdt Sum Score'])
diff = np.negative(score_gap)

In [None]:
# Scale both scores up by a factor of 1000.
# NOTE: this updates the columns in place, so running the cell again
# multiplies them by 1000 again.
for score_col in ('Sum Score', 'Pdt Sum Score'):
    wa[score_col] = wa[score_col] * 1000

In [None]:
# Summary statistics of the weighted activity table and its two scores.
wa.describe()

In [None]:
# Distribution of the weighted additive score on a log scale.
log_sum_score = np.log(wa['Sum Score'])
plt.hist(log_sum_score, histtype = 'step')
# Title and axis labels so the figure can be read on its own.
plt.title('Distribution of Ln(Sum Score) for Weighted Activity')
plt.xlabel('Ln(Sum Score)')
plt.ylabel('Frequency')
plt.show()


#We're left with a power law distribution, but that's actually probably not a bad thing

In [None]:
# Start the log-weight table from just the three weight columns, leaving out
# any score columns that have been added to weightedactivity.
weight_cols = ['Colleagues', 'Comments', 'Groups']
lwa = weightedactivity[weight_cols]
lwa.describe()

In [None]:
# Swap exact zeros for a tiny positive stand-in value.
ZERO_WEIGHT_STANDIN = 1e-8
lwa = lwa.replace(0, ZERO_WEIGHT_STANDIN)

### Aiming for a more normal distribution follows these assumptions:
### 1. The value gained from using a social network grows more slowly than usage does (diminishing returns): for $\lambda > 1$, $$u({\lambda}x) < {\lambda}u(x)$$
### 2. There exists an "average" value that represents a majority of the population aka Bell curve in the middle


#### Point 2 has a subtle nuance. It basically means that the midpoint of utility from the social network is what the average user experiences. This is very clearly untrue, so forcing the scores into a normal distribution would make it difficult to draw inferences about reality.

In [None]:
# Put every column of lwa on a non-negative log scale: take the natural log,
# then shift the column up by the absolute value of its minimum.
for column in lwa.columns:
    # Missing values after the log are replaced with 0.
    # NOTE(review): 0 on the log scale corresponds to a weight of 1 -- confirm
    # that this is the intended fill value.
    logged = np.log(lwa[column]).fillna(0)
    # Series.min() is the vectorized equivalent of the builtin min(), which
    # would walk the column one element at a time.
    lwa[column] = logged + abs(logged.min())

In [None]:
# Inspect the shifted log-weights.
lwa

In [None]:
# Same two score formulas as before, now applied to the shifted log-weights.
lw_col = lwa['Colleagues']
lw_com = lwa['Comments']
lw_grp = lwa['Groups']
pairwise_terms = (lw_col*lw_com, lw_col*lw_grp, lw_grp*lw_com)

# Additive score: the plain sum of the three columns.
lwa['Sum Score'] = lw_col + lw_com + lw_grp

# Product-sum score: start from the additive sum, then add the pairwise
# products one after another in the order listed above.
pdt_sum = lw_col + lw_com + lw_grp
for term in pairwise_terms:
    pdt_sum = pdt_sum + term
lwa['Pdt Sum Score'] = pdt_sum

In [None]:
# Inspect lwa again, now with the 'Sum Score' and 'Pdt Sum Score' columns.
lwa

In [None]:
# Distribution of the product-sum score of the shifted log-weights, on a log scale.
# Only strictly positive scores are logged: a score of exactly 0 would turn
# into -inf, which plt.hist cannot place in a bin.
pdt_scores = lwa['Pdt Sum Score']
plt.hist(np.log(pdt_scores[pdt_scores > 0]), histtype = 'step' )
# Title and axis labels so the figure can be read on its own.
plt.title('Distribution of Ln(Pdt Sum Score) for Log-Weighted Activity')
plt.xlabel('Ln(Pdt Sum Score)')
plt.ylabel('Frequency')
plt.show()