# Generate stackoverflow tag co-occurance matrix for network construction 

In [1]:
pwd()

'/Users/alexandrarottenkolber/Documents/05_Spatial_Inequalities/The-Technology-Space-and-Digital-Development/code_alexandra'

In [2]:
import pandas as pd
import numpy as np
import itertools
from collections import Counter
from sklearn.preprocessing import OneHotEncoder

In [3]:
count = 0
for chunk in enumerate(pd.read_csv("../../Alexandra/data/stackoverflow.csv", chunksize=1000)):
    count += 1
    if count == 1: 
        df = chunk[1].copy()
    else: 
        break

In [3]:
df = pd.read_csv("../../Alexandra/data/stackoverflow.csv", low_memory=False)

In [4]:
type(df)

pandas.core.frame.DataFrame

In [5]:
df.head()

Unnamed: 0,_id,id,tags
0,61b778e8fc9434398d505a27,7524441,wpf|visual-studio-2010|enterprise-library
1,61b778e8fc9434398d505a28,2336818,c#|visual-studio|solution|vs-extensibility|vsp...
2,61b778e8fc9434398d505a29,27350559,c#|inheritance|ef-code-first|entity-framework-...
3,61b778e8fc9434398d505a2a,1752242,c#|generics|reflection
4,61b778e8fc9434398d505a2b,28181370,angularjs|select|angular-ui-router|single-page...


In [6]:
no_tag = len(df[df['tags'].apply(lambda x: isinstance(x,float))])
print("There are", no_tag , "observations that do not have tags.")
print("This equals", int(round(no_tag/len(df), 2)*100), "%.")# there are 17362901 observations that do not have tags

There are 17362901 observations that do not have tags.
This equals 72 %.


In [7]:
df = df[df['tags'].notna()]
len(df)

6604020

In [8]:
# find unique tags
splitted_list = [string.split("|") for string in list(df["tags"])]
splitted_list_flat = [str_ for sublist in splitted_list for str_ in sublist]
unique_tags = list(set(splitted_list_flat))
unique_ids = list(set(list(df["_id"])))
len(unique_tags), len(unique_ids)

(50787, 6604020)

In [9]:
# how often does each tag show up? 
tag_counts = Counter(splitted_list_flat)

In [10]:
tag_counts_df = pd.DataFrame.from_dict(tag_counts, orient='index').reset_index().rename(columns = {"index": "tag", 0: "occurence_count"})
tag_counts_df = tag_counts_df.sort_values(by = "occurence_count", ascending=False)
tag_counts_df.head()

Unnamed: 0,tag,occurence_count
101,javascript,679124
87,java,556923
3,c#,549945
102,php,473470
151,android,460460


In [11]:
len(tag_counts_df), len(set(tag_counts_df["tag"]))

(50787, 50787)

In [12]:
# select top 1000 tags
top_1000_tags_df = tag_counts_df.head(1000).copy()
top_1000_tags_ls = list(top_1000_tags_df["tag"])
top_1000_tags_df.head()

Unnamed: 0,tag,occurence_count
101,javascript,679124
87,java,556923
3,c#,549945
102,php,473470
151,android,460460


In [13]:
len(top_1000_tags_df), len(set(list(top_1000_tags_df["tag"])))

(1000, 1000)

In [14]:
splitted_list_filtered = [[x for x in sublist if x in set(top_1000_tags_ls)] for sublist in splitted_list]

In [15]:
len(splitted_list_filtered)

6604020

In [16]:
splitted_list_filtered[10:15]

[['wpf', 'mvvm'],
 ['sql-server', 'foreign-keys', 'indexing'],
 ['wpf'],
 ['wpf', 'popup', 'styles'],
 ['c#', 'wpf', 'focus']]

In [17]:
#splitted_list_filtered[10:15]

In [18]:
# Prepare data to calculate co-coocurence matrix 

# 1 dataframe from tags
# 2 one-hot-encodings
# 3 combine column headers and count 

tag_df = pd.get_dummies(pd.DataFrame(splitted_list_filtered), prefix='', prefix_sep='', sparse=True)#.groupby(level=0, axis=1).sum()
tag_df.head()

Unnamed: 0,.htaccess,.net,.net-3.5,.net-4.0,.net-core,3d,access-vba,actionscript,actionscript-3,active-directory,...,xsd,xslt,yaml,yii,yii2,youtube,youtube-api,zend-framework,zend-framework2,zip
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
tag_df.shape

(6604020, 4998)

In [20]:
cnt = Counter(list(tag_df.columns))
non_unique_labels = [k for k, v in cnt.items() if v > 1 and v != "None" and v != None]
unique_labels = [k for k, v in cnt.items() if v == 1 and v != "None" and v != None]
len(non_unique_labels), len(unique_labels), len(non_unique_labels) + len(unique_labels), 1000-len(set(list(tag_df.columns)))

(1000, 0, 1000, 0)

In [21]:
df=tag_df[unique_labels].copy()
#del(ohe_df)
df.shape

(6604020, 0)

In [24]:
#df.head()

In [27]:
tag_df.columns

Index(['.htaccess', '.net', '.net-3.5', '.net-4.0', '.net-core', '3d',
       'access-vba', 'actionscript', 'actionscript-3', 'active-directory',
       ...
       'xsd', 'xslt', 'yaml', 'yii', 'yii2', 'youtube', 'youtube-api',
       'zend-framework', 'zend-framework2', 'zip'],
      dtype='object', length=4998)

In [30]:
#tag_df['.htaccess']

In [31]:
#del splitted_list

In [43]:
#test_1 = tag_df.iloc[:, : 100].groupby(axis = 'columns', level = 0).sum()

In [None]:
for num, label in enumerate(non_unique_labels[:100]):
    if num % 10 == 0:
        print(num)
    if label != 'None' and label != None: 
        df[label] = tag_df[label].sum(axis=1)
        df = df.copy()
df.shape

0
10
20
30
40
50
60
70
80
90


(6604020, 100)

In [87]:
df0 = pd.DataFrame()
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
df4 = pd.DataFrame()
df5 = pd.DataFrame()
df6 = pd.DataFrame()
df7 = pd.DataFrame()
df8 = pd.DataFrame()
df9 = pd.DataFrame()

In [88]:
frames = [df0, df1, df2, df3, df4, df5, df6, df7, df8, df9]
intervals = [0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, len(tag_df.columns)]

In [None]:
for i in range(len(frames)): 
    print(i)
    for label in non_unique_labels[intervals[i]:intervals[i+1]]:
        if label != 'None' and label != None: 
            frames[i][label] = tag_df[label].sum(axis=1)
            frames[i] = frames[i].copy()
    frames[i].to_csv(f'../data/intermediate_frames/df{i}.csv')  
    del(frames[i])

0


In [None]:
# read in frames
frames = [df0, df1, df2, df3, df4, df5, df6, df7, df8, df9]
df = pd.DataFrame()
for num, frame in enumerate(frames): 
    intermed_df = pd.read_csv(f'../data/intermediate_frames/df{num}.csv')  
    df = pd.concat([df, intermed_df], axis=1)
    df = df.copy()

In [None]:
result = pd.concat([df1, s1], axis=1)

In [35]:
tag_df

Unnamed: 0,.htaccess,.net,.net-3.5,.net-4.0,.net-core,3d,access-vba,actionscript,actionscript-3,active-directory,...,xsd,xslt,yaml,yii,yii2,youtube,youtube-api,zend-framework,zend-framework2,zip
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6604015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6604016,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6604017,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6604018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
tag_df_asint = df.astype(int)
coocc = tag_df_asint.T.dot(tag_df_asint)
np.fill_diagonal(coocc.values, 0)
coocc.head()

In [None]:
coocc.shape

In [None]:
# cooccurence matrix to dictionary
coocc_dic = coocc.to_dict()

In [None]:
# create dataframe represnting a network
tag1_ls = []
frames = []

for tag1, tag_dic in coocc_dic.items():
    tag1_ls.append(tag1)
    frames.append(pd.DataFrame.from_dict(tag_dic, orient='index'))

network_df = pd.concat(frames, keys=tag1_ls)
network_df = network_df.reset_index().rename(columns = {"level_0" : "tag1", "level_1" : "tag2", 0: "cooccurence"})

In [None]:
network_df[network_df["cooccurence"] > 10].head(10)

In [None]:
network_df.shape

In [None]:
network_df.to_csv(path_or_buf="../data/undirected_stackoverflow_network_from_raw_data.csv", sep='\t')

In [None]:
unique_labels

In [206]:
#tag_df.to_csv(path_or_buf="../data/tag_df_dummies.csv")

In [207]:
tag_df = tag_df.groupby(level=0, axis=1).sum()

KeyboardInterrupt: 

In [None]:
# Calculate co-coocurence matrix with matrix multiplication

tag_df_asint = tag_df.astype(int)
coocc = tag_df_asint.T.dot(tag_df_asint)
np.fill_diagonal(coocc.values, 0)
coocc.head()

In [None]:
# cooccurence matrix to dictionary
coocc_dic = coocc.to_dict()

In [None]:
# create dataframe represnting a network
tag1_ls = []
frames = []

for tag1, tag_dic in coocc_dic.items():
    tag1_ls.append(tag1)
    frames.append(pd.DataFrame.from_dict(tag_dic, orient='index'))

network_df = pd.concat(frames, keys=tag1_ls)
network_df = network_df.reset_index().rename(columns = {"level_0" : "tag1", "level_1" : "tag2", 0: "cooccurence"})

In [None]:
network_df[network_df["cooccurence"] > 10].head(10)

In [None]:
#network_df.to_csv(path_or_buf="../data/undirected_stackoverflow_network_from_raw_data.csv", sep='\t')

In [29]:
#splitted_list_filtered

In [79]:
listOfDFRows = pd.DataFrame(splitted_list_filtered).to_numpy().tolist()
#listOfDFRows = studentDfObj.to_numpy().tolist()
#listOfDFRows

In [25]:
X = [['Male', 1], ['Female', 3], ['Female', 2]]

In [27]:
X

[['Male', 1], ['Female', 3], ['Female', 2]]

In [None]:
enc.transform([['Female', 1], ['Male', 4]]).toarray()

In [31]:
x = splitted_list_filtered[:10]

In [51]:
x = [np.array(x) for x in splitted_list_filtered[:10]]
#x = np.array(x, dtype=str)

In [47]:
x = x.reshape(-1, 1)

In [48]:
x

array([[array(['wpf', 'visual-studio-2010', 'enterprise-library'], dtype='<U18')],
       [array(['c#', 'visual-studio', 'solution', 'vs-extensibility', 'vspackage'],
              dtype='<U16')                                                        ],
       [array(['c#', 'inheritance', 'ef-code-first', 'entity-framework-6',
               'wcf-data-services'], dtype='<U18')                        ],
       [array(['c#', 'generics', 'reflection'], dtype='<U10')],
       [array(['angularjs', 'select', 'angular-ui-router',
               'single-page-application', 'selecteditem'], dtype='<U23')],
       [array(['wpf', 'triggers', 'treeview', 'datatrigger',
               'hierarchicaldatatemplate'], dtype='<U24')   ],
       [array(['c#', 'winforms', 'button', 'hotkeys'], dtype='<U8')],
       [array(['c#', 'css', 'asp.net-mvc-4', 'azure', 'azure-web-roles'],
              dtype='<U15')                                              ],
       [array(['c#', 'inheritance', 'wcf-data-service

In [100]:
k = pd.DataFrame(splitted_list_filtered).to_numpy().tolist()
X = [['Male', 1], ['Female', 3], ['Female', 2]]

In [124]:
#k

In [101]:
type(X)

list

In [239]:
unique_labels=list(set([str(x) for xs in splitted_list_filtered for x in xs]))
len(unique_labels)

969

In [378]:
enc = OneHotEncoder(categories="auto", sparse=True)

In [379]:
enc.fit(k)

OneHotEncoder()

In [255]:
#enc.categories_

In [256]:
ohe = enc.transform(k).toarray()
ohe.shape

(1000, 1325)

In [231]:
k[:2]

[['wpf', 'visual-studio-2010', 'enterprise-library', None, None],
 ['c#', 'visual-studio', 'solution', 'vs-extensibility', 'vspackage']]

In [232]:
enc.fit(k)

OneHotEncoder()

In [233]:
labels=[str(x) for xs in enc.categories_ for x in xs]
#labels

In [244]:
# implementation with sklearn

y = OneHotEncoder(categories = "auto", sparse=False).fit_transform(k)#.toarray()
print(y)

[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]


In [257]:
test_df = pd.DataFrame(y, columns = labels)#.drop(["None"])
#test_df = test_df.drop(["None"], axis=1)

In [258]:
test_df.shape

(1000, 1325)

In [259]:
len(set(test_df.columns)),  len(test_df.columns) # because none is in there 

(970, 1325)

In [340]:
from collections import Counter
cnt = Counter(list(test_df.columns))
non_unique_labels = [k for k, v in cnt.items() if v > 1 and v != "None" and v != None]
unique_labels = [k for k, v in cnt.items() if v == 1 and v != "None" and v != None]
len(non_unique_labels), len(unique_labels)

(259, 711)

In [341]:
#list(test_df.columns)[-1]

In [368]:
df=test_df[unique_labels].copy()
df.shape

(1000, 711)

In [369]:
df.head()

Unnamed: 0,amazon-web-services,android-emulator,angular,antlr,apache,assembly,automake,awk,axis2,boolean,...,sharepoint-feature,slick-3.0,spark-dataframe,splitter,state,struts,url-shortener,visual-studio-2012,word-automation,xceed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [370]:
df[non_unique_labels[2]] = list(test_df[non_unique_labels[2]].sum(axis=1))

In [371]:
for label in non_unique_labels:
    if label != 'None' and label != None: 
        df[label] = test_df[label].sum(axis=1)
df = df.copy()

  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = test_df[label].sum(axis=1)
  df[label] = te

In [372]:
#non_unique_labels

In [373]:
df.shape

(1000, 969)

In [374]:
tag_df_asint = df.astype(int)
coocc = tag_df_asint.T.dot(tag_df_asint)
np.fill_diagonal(coocc.values, 0)
coocc.head()

Unnamed: 0,amazon-web-services,android-emulator,angular,antlr,apache,assembly,automake,awk,axis2,boolean,...,isr,jquery-ui-dialog,karma-jasmine,lessphp,mockery,node-ipc,prototypal-inheritance,spring-boot,vspackage,xamdatagrid
amazon-web-services,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
android-emulator,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
angular,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
antlr,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
apache,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [375]:
# cooccurence matrix to dictionary
coocc_dic = coocc.to_dict()

In [376]:
# create dataframe represnting a network
tag1_ls = []
frames = []

for tag1, tag_dic in coocc_dic.items():
    tag1_ls.append(tag1)
    frames.append(pd.DataFrame.from_dict(tag_dic, orient='index'))

network_df = pd.concat(frames, keys=tag1_ls)
network_df = network_df.reset_index().rename(columns = {"level_0" : "tag1", "level_1" : "tag2", 0: "cooccurence"})

In [377]:
network_df[network_df["cooccurence"] > 10].head(10)

Unnamed: 0,tag1,tag2,cooccurence
30753,java,android,12
691651,ajax,jquery,20
691897,android,java,12
722772,hadoop,mapreduce,12
724594,html,javascript,13
724597,html,jquery,17
728466,javascript,html,13
728473,javascript,jquery,69
731339,jquery,ajax,20
731373,jquery,html,17


In [126]:
# implementation with sklearn

y = OneHotEncoder(categories=unique_tags, 
                  sparse=False, 
                  handle_unknown='error').fit_transform(k[:10]).toarray()
print(y)

ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).