<a href="https://colab.research.google.com/github/Trek3/stacksample-amd/blob/main/stacksample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!kaggle datasets files stackoverflow/stacksample
!kaggle datasets download stackoverflow/stacksample -p .kaggle/ -f Questions.csv
!cd .kaggle
!unzip Questions.csv.zip

name           size  creationDate         
-------------  ----  -------------------  
Tags.csv       62MB  2019-10-08 00:37:26  
Answers.csv     1GB  2019-10-08 00:37:26  
Questions.csv   2GB  2019-10-08 00:37:26  
Questions.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
unzip:  cannot find or open Questions.csv.zip, Questions.csv.zip.zip or Questions.csv.zip.ZIP.


In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

with open('.kaggle/Questions.csv', 'rb') as f:
  df = pd.read_csv(f, sep=',', usecols=['Id','Body'], encoding='ISO-8859-1', index_col=0).head(10)

df

  mask |= (ar1 == a)


Unnamed: 0_level_0,Body
Id,Unnamed: 1_level_1
80,<p>I've written a database generation script i...
90,<p>Are there any really good tutorials explain...
120,<p>Has anyone got experience creating <strong>...
180,<p>This is something I've pseudo-solved many t...
260,<p>I have a little game written in C#. It uses...
330,<p>I am working on a collection of classes use...
470,<p>I've been writing a few web services for a ...
580,<p>I wonder how you guys manage deployment of ...
650,<p>I would like the version property of my app...
810,<p>I'm trying to maintain a Setup Project in <...


In [22]:
import random as rd
import functools

def build_hashrow_functions(n, k):
  ixs = list(range(n))
  rd.shuffle(ixs)
  return [lambda x, i=i : (x + i) % k for i in ixs]

def shingle(k, body):
  return [body[i:i+k] for i in range(len(body))]

def get_all_shingles(k):
  return sorted(set(functools.reduce(lambda a, b : a+b, [shingle(k, df.iloc[i]['Body']) for i in range(len(df))])))

def sim(s1, s2):
  return sum([1 if x1 == x2 else 0 for (x1, x2) in zip(s1, s2)])/(len(s1))

def sim_shingles(s1, s2):
  pairs = list(zip(s1, s2))
  x = sum([(x1 & x2) for (x1, x2) in pairs])
  y = sum([(x1 ^ x2) for (x1, x2) in pairs])
  if x + y == 0:
    return 0.0
  return x / (x + y)

def build_characteristic_matrix(df, k = 6, shingles_per_column = None, all_shingles = None):
  if all_shingles is None:
    all_shingles = get_all_shingles(k)
  if shingles_per_column is None:
    shingles_per_column = [shingle(k, df.iloc[i]['Body']) for i in range(len(df))]

  return [[1 if shin in col_i else 0 for col_i in shingles_per_column] for shin in all_shingles]

def build_signature_matrix(df, k = 6, char_matrix = None, functions = None, number_of_functions = 100):
  if char_matrix is None:
    char_matrix = build_characteristic_matrix(df, k)

  if functions is None:
    functions = build_hashrow_functions(number_of_functions, len(char_matrix))
  sig_matrix = [[len(char_matrix) + 1 for _ in range(len(char_matrix[0]))] for _ in range(len(functions))]

  for i in range(len(char_matrix)):
    indexes = [f(i) for f in functions]
    for j in range(len(char_matrix[i])):
      if char_matrix[i][j] == 1:
        for l in range(len(sig_matrix)):
          sig_matrix[l][j] = min(sig_matrix[l][j], indexes[l])

  return pd.DataFrame(sig_matrix), pd.DataFrame(char_matrix)

def build_similarity_matrix(sig_df):
  return pd.DataFrame([[sim(sig_df[col1], sig_df[col2]) for col2 in sig_df] for col1 in sig_df])

def build_similarity_matrix_from_shingles(char_df):
  return pd.DataFrame([[sim_shingles(char_df[col1], char_df[col2]) for col2 in char_df] for col1 in char_df])

In [36]:
signature, characteristic = build_signature_matrix(df, k=4, number_of_functions=300)

build_similarity_matrix(signature)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.013333,0.016667,0.026667,0.05,0.123333,0.103333,0.07,0.023333,0.03
1,0.013333,1.0,0.033333,0.136667,0.066667,0.076667,0.116667,0.046667,0.066667,0.02
2,0.016667,0.033333,1.0,0.123333,0.056667,0.1,0.026667,0.033333,0.073333,0.043333
3,0.026667,0.136667,0.123333,1.0,0.063333,0.09,0.123333,0.04,0.0,0.02
4,0.05,0.066667,0.056667,0.063333,1.0,0.12,0.076667,0.083333,0.193333,0.13
5,0.123333,0.076667,0.1,0.09,0.12,1.0,0.07,0.13,0.096667,0.096667
6,0.103333,0.116667,0.026667,0.123333,0.076667,0.07,1.0,0.086667,0.073333,0.116667
7,0.07,0.046667,0.033333,0.04,0.083333,0.13,0.086667,1.0,0.083333,0.1
8,0.023333,0.066667,0.073333,0.0,0.193333,0.096667,0.073333,0.083333,1.0,0.193333
9,0.03,0.02,0.043333,0.02,0.13,0.096667,0.116667,0.1,0.193333,1.0


In [34]:
build_similarity_matrix_from_shingles(characteristic)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.015954,0.015716,0.012731,0.054185,0.023127,0.022346,0.024138,0.014728,0.016608
1,0.015954,1.0,0.018838,0.014344,0.022065,0.023853,0.026071,0.022698,0.007261,0.014851
2,0.015716,0.018838,1.0,0.025316,0.017544,0.022562,0.02924,0.021251,0.026484,0.03209
3,0.012731,0.014344,0.025316,1.0,0.017268,0.024816,0.028037,0.013505,0.004141,0.034512
4,0.054185,0.022065,0.017544,0.017268,1.0,0.052718,0.015015,0.046655,0.049763,0.050698
5,0.023127,0.023853,0.022562,0.024816,0.052718,1.0,0.032628,0.028558,0.02381,0.049943
6,0.022346,0.026071,0.02924,0.028037,0.015015,0.032628,1.0,0.021916,0.023952,0.026358
7,0.024138,0.022698,0.021251,0.013505,0.046655,0.028558,0.021916,1.0,0.022422,0.034467
8,0.014728,0.007261,0.026484,0.004141,0.049763,0.02381,0.023952,0.022422,1.0,0.047941
9,0.016608,0.014851,0.03209,0.034512,0.050698,0.049943,0.026358,0.034467,0.047941,1.0


In [15]:
crm = [[1,0,0,1],[0,0,1,0],[0,1,0,1],[1,0,1,1],[0,0,1,0]]
funs = [lambda x : (x + 1) % 5, lambda x : (3*x + 1) % 5]

sig, crm = build_signature_matrix(None, k=4, char_matrix=crm, functions=funs)

build_similarity_matrix(sig)

Unnamed: 0,0,1,2,3
0,1.0,0.0,0.5,1.0
1,0.0,1.0,0.0,0.0
2,0.5,0.0,1.0,0.5
3,1.0,0.0,0.5,1.0
