In [52]:
import pandas as pd
import numpy as np
import os
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
pd.set_option('display.max_rows', 500)

## Reference

- https://medium.com/@deangelaneves/how-to-build-a-search-engine-from-scratch-in-python-part-1-96eb240f9ecb

In [13]:
df = pd.read_csv("data/2020-02-27_student-repos.csv")
df = df[df["content"] != "not read"]
df

Unnamed: 0.1,Unnamed: 0,repo_name,repo_full_name,file_name,size,path,url,encoding,content
0,0,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,README.md,11495,README.md,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,base64,b'# DSCI 542: Communication and Argumentation\...
8,8,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,lab1_solutions.ipynb,16855,lab presentations and solutions/lab1_solutions...,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,base64,"b'{\n ""cells"": [\n {\n ""cell_type"": ""markdo..."
9,9,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,lab3_example.ipynb,16479,lab presentations and solutions/lab3_example.i...,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,base64,"b'{\n ""cells"": [\n {\n ""cell_type"": ""markdo..."
10,10,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,report.md,7633,lab presentations and solutions/lab4_example/r...,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,base64,b'### Making Dinner Plans With Data Science\n\...
13,13,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,lecture1.md,7761,lectures/lecture1/lecture1.md,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,base64,"b""DSCI 542: Communication and Argumentation\n=..."
...,...,...,...,...,...,...,...,...,...
1976,1976,DSCI_574_spat-temp-mod_students,MDS-2019-20/DSCI_574_spat-temp-mod_students,lab1.md,13972,labs/release/lab1/lab1.md,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,base64,b'DSCI 574 Lab1\n================\nMDS 2019/20...
1985,1985,DSCI_553_stat-inf-2_students,MDS-2019-20/DSCI_553_stat-inf-2_students,README.md,3273,README.md,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,base64,b'# DSCI 553: Statistical Inference and Comput...
1987,1987,DSCI_553_stat-inf-2_students,MDS-2019-20/DSCI_553_stat-inf-2_students,lab1.ipynb,23360,release/lab1/lab1.ipynb,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,base64,"b'{\n ""cells"": [\n {\n ""cell_type"": ""markdo..."
1988,1988,DSCI_553_stat-inf-2_students,MDS-2019-20/DSCI_553_stat-inf-2_students,act1.ipynb,33796,release/lec1/act1.ipynb,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,base64,"b'{\n ""cells"": [\n {\n ""cell_type"": ""markdo..."


In [19]:
def text_preprocess(x):
    regex = re.compile('[^a-zA-Z ]')
    x = regex.sub('', x)
    x = x.lower()
    return x

In [20]:
df["content_clean"] = df.loc[:,"content"].apply(text_preprocess)
df["content_clean"]

0       b dsci  communication and argumentationnn todo...
8       bn cells n  n   celltype markdownn   metadata ...
9       bn cells n  n   celltype markdownn   metadata ...
10      b making dinner plans with data sciencennthere...
13      bdsci  communication and argumentationnnauthor...
                              ...                        
1976    bdsci  labnnmds  block  instructor tomas beuze...
1985    b dsci  statistical inference and computation ...
1987    bn cells n  n   celltype markdownn   metadata ...
1988    bn cells n  n   celltype markdownn   metadata ...
1991    bn cells n  n   celltype markdownn   metadata ...
Name: content_clean, Length: 552, dtype: object

In [22]:
tfid_vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,4))
X = tfid_vectorizer.fit_transform(df["content_clean"])

pd.DataFrame(data=X.toarray(), columns=tfid_vectorizer.get_feature_names())

In [69]:
search_keys = "mle pandas"
query = "mle pandas"
search_query_weights = tfid_vectorizer.transform(search_keys)
search_query_weights.toarray()

ValueError: Iterable over raw text documents expected, string object received.

In [67]:
def train_model(X_train):
    tfid_vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,4))
    X_train_weights = tfid_vectorizer.fit_transform(X_train)
    return (tfid_vectorizer, X_train_weights)

In [70]:
tfid_vectorizer, X_train_weights = train_model(df["content"])

In [73]:
def find_query_weights(X_query, tfid_vectorizer):
    X_query = text_preprocess(X_query)
    X_query_weights = tfid_vectorizer.transform([X_query])
    return X_query_weights

In [75]:
X_query_weights = find_query_weights("MLE pandas", tfid_vectorizer)
X_query_weights

mle pandas


<1x4617429 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [44]:
def cos_similarity(X_query_weights, X_train_weights):
    cosine_distance = cosine_similarity(X_query_weights, X_train_weights)
    similarity_list = cosine_distance[0]
    return similarity_list

In [76]:
sim_list = cos_similarity(X_query_weights, X_train_weights)
sim_list[0:4]

array([0., 0., 0., 0.])

In [78]:
def most_similar(similarity_list, min_talks=4):
    most_similar= []
    while min_talks > 0:
        tmp_index = np.argmax(similarity_list)
        most_similar.append(tmp_index)
        similarity_list[tmp_index] = 0
        min_talks -= 1
    return most_similar

In [79]:
most_similar(sim_list)

[436, 427, 271, 272]

In [60]:
df.iloc[436,]

Unnamed: 0                                                     1555
repo_name                              DSCI_552_stat-inf-1_students
repo_full_name             MDS-2019-20/DSCI_552_stat-inf-1_students
file_name            08_lecture-maximum-likelihood-estimation.ipynb
size                                                         256274
path              previous-lectures/08_lecture-maximum-likelihoo...
url               https://github.ubc.ca/api/v3/repos/MDS-2019-20...
encoding                                                     base64
content           b'{\n "cells": [\n  {\n   "cell_type": "markdo...
content_clean     bn cells n  n   celltype markdownn   metadata ...
Name: 1555, dtype: object

In [59]:
df.iloc[436,]["url"]

'https://github.ubc.ca/api/v3/repos/MDS-2019-20/DSCI_552_stat-inf-1_students/contents/previous-lectures/08_lecture-maximum-likelihood-estimation.ipynb?ref=master'

In [64]:
df.iloc[303,]


Unnamed: 0                                                      888
repo_name                                  DSCI_562_regr-2_students
repo_full_name                 MDS-2019-20/DSCI_562_regr-2_students
file_name                                                 README.md
size                                                           4235
path                                   lectures/2018/lec4/README.md
url               https://github.ubc.ca/api/v3/repos/MDS-2019-20...
encoding                                                     base64
content           b'# Lecture 4\n\n## Agenda\n\n- Re-address the...
content_clean     b lecture nn agendann readdress the table from...
Name: 888, dtype: object