In [55]:
import os
import re
import random
import logging
import json
import csv
from pprint import pprint
import gensim
from collections import defaultdict

In [56]:
# Route log records (gensim reports its training progress through `logging`)
# to the notebook output at INFO level, as "timestamp : level : message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [60]:
# Where things live, relative to the notebook's working directory:
#   fname      - name of the project CSV inside data_dir
#   models_dir - directory where trained models are stored
#   data_dir   - directory holding the input data
fname = 'donorschooseorg_challenge_project-10.csv'
models_dir = '../../../models/'
data_dir = '../../../data/'

In [61]:
# Class to serve up tagged documents
class TeacherDocs(object):
    """Re-iterable stream of gensim ``TaggedDocument`` objects read from a CSV file.

    Every call to ``__iter__`` re-opens the file, so one instance can be
    iterated several times (the notebook passes the same instance to both
    ``build_vocab`` and ``train``).

    Each data row becomes one document:

    * words: columns 2, 3 and 4, lower-cased, split on whitespace and
      concatenated in that order;
    * tag:   the value of column 13 (presumably the teacher id -- rows that
      share this value share one document vector).

    Args:
        path: Optional path of the CSV file. When omitted, the file is
            ``os.path.join(data_dir, fname)``, resolved at iteration time
            from the module-level settings.
    """

    def __init__(self, path=None):
        # Keep None as-is; the default location is resolved lazily in
        # __iter__ so later changes to data_dir / fname are honoured.
        self.path = path

    def __iter__(self):
        """Yield one ``TaggedDocument`` per data row (the header row is skipped)."""
        path = self.path if self.path is not None else os.path.join(data_dir, fname)
        # newline='' is what the csv module requires so that line breaks
        # embedded inside quoted fields are parsed as part of the field.
        with open(path, newline='') as csvfile:
            recreader = csv.reader(csvfile, delimiter=',', quotechar='"')
            # Skip the header row; the default keeps an empty file from raising.
            next(recreader, None)
            for line in recreader:
                full_doc = (
                    line[2].lower().split()
                    + line[3].lower().split()
                    + line[4].lower().split()
                )
                yield gensim.models.doc2vec.TaggedDocument(full_doc, [line[13]])

In [62]:
# doc2vec params
window = 5
size = 100
# NOTE: while bust_cache is True the model is retrained and re-saved on every
# run, even when a saved model already exists. Set it to False to reuse the
# model stored at model_path.
bust_cache = True
model_path = os.path.join(models_dir, 'd2v_l_s-{}_w-{}.model'.format(size, window))
# Build the model if it doesn't already exist (or if a rebuild is forced)
if bust_cache or not os.path.exists(model_path):
    # Create the output directory *before* training, so a missing directory
    # does not make model.save() fail only after the long training run.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    teacherlines = TeacherDocs()
    # `vector_size` / `epochs` are the current names of the deprecated
    # `size` / `iter` keyword arguments.
    model = gensim.models.doc2vec.Doc2Vec(
        vector_size=size, min_count=5, window=window, epochs=25, workers=12
    )
    model.build_vocab(teacherlines)
    model.train(teacherlines, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(model_path)
else:
    model = gensim.models.doc2vec.Doc2Vec.load(model_path)

2018-11-03 16:44:25,535 : INFO : collecting all words and their counts
2018-11-03 16:44:25,537 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-11-03 16:44:26,653 : INFO : PROGRESS: at example #10000, processed 2688500 words (2410572/s), 56555 word types, 9370 tags
2018-11-03 16:44:27,760 : INFO : PROGRESS: at example #20000, processed 5343915 words (2399283/s), 79542 word types, 17903 tags
2018-11-03 16:44:28,961 : INFO : PROGRESS: at example #30000, processed 8036099 words (2244018/s), 98211 word types, 25219 tags
2018-11-03 16:44:30,256 : INFO : PROGRESS: at example #40000, processed 10742150 words (2090960/s), 114399 word types, 32698 tags
2018-11-03 16:44:31,399 : INFO : PROGRESS: at example #50000, processed 13454648 words (2374007/s), 128518 word types, 39820 tags
2018-11-03 16:44:32,494 : INFO : PROGRESS: at example #60000, processed 16134192 words (2450261/s), 141493 word types, 46221 tags
2018-11-03 16:44:33,617 : INFO : PROGRESS: at exampl

2018-11-03 16:45:18,461 : INFO : EPOCH 2 - PROGRESS: at 50.66% examples, 619919 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:45:19,466 : INFO : EPOCH 2 - PROGRESS: at 54.05% examples, 620012 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:45:20,469 : INFO : EPOCH 2 - PROGRESS: at 57.44% examples, 620161 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:45:21,471 : INFO : EPOCH 2 - PROGRESS: at 60.87% examples, 620594 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:45:22,480 : INFO : EPOCH 2 - PROGRESS: at 64.22% examples, 619693 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:45:23,489 : INFO : EPOCH 2 - PROGRESS: at 67.62% examples, 619267 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:45:24,492 : INFO : EPOCH 2 - PROGRESS: at 71.01% examples, 619749 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:45:25,497 : INFO : EPOCH 2 - PROGRESS: at 74.35% examples, 620100 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:45:26,499 : INFO : EPOCH 2 - PROGRESS: at 77.75% examples, 620503 words/s,

2018-11-03 16:46:10,966 : INFO : EPOCH 4 - PROGRESS: at 45.60% examples, 639452 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:46:11,970 : INFO : EPOCH 4 - PROGRESS: at 49.07% examples, 640194 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:46:12,984 : INFO : EPOCH 4 - PROGRESS: at 53.04% examples, 645553 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:46:13,985 : INFO : EPOCH 4 - PROGRESS: at 57.17% examples, 652805 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:46:14,988 : INFO : EPOCH 4 - PROGRESS: at 61.34% examples, 659101 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:46:15,989 : INFO : EPOCH 4 - PROGRESS: at 65.45% examples, 664026 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:46:16,995 : INFO : EPOCH 4 - PROGRESS: at 69.58% examples, 668703 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:46:18,017 : INFO : EPOCH 4 - PROGRESS: at 73.74% examples, 673397 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:46:19,024 : INFO : EPOCH 4 - PROGRESS: at 77.83% examples, 677444 words/s,

2018-11-03 16:47:03,403 : INFO : EPOCH 6 - PROGRESS: at 37.87% examples, 628765 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:47:04,419 : INFO : EPOCH 6 - PROGRESS: at 41.34% examples, 629312 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:47:05,423 : INFO : EPOCH 6 - PROGRESS: at 44.89% examples, 631365 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:47:06,426 : INFO : EPOCH 6 - PROGRESS: at 48.22% examples, 630680 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:47:07,428 : INFO : EPOCH 6 - PROGRESS: at 51.57% examples, 630220 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:47:08,435 : INFO : EPOCH 6 - PROGRESS: at 55.08% examples, 630852 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:47:09,438 : INFO : EPOCH 6 - PROGRESS: at 58.47% examples, 630325 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:47:10,445 : INFO : EPOCH 6 - PROGRESS: at 61.94% examples, 630495 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:47:11,451 : INFO : EPOCH 6 - PROGRESS: at 65.48% examples, 631013 words/s,

2018-11-03 16:47:56,221 : INFO : EPOCH 8 - PROGRESS: at 31.31% examples, 634159 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:47:57,225 : INFO : EPOCH 8 - PROGRESS: at 34.68% examples, 633560 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:47:58,227 : INFO : EPOCH 8 - PROGRESS: at 38.21% examples, 634455 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:47:59,229 : INFO : EPOCH 8 - PROGRESS: at 41.68% examples, 635249 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:48:00,238 : INFO : EPOCH 8 - PROGRESS: at 45.19% examples, 636080 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:48:01,243 : INFO : EPOCH 8 - PROGRESS: at 48.70% examples, 637480 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:48:02,252 : INFO : EPOCH 8 - PROGRESS: at 52.23% examples, 638167 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:48:03,261 : INFO : EPOCH 8 - PROGRESS: at 55.69% examples, 637787 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:48:04,271 : INFO : EPOCH 8 - PROGRESS: at 59.24% examples, 638164 words/s,

2018-11-03 16:48:48,838 : INFO : EPOCH 10 - PROGRESS: at 28.27% examples, 642593 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:48:49,857 : INFO : EPOCH 10 - PROGRESS: at 31.82% examples, 643782 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:48:50,873 : INFO : EPOCH 10 - PROGRESS: at 35.33% examples, 643623 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:48:51,886 : INFO : EPOCH 10 - PROGRESS: at 38.84% examples, 642978 words/s, in_qsize 24, out_qsize 0
2018-11-03 16:48:52,892 : INFO : EPOCH 10 - PROGRESS: at 42.25% examples, 641673 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:48:53,899 : INFO : EPOCH 10 - PROGRESS: at 45.72% examples, 641531 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:48:54,907 : INFO : EPOCH 10 - PROGRESS: at 49.22% examples, 642481 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:48:55,913 : INFO : EPOCH 10 - PROGRESS: at 52.74% examples, 642506 words/s, in_qsize 24, out_qsize 0
2018-11-03 16:48:56,914 : INFO : EPOCH 10 - PROGRESS: at 56.19% examples, 642190

2018-11-03 16:49:40,153 : INFO : EPOCH 12 - PROGRESS: at 21.34% examples, 642123 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:49:41,157 : INFO : EPOCH 12 - PROGRESS: at 24.81% examples, 641225 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:49:42,170 : INFO : EPOCH 12 - PROGRESS: at 28.27% examples, 640028 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:49:43,170 : INFO : EPOCH 12 - PROGRESS: at 31.78% examples, 641995 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:49:44,174 : INFO : EPOCH 12 - PROGRESS: at 35.21% examples, 641398 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:49:45,179 : INFO : EPOCH 12 - PROGRESS: at 38.77% examples, 642128 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:49:46,179 : INFO : EPOCH 12 - PROGRESS: at 42.22% examples, 641760 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:49:47,184 : INFO : EPOCH 12 - PROGRESS: at 46.34% examples, 651331 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:49:48,184 : INFO : EPOCH 12 - PROGRESS: at 50.13% examples, 655331

2018-11-03 16:50:31,765 : INFO : EPOCH 14 - PROGRESS: at 10.65% examples, 642552 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:50:32,771 : INFO : EPOCH 14 - PROGRESS: at 14.31% examples, 647576 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:50:33,773 : INFO : EPOCH 14 - PROGRESS: at 17.84% examples, 646882 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:50:34,793 : INFO : EPOCH 14 - PROGRESS: at 21.46% examples, 645577 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:50:35,799 : INFO : EPOCH 14 - PROGRESS: at 24.96% examples, 645022 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:50:36,802 : INFO : EPOCH 14 - PROGRESS: at 28.31% examples, 641439 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:50:37,810 : INFO : EPOCH 14 - PROGRESS: at 31.82% examples, 642840 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:50:38,813 : INFO : EPOCH 14 - PROGRESS: at 35.13% examples, 640024 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:50:39,820 : INFO : EPOCH 14 - PROGRESS: at 38.66% examples, 640150

2018-11-03 16:51:25,447 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-03 16:51:25,450 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-11-03 16:51:25,453 : INFO : EPOCH - 15 : training on 23241010 raw words (16542970 effective words) took 27.5s, 602494 effective words/s
2018-11-03 16:51:26,475 : INFO : EPOCH 16 - PROGRESS: at 2.65% examples, 485327 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:51:27,480 : INFO : EPOCH 16 - PROGRESS: at 5.55% examples, 507822 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:51:28,485 : INFO : EPOCH 16 - PROGRESS: at 8.62% examples, 524740 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:51:29,492 : INFO : EPOCH 16 - PROGRESS: at 11.84% examples, 541582 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:51:30,494 : INFO : EPOCH 16 - PROGRESS: at 14.85% examples, 542461 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:51:31,531 : INFO : EPOCH 16 - PROGRESS: at 17.88% examples, 541088 words/s, in_qsize 23, ou

2018-11-03 16:52:20,591 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-11-03 16:52:20,593 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-11-03 16:52:20,604 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-11-03 16:52:20,621 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-03 16:52:20,624 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-11-03 16:52:20,629 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-03 16:52:20,637 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-11-03 16:52:20,638 : INFO : EPOCH - 17 : training on 23241010 raw words (16543812 effective words) took 26.8s, 618132 effective words/s
2018-11-03 16:52:21,684 : INFO : EPOCH 18 - PROGRESS: at 3.22% examples, 574592 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:52:22,696 : INFO : EPOCH 18 - PROGRESS: at 6.92% examples, 619364 words/s, in_qsize 23, out_qs

2018-11-03 16:53:14,079 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-11-03 16:53:14,084 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-11-03 16:53:14,088 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-11-03 16:53:14,096 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-11-03 16:53:14,100 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-11-03 16:53:14,118 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-03 16:53:14,122 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-11-03 16:53:14,126 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-03 16:53:14,127 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-11-03 16:53:14,128 : INFO : EPOCH - 19 : training on 23241010 raw words (16544760 effective words) took 26.8s, 618188 effective words/s
2018-11-03 16:53:15,174 : INFO : EPOCH 2

2018-11-03 16:54:08,523 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-11-03 16:54:08,530 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-11-03 16:54:08,535 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-11-03 16:54:08,558 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-11-03 16:54:08,570 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-11-03 16:54:08,583 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-11-03 16:54:08,590 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-11-03 16:54:08,595 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-11-03 16:54:08,604 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-03 16:54:08,611 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-11-03 16:54:08,622 : INFO : worker thread finished; awaiting finish of 1 more threa

2018-11-03 16:55:00,425 : INFO : EPOCH 23 - PROGRESS: at 74.21% examples, 590889 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:55:01,433 : INFO : EPOCH 23 - PROGRESS: at 77.68% examples, 592977 words/s, in_qsize 24, out_qsize 0
2018-11-03 16:55:02,442 : INFO : EPOCH 23 - PROGRESS: at 81.83% examples, 589437 words/s, in_qsize 23, out_qsize 1
2018-11-03 16:55:03,469 : INFO : EPOCH 23 - PROGRESS: at 87.38% examples, 585837 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:55:04,493 : INFO : EPOCH 23 - PROGRESS: at 92.78% examples, 582544 words/s, in_qsize 24, out_qsize 0
2018-11-03 16:55:05,496 : INFO : EPOCH 23 - PROGRESS: at 98.36% examples, 580407 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:55:05,635 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-11-03 16:55:05,644 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-11-03 16:55:05,653 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-11-03 16:55:05,659 : INFO : wo

2018-11-03 16:55:51,826 : INFO : EPOCH 25 - PROGRESS: at 53.83% examples, 579402 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:55:52,836 : INFO : EPOCH 25 - PROGRESS: at 57.21% examples, 581568 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:55:53,840 : INFO : EPOCH 25 - PROGRESS: at 60.60% examples, 583541 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:55:54,845 : INFO : EPOCH 25 - PROGRESS: at 64.02% examples, 585386 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:55:55,867 : INFO : EPOCH 25 - PROGRESS: at 67.58% examples, 587600 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:55:56,881 : INFO : EPOCH 25 - PROGRESS: at 71.12% examples, 590468 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:55:57,887 : INFO : EPOCH 25 - PROGRESS: at 74.54% examples, 592673 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:55:58,898 : INFO : EPOCH 25 - PROGRESS: at 77.95% examples, 593982 words/s, in_qsize 23, out_qsize 0
2018-11-03 16:55:59,904 : INFO : EPOCH 25 - PROGRESS: at 82.46% examples, 590763

In [63]:
# Group the project rows by the value of column 13 (the same column that is
# used as the document tag when training), so that a tag can be mapped back
# to the projects behind it.
teachers = defaultdict(list)
# newline='' is what the csv module requires so that line breaks embedded
# inside quoted fields are parsed as part of the field.
with open(os.path.join(data_dir, fname), newline='') as csvfile:
    recreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # Skip the header row; the default keeps an empty file from raising.
    next(recreader, None)
    for line in recreader:
        teacher_dict = {
            "title": line[2],
            "essay1": line[3],
            "essay2": line[4],
            "school": line[14],
            "posted_date": line[6],
            "funded_date": line[7],
            "project_id": line[1],
        }
        teachers[line[13]].append(teacher_dict)

In [64]:
def _print_projects(projects):
    """Print every project record in ``projects``, numbered from 0.

    Args:
        projects: list of dicts with the keys "project_id", "posted_date",
            "funded_date", "title", "school", "essay1" and "essay2".
    """
    for i, project in enumerate(projects):
        print(" ---- {} ----".format(i))
        print("Project Id: {}".format(project["project_id"]))
        print("Posted: {}".format(project["posted_date"]))
        print("Funded: {}".format(project["funded_date"]))
        print("Title: {}".format(project["title"]))
        print("School: {}".format(project["school"]))
        print("Essay1: {}".format(project["essay1"]))
        print("Essay2: {}".format(project["essay2"]))
        print(" ---- end -- {} ----".format(i))


# Pick 5 random tags and show, for each, its own projects followed by the
# projects of its 5 nearest neighbours in the doc2vec space.
sample_teachers = random.sample(list(teachers.keys()), 5)

for sample_teacher in sample_teachers:
    print("=== {} ===".format(sample_teacher))
    _print_projects(teachers[sample_teacher])
    # most_similar() already leaves the query tag out of its results, so the
    # first hit is the best match and must not be sliced off.
    sims = model.docvecs.most_similar(sample_teacher, topn=5)
    print("---------------------- SIMILAR ------------------------")
    for c, (sim, sim_score) in enumerate(sims, 1):
        print("---------------------- {} ------------------------".format(c))
        print("=== {} - {} ===".format(sim, sim_score))
        _print_projects(teachers[sim])
        print("---------------------- END - {} ------------------------".format(c))
    print("----------------------  END ALL ------------------------")

2018-11-03 16:56:19,119 : INFO : precomputing L2-norms of doc weight vectors


=== 3ebf292ee8c9ecf3d1b3b5fd1316046e ===
 ---- 0 ----
Project Id: 92129fe3dd8429d4a18a5bdbb2fa3678
Posted: 2016-04-12
Funded: 2016-07-05
Title: Exceptional Students Need Exceptional Learning Materials!
School: a2285168142906406ed747cd7aecc6c1
Essay1: i am a first year special education teacher starting from scratch. my classroom has 6 students with autism, add/adhd, multiple disabilities, behavior deficits, and speech and language impairments. the materials that i am requesting will help enrich my students' learning experience tremendously! 
Essay2: our school is low-income with limited resources that financially can't always provide the extra materials to promote success.my classroom is made up of 6 spunky, very verbal, and curious  children with disabilities in the 3rd and 4th grade who strive to each learn and grow in their own way. in our classroom, each child has their own set of goals and reach those goals using different learning styles. in order to meet the needs of each child,