In [1]:
import os
import re
import random
import logging
import json
import csv
from pprint import pprint
import gensim
from collections import defaultdict

In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
# Data location
data_dir = "../../../data/"
models_dir = "../../../models/"
fname = "donorschooseorg_challenge_project-10.csv"

In [4]:
teachers = defaultdict(list)
with open(os.path.join(data_dir, fname)) as csvfile:
    recreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    lc = 0
    for line in recreader:
        if lc == 0:
            lc+=1
            continue
        teacher_dict = { "title": line[2],
                        "essay1": line[3],
                        "essay2": line[4],
                        "school": line[14],
                        "posted_date": line[6],
                        "funded_date": line[7],
                        "project_id": line[1],
                        "subj_cat": line[9],
                        "subj_subcat": line[10],
                        "grade_cat": line[11],
                        "resc_type": line[12],
                       }
        teachers[line[13]].append(teacher_dict)

In [9]:
# Class to serve up tagged documents
class TeacherDocs(object):
    
    def __init__(self):
        pass
      
#     def __iter__(self):
#         # Open the project file
#         with open(os.path.join(data_dir, fname)) as csvfile:
#             recreader = csv.reader(csvfile, delimiter=',', quotechar='"')
#             lc = 0
#             for line in recreader:
#                 if lc == 0:
#                     lc+=1
#                     continue
#                 full_doc =  line[2].lower().split() + line[3].lower().split() + line[4].lower().split()
#                 yield gensim.models.doc2vec.TaggedDocument(full_doc, [line[13]])
    
    def __iter__(self):
        # Open the project file
        for teacher in teachers:
            all_text = []
            for rec in teachers[teacher]:
                text_piece = rec["title"].strip().lower().split() + rec["essay1"].strip().lower().split() + rec["essay2"].strip().lower().split()
                text_piece.append("_".join(rec["subj_cat"].strip().lower().split()))
                text_piece.append("_".join(rec["subj_subcat"].strip().lower().split()))
                text_piece.append("_".join(rec["grade_cat"].strip().lower().split()))
                text_piece.append("_".join(rec["resc_type"].strip().lower().split()))
            all_text += text_piece    
            
            yield gensim.models.doc2vec.TaggedDocument(all_text, [teacher])

In [10]:
# doc2vec params
window = 5
size = 100
bust_cache = True
model_path = os.path.join(models_dir, 'd2v_l_s-{}_w-{}.model'.format(size, window))
# Build the model if it doesnt already exist
if not os.path.exists(model_path) or bust_cache:
    teacherlines = TeacherDocs() 
    model = gensim.models.doc2vec.Doc2Vec(size=size, min_count=5, window=window, iter=25,  workers=12)        
    model.build_vocab(teacherlines)
    model.train(teacherlines, total_examples=model.corpus_count, epochs=model.iter)
    model.save(model_path) 
else:
    model = gensim.models.doc2vec.Doc2Vec.load(model_path) 

2018-11-03 20:08:44,678 : INFO : collecting all words and their counts
2018-11-03 20:08:44,679 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-11-03 20:08:45,747 : INFO : PROGRESS: at example #10000, processed 2625720 words (2459502/s), 55995 word types, 10000 tags
2018-11-03 20:08:46,803 : INFO : PROGRESS: at example #20000, processed 5190779 words (2431846/s), 79358 word types, 20000 tags
2018-11-03 20:08:47,779 : INFO : PROGRESS: at example #30000, processed 7818262 words (2694814/s), 99106 word types, 30000 tags
2018-11-03 20:08:48,697 : INFO : PROGRESS: at example #40000, processed 10475720 words (2896290/s), 115555 word types, 40000 tags
2018-11-03 20:08:49,721 : INFO : PROGRESS: at example #50000, processed 13076439 words (2541470/s), 130462 word types, 50000 tags
2018-11-03 20:08:50,491 : INFO : PROGRESS: at example #60000, processed 15398656 words (3019650/s), 142565 word types, 60000 tags
2018-11-03 20:08:50,966 : INFO : PROGRESS: at examp

2018-11-03 20:09:29,656 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-11-03 20:09:29,666 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-11-03 20:09:29,672 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-03 20:09:29,683 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-11-03 20:09:29,686 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-03 20:09:29,694 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-11-03 20:09:29,695 : INFO : EPOCH - 2 : training on 16974218 raw words (12131588 effective words) took 18.4s, 658781 effective words/s
2018-11-03 20:09:30,727 : INFO : EPOCH 3 - PROGRESS: at 4.47% examples, 581830 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:09:31,737 : INFO : EPOCH 3 - PROGRESS: at 9.64% examples, 629404 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:09:32,748 : INFO : EPOCH 3 - PROGRESS: at 14.76% examples, 647151 words/s,

2018-11-03 20:10:16,369 : INFO : EPOCH 5 - PROGRESS: at 44.40% examples, 650969 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:10:17,369 : INFO : EPOCH 5 - PROGRESS: at 49.76% examples, 657704 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:10:18,374 : INFO : EPOCH 5 - PROGRESS: at 54.60% examples, 657201 words/s, in_qsize 24, out_qsize 0
2018-11-03 20:10:19,381 : INFO : EPOCH 5 - PROGRESS: at 59.59% examples, 657210 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:10:20,402 : INFO : EPOCH 5 - PROGRESS: at 65.08% examples, 661278 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:10:21,406 : INFO : EPOCH 5 - PROGRESS: at 70.51% examples, 665164 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:10:22,419 : INFO : EPOCH 5 - PROGRESS: at 75.85% examples, 667707 words/s, in_qsize 24, out_qsize 0
2018-11-03 20:10:23,425 : INFO : EPOCH 5 - PROGRESS: at 81.01% examples, 668406 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:10:24,430 : INFO : EPOCH 5 - PROGRESS: at 89.51% examples, 665093 words/s,

2018-11-03 20:11:05,214 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-11-03 20:11:05,216 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-11-03 20:11:05,228 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-11-03 20:11:05,241 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-11-03 20:11:05,244 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-11-03 20:11:05,251 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-03 20:11:05,265 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-11-03 20:11:05,270 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-03 20:11:05,272 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-11-03 20:11:05,273 : INFO : EPOCH - 7 : training on 16974218 raw words (12133483 effective words) took 20.0s, 606237 effective words/s
2018-11-03 20:11:06,300 : INFO : EPOCH 8 

2018-11-03 20:11:49,013 : INFO : EPOCH 10 - PROGRESS: at 35.53% examples, 665624 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:11:50,017 : INFO : EPOCH 10 - PROGRESS: at 40.86% examples, 670792 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:11:51,028 : INFO : EPOCH 10 - PROGRESS: at 46.33% examples, 677417 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:11:52,035 : INFO : EPOCH 10 - PROGRESS: at 51.67% examples, 680940 words/s, in_qsize 24, out_qsize 0
2018-11-03 20:11:53,038 : INFO : EPOCH 10 - PROGRESS: at 57.24% examples, 686615 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:11:54,057 : INFO : EPOCH 10 - PROGRESS: at 62.73% examples, 688707 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:11:55,062 : INFO : EPOCH 10 - PROGRESS: at 68.20% examples, 691145 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:11:56,072 : INFO : EPOCH 10 - PROGRESS: at 73.73% examples, 693574 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:11:57,085 : INFO : EPOCH 10 - PROGRESS: at 79.27% examples, 696026

2018-11-03 20:12:34,757 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-11-03 20:12:34,759 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-03 20:12:34,778 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-11-03 20:12:34,784 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-11-03 20:12:34,789 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-11-03 20:12:34,789 : INFO : EPOCH - 12 : training on 16974218 raw words (12133887 effective words) took 17.4s, 695833 effective words/s
2018-11-03 20:12:35,804 : INFO : EPOCH 13 - PROGRESS: at 4.63% examples, 613482 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:12:36,813 : INFO : EPOCH 13 - PROGRESS: at 10.22% examples, 673826 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:12:37,818 : INFO : EPOCH 13 - PROGRESS: at 15.58% examples, 687537 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:12:38,824 : INFO : EPOCH 13 - PROGRESS: at 20.

2018-11-03 20:13:22,147 : INFO : EPOCH 15 - PROGRESS: at 63.52% examples, 699076 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:13:23,147 : INFO : EPOCH 15 - PROGRESS: at 69.01% examples, 701093 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:13:24,148 : INFO : EPOCH 15 - PROGRESS: at 74.32% examples, 701275 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:13:25,165 : INFO : EPOCH 15 - PROGRESS: at 79.64% examples, 701210 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:13:26,190 : INFO : EPOCH 15 - PROGRESS: at 86.96% examples, 694091 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:13:27,191 : INFO : EPOCH 15 - PROGRESS: at 94.98% examples, 687740 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:13:27,699 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-11-03 20:13:27,705 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-11-03 20:13:27,713 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-11-03 20:13:27,730 : INFO : wo

2018-11-03 20:14:03,774 : INFO : EPOCH 18 - PROGRESS: at 4.63% examples, 612387 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:14:04,777 : INFO : EPOCH 18 - PROGRESS: at 10.22% examples, 675481 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:14:05,786 : INFO : EPOCH 18 - PROGRESS: at 15.41% examples, 681090 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:14:06,794 : INFO : EPOCH 18 - PROGRESS: at 20.78% examples, 685337 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:14:07,806 : INFO : EPOCH 18 - PROGRESS: at 26.16% examples, 684886 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:14:08,822 : INFO : EPOCH 18 - PROGRESS: at 31.54% examples, 687513 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:14:09,823 : INFO : EPOCH 18 - PROGRESS: at 36.95% examples, 692474 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:14:10,825 : INFO : EPOCH 18 - PROGRESS: at 42.50% examples, 697989 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:14:11,829 : INFO : EPOCH 18 - PROGRESS: at 47.76% examples, 699086 

2018-11-03 20:14:55,419 : INFO : EPOCH 20 - PROGRESS: at 91.36% examples, 709499 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:14:56,328 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-11-03 20:14:56,347 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-11-03 20:14:56,349 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-11-03 20:14:56,365 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-11-03 20:14:56,380 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-11-03 20:14:56,388 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-11-03 20:14:56,392 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-11-03 20:14:56,399 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-11-03 20:14:56,401 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-11-03 20:14:56,415 : INFO : worker thread finished; awaitin

2018-11-03 20:15:36,608 : INFO : EPOCH 23 - PROGRESS: at 25.89% examples, 674155 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:15:37,612 : INFO : EPOCH 23 - PROGRESS: at 31.22% examples, 678722 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:15:38,625 : INFO : EPOCH 23 - PROGRESS: at 36.69% examples, 684921 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:15:39,626 : INFO : EPOCH 23 - PROGRESS: at 42.01% examples, 687764 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:15:40,631 : INFO : EPOCH 23 - PROGRESS: at 47.46% examples, 692315 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:15:41,631 : INFO : EPOCH 23 - PROGRESS: at 52.76% examples, 694805 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:15:42,644 : INFO : EPOCH 23 - PROGRESS: at 58.08% examples, 695480 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:15:43,647 : INFO : EPOCH 23 - PROGRESS: at 63.36% examples, 695353 words/s, in_qsize 24, out_qsize 0
2018-11-03 20:15:44,654 : INFO : EPOCH 23 - PROGRESS: at 68.90% examples, 697807

2018-11-03 20:16:28,170 : INFO : EPOCH 25 - PROGRESS: at 91.66% examples, 635406 words/s, in_qsize 23, out_qsize 0
2018-11-03 20:16:29,221 : INFO : EPOCH 25 - PROGRESS: at 98.42% examples, 626259 words/s, in_qsize 17, out_qsize 0
2018-11-03 20:16:29,333 : INFO : worker thread finished; awaiting finish of 11 more threads
2018-11-03 20:16:29,336 : INFO : worker thread finished; awaiting finish of 10 more threads
2018-11-03 20:16:29,338 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-11-03 20:16:29,343 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-11-03 20:16:29,347 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-11-03 20:16:29,357 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-11-03 20:16:29,375 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-11-03 20:16:29,380 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-11-03 20:16:29,385 : INFO : worker 

In [12]:
sample_teachers = random.sample(list(teachers.keys()), 5)

for sample_teacher in sample_teachers:
    print("=== {} ===".format(sample_teacher))
    for i in range(len(teachers[sample_teacher])):
        print(" ---- {} ----".format(i))
        print("Project Id: {}".format(teachers[sample_teacher][i]["project_id"]))
        print("Posted: {}".format(teachers[sample_teacher][i]["posted_date"]))
        print("Funded: {}".format(teachers[sample_teacher][i]["funded_date"]))
        print("Title: {}".format(teachers[sample_teacher][i]["title"]))
        print("School: {}".format(teachers[sample_teacher][i]["school"]))
        print("Essay1: {}".format(teachers[sample_teacher][i]["essay1"]))
        print("Essay2: {}".format(teachers[sample_teacher][i]["essay2"]))
        print(" ---- end -- {} ----".format(i))
    sims = model.docvecs.most_similar(sample_teacher, topn=6)[1:] 
    print("---------------------- SIMILAR ------------------------")
    c = 1
    for sim, sim_score in sims:
        print("---------------------- {} ------------------------".format(c))
        print("=== {} - {} ===".format(sim, sim_score))
        for i in range(len(teachers[sim])):
            print(" ---- {} ----".format(i))
            print("Project Id: {}".format(teachers[sim][i]["project_id"]))
            print("Posted: {}".format(teachers[sim][i]["posted_date"]))
            print("Funded: {}".format(teachers[sim][i]["funded_date"]))
            print("Title: {}".format(teachers[sim][i]["title"]))
            print("School: {}".format(teachers[sim][i]["school"]))
            print("Essay1: {}".format(teachers[sim][i]["essay1"]))
            print("Essay2: {}".format(teachers[sim][i]["essay2"]))
            print(" ---- end -- {} ----".format(i))
        sims = model.docvecs.most_similar(sim, topn=5)
        print("---------------------- END - {} ------------------------".format(c))
        c+=1
    print("----------------------  END ALL ------------------------")

=== ad1b2b616801661d16865cbcefcd5720 ===
 ---- 0 ----
Project Id: 89dae807c6b0e0df2d7b82055c94fe8a
Posted: 2018-06-08
Funded: 2018-06-17
Title: I Can See the Reader in Me!
School: f3b12691f6bc7a98fd09f61d625a1f51
Essay1: my students are very unique and diverse. they are a wonderful bunch of 3rd graders who have a thirst for learning and are very inquisitive and enthusiastic. it is my goal as an educator to help them grow and explore as readers. 

books have the power to instill confidence, grit, and resilience to all readers. they have the power to change us and allow us to understand the world around us. i truly want to foster a love for reading and compassion in my students by providing them with high interest books.


Essay2: books are powerful! they allow students to see themselves or look into the world around them. finding the just right book can change a reluctant reader into an avid reader. my goal with this project is to provide my 3rd graders with books where they are able to