In [1]:
%matplotlib inline

import argparse
from utils import ipyth_utils_par
from utils.ipyth_ck12_training_class import *
import csv
import numpy as np
import pandas as pd
import codecs
import time
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import scale, LabelBinarizer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import cross_validation
from sklearn.ensemble import AdaBoostClassifier
import matplotlib.pyplot as plt

from multiprocessing import Pool
import gc
gc.enable()



Some useful functions

In [2]:
# CK-12 topic pages whose keywords are collected by get_wiki_docs().
ck12_url_topic = [
    'https://www.ck12.org/earth-science/',
    'http://www.ck12.org/life-science/',
    'http://www.ck12.org/physical-science/',
    'http://www.ck12.org/biology/',
    'http://www.ck12.org/chemistry/',
    'http://www.ck12.org/physics/',
    'http://www.ck12.org/astronomy/',
    'http://www.ck12.org/history/',
]

# Directory of wiki documents: target of get_wiki_docs(), input of the tf-idf step.
wiki_docs_dir = '../data/wiki_data'

# Value passed as `workers=` to the tf-idf and parallel scoring calls.
N_WORKERS = 1

def get_wiki_docs():
    """Collect the CK-12 keywords for every topic URL and save the wiki docs.

    The keywords returned for each URL in ``ck12_url_topic`` are merged into a
    single set (so duplicates across topics are dropped) and then passed,
    together with ``wiki_docs_dir``, to ``get_save_wiki_docs``.
    """
    ck12_keywords = set()
    for url_topic in ck12_url_topic:
        ck12_keywords.update(ipyth_utils_par.get_keyword_from_url_topic(url_topic))

    # The visible imports bind `ipyth_utils_par` but never the bare name
    # `utils`, so the original `utils.get_save_wiki_docs(...)` call would fail
    # with a NameError unless the star import happens to supply `utils`.
    # NOTE(review): assumes ipyth_utils_par exposes get_save_wiki_docs -- confirm.
    ipyth_utils_par.get_save_wiki_docs(ck12_keywords, wiki_docs_dir)

class ck12_predict_loc(ck12_predict_cl):
    """Notebook-local subclass of ``ck12_predict_cl``; it overrides nothing."""


def evaluate_score (y_model, y_real):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_model : sequence, numpy array or pandas Series
        Predicted labels.
    y_real : sequence, numpy array or pandas Series
        True labels, compared with ``y_model`` position by position.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same shape.
    """
    # Convert first: comparing two plain lists with `==` gives a single bool,
    # which the previous `sum(...)` could not iterate over.
    predicted = np.asarray(y_model)
    actual = np.asarray(y_real)

    if predicted.shape != actual.shape:
        raise ValueError(
            "y_model and y_real must have the same shape, got %s and %s"
            % (predicted.shape, actual.shape))
    if actual.size == 0:
        raise ValueError("cannot compute a score for empty input")

    return float(np.mean(predicted == actual))

In [3]:
## Input parameters

# Name of the tab-separated data file read from ../data/.
fname_str = 'joined_set.tsv'

# Default value for the `docs_per_q` scoring argument.
docs_per_q = 10

# put 1 if you want to download wiki data before the data file is read
get_wiki_data = 0

# Parser object; no arguments are registered on it in this cell.
parser = argparse.ArgumentParser()

In [4]:
## READING THE DATA

# Optionally download the wiki documents first (controlled by get_wiki_data).
if get_wiki_data:
    print("run: parse wiki docs from urls")
    get_wiki_docs()

# Load the tab-separated data set into a DataFrame.
print("run: reading csv")
data = pd.read_csv(
    '../data/' + fname_str,
    sep='\t',
)

print("Data collected")

run: reading csv
Data collected


In [5]:
## BUILDING TF-IDF MODEL

print("Building TF-idf model")
start_time = time.time()

# Build the tf-idf dictionary from the documents in wiki_docs_dir.
ck12_prediction = ck12_predict_loc()
ck12_prediction.tf_idf_dict(
    wiki_docs_dir,
    n_gram=3,
    workers=N_WORKERS,
)
print("tf-idf collected")

# Elapsed wall-clock time, in minutes.
print("elapsed time: ", round((time.time() - start_time) / 60, 2))

#ck12_prediction.docs_tf

Building TF-idf model
running tf_idf
calculating tf-idf: 
tf-idf collected
elapsed time:  1.5


In [6]:
## Checking Stemmer

# Tokenize the text in column 1 of the first row without stemming, with
# stemming, and with the default setting, so the three outputs can be compared.
sample_text = data.iloc[0, 1].strip(" ")
print (ipyth_utils_par.tokenize(sample_text, ngram=1, do_stem=False), "\n")
print (ipyth_utils_par.tokenize(sample_text, ngram=1, do_stem=True))
print (ipyth_utils_par.tokenize(sample_text, ngram=1))

# Built from wiki_docs_dir with a forward slash: the previous literal
# "..\data\wiki_data\..." relied on invalid escape sequences (\d, \w, \p)
# and only resolved on Windows.
file_path = wiki_docs_dir + "/predation_in_life_science.txt"

#print (ipyth_utils_par.tokenize(open(file_path).read(),ngram=1, do_stem= False)[0:100])
#print (ipyth_utils_par.tokenize(open(file_path).read(),ngram=1, do_stem= True)[0:100])

##ck12_prediction.docs_tf['predation_in_life_science.txt']

['athletes', 'begin', 'exercise', 'heart', 'rates', 'respiration', 'rates', 'increase', 'level', 'organization', 'human', 'body', 'coordinate', 'functions'] 

['athlet', 'begin', 'exercis', 'heart', 'rate', 'respir', 'rate', 'increas', 'level', 'organ', 'human', 'bodi', 'coordin', 'function']
['athlet', 'begin', 'exercis', 'heart', 'rate', 'respir', 'rate', 'increas', 'level', 'organ', 'human', 'bodi', 'coordin', 'function']


In [None]:
## Checking that we have correct ngrams for some word in the data
#print (ck12_prediction.docs_tf['21st_century_tsunami.txt'].keys())

In [None]:
## PREDICTING DATA

print("run: predicting data")
start_time = time.time()

# Use the docs_per_q setting from the input-parameters cell rather than
# repeating its value as a literal here.
res, prob_scores = ck12_prediction.similar_score_paral(
    data, docs_per_q=docs_per_q, n_gram=3, workers=N_WORKERS)

# Elapsed wall-clock time, in minutes.
print ("elapsed time: ", round((time.time() - start_time) / 60, 2))
print ("finished predicting probabilities")

In [None]:
## EVALUATING SCORES

# Compare the first 2500 predictions with column 6 of the first 2500 data rows.
y_pred = res[:2500]
y = data.iloc[:2500, 6]
print(evaluate_score(y_pred, y))


Running a parameter query test across the data

In [None]:
### Generating dictionary - probably too extensive

#ck12_prediction = ck12_predict_cl ()
#ck12_prediction.tf_idf_dict()

In [None]:
## Should be done in parallel

# MAKE RUN ACROSS PARAMETERS
# Values of docs_per_q to try: 3, 4, 5.
par_arange = np.arange(3,6,1)

# Reference labels are the same for every parameter value, so slice them once.
y = data.iloc[0:2500,6]

scor_li = []
for par_iter in par_arange:
    print("run: predicting data")
    res, prob_scores = ck12_prediction.similar_score(data, docs_per_q = par_iter, n_gram = 3)

    # Arrange the flat scores as rows of 4 columns. The row count must be an
    # int: `len(...) / 4` is a float in Python 3, which np.resize rejects.
    prob_scores = np.array(prob_scores).flatten()
    prob_scores = np.resize(prob_scores, (len(prob_scores) // 4, 4))
    print ("finished preciting probabilities:")
    print (prob_scores[0:2,:])

    # Score once and reuse the value for both the log line and the result list.
    y_pred = res[0:2500]
    score = evaluate_score(y_pred, y)
    print ("parameter: ",par_iter, " : ",score)
    scor_li.append(score)
    

In [None]:
# Plot the score obtained for each docs_per_q value tried in the sweep.
fig, ax = plt.subplots()
ax.plot(par_arange, scor_li)

ax.set_title("Number of similar docs to look for vs Model perfomance")
# Axis labels so the figure can be read without the surrounding code.
ax.set_xlabel("docs_per_q")
ax.set_ylabel("evaluate_score (fraction of matching answers)")
plt.show()

In [None]:
## Optimal number of docs is 15, however it's good to try small too

res, prob_scores = ck12_prediction.similar_score(
    data,
    docs_per_q=10,
    n_gram=3,
)
print("finished predicting probabilities")

# Compare the first 2500 predictions with column 6 of the first 2500 data rows.
y_pred = res[:2500]
y = data.iloc[:2500, 6]
print(evaluate_score(y_pred, y))

In [None]:
#save result
ids = list(data['id'])

# Predicted answers, one row per id.
pd.DataFrame({'id': ids, 'correctAnswer': res})[['id', 'correctAnswer']].to_csv(
    "../predictions/prediction_ck12.csv", index = False)

# Arrange the flat scores as rows of 4 columns. The row count must be an int:
# `len(...) / 4` is a float in Python 3, which np.resize rejects as a shape.
prob_scores = np.array(prob_scores).flatten()
prob_scores = np.resize(prob_scores, (len(prob_scores) // 4, 4))

# NOTE(review): row 0 is skipped by the `[1:]` slices -- presumably
# similar_score returns a leading placeholder row; confirm, because otherwise
# the probability columns are one row shorter than `ids`.
pd.DataFrame({
    'id': ids,
    'probA': prob_scores[1:, 0],
    'probB': prob_scores[1:, 1],
    'probC': prob_scores[1:, 2],
    'probD': prob_scores[1:, 3],
}).to_csv("../predictions/prob_prediction_ck12.csv", index = False)

