- ### Imports

In [None]:
import pandas as pd
import numpy as np
import pickle
from statistics import mean
from scipy.sparse import hstack
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

- ### Reading Data

In [None]:
# Load the dataset used by the rest of the notebook.
per_data = pd.read_csv(
    '../datafiles/doners/final_data.csv'
)
# info() prints the schema summary as a side effect (and returns None);
# iloc[1] is the second row, shown as a sample record.
(per_data.info(), per_data.iloc[1])

In [None]:
# MultinomialNB does not accept negative feature values, so 'std_price' is dropped.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone is a no-op
# instead of a KeyError.
per_data = per_data.drop(columns=['std_price'], errors='ignore')

In [None]:
# Preview the first five rows of the working frame.
per_data.head(5)

- ## actions to perform

In [None]:
'''
 teacher_prefix                                                                   one-hot-encoding
 school_state                                                                     one-hot-encoding
 submission_y                                                                                 none   
 project_grade_category                                                           one-hot-encoding
 project_subject_categories                                                       one-hot-encoding
 project_subject_subcategories                                                    one-hot-encoding
 project_title                                                                           bow-tfidf
 title_len                                                                                    none
 teacher_number_of_previously_posted_projects                                                 none
 essay                                                                                   bow-tfidf
 essay_len                                                                                    none
 nrm_price                                                                                    none
 std_price                                                                                    none
 quantity                                                                                     none
 project_is_approved                                                                  decision-var   '''     

- ## Splitting

In [None]:
# Outer split: hold out 25% of the rows as the test set. The last column is
# used as the label and the split is stratified on it.
d_trainx, d_testx, d_trainy, d_testy = train_test_split(
    per_data.iloc[:, :-1],
    per_data.iloc[:, -1],
    test_size=0.25,
    stratify=per_data.iloc[:, -1],
    random_state=42,
)

In [None]:
# Shapes of the training features and training labels after the outer split.
d_trainx.shape,d_trainy.shape

In [None]:
# Class proportions in train vs. test; the split above is stratified on the label.
d_trainy.value_counts(normalize=True),d_testy.value_counts(normalize=True)

In [None]:
# Inner split: carve a stratified 25% validation (CV) set out of the training
# rows; the remaining 75% is the "sub-train" set (ds_*).
ds_trainx, d_cvx, ds_trainy, d_cvy = train_test_split(
    d_trainx,
    d_trainy,
    test_size=0.25,
    stratify=d_trainy,
    random_state=42,
)

In [None]:
# Class proportions in sub-train vs. CV; the inner split is stratified on the label.
ds_trainy.value_counts(normalize=True),d_cvy.value_counts(normalize=True)

In [None]:
# Absolute class counts in sub-train vs. CV.
ds_trainy.value_counts(),d_cvy.value_counts()

- # 1) Vectorization


- ## 1.1) BOW

In [None]:
# Bag-of-words for the essays (hold-out pipeline): the vocabulary is learned on
# the sub-train split only and then reused for the CV and test splits.
vec_essay = CountVectorizer(min_df=10)
ds_trainx_essay = vec_essay.fit_transform(ds_trainx['essay'])
d_cvx_essay = vec_essay.transform(d_cvx['essay'])
d_testx_essay = vec_essay.transform(d_testx['essay'])
# Sparse matrices expose .shape directly; calling .toarray() just to read the
# shape would allocate the full dense rows x vocabulary array.
ds_trainx_essay.shape, d_cvx_essay.shape, d_testx_essay.shape

In [None]:
# Bag-of-words for the project titles (hold-out pipeline): vocabulary learned on
# the sub-train split only, then applied to the CV and test splits.
vec_title = CountVectorizer(min_df=10)
ds_trainx_title = vec_title.fit_transform(ds_trainx['project_title'])
d_cvx_title = vec_title.transform(d_cvx['project_title'])
d_testx_title = vec_title.transform(d_testx['project_title'])
# Read the shape from the sparse matrix itself; no dense copy is needed.
ds_trainx_title.shape, d_cvx_title.shape, d_testx_title.shape

In [None]:
# Bag-of-words for the essays (k-fold pipeline): vocabulary learned on the whole
# training split (sub-train + CV rows), then applied to the test split.
vec_essay2 = CountVectorizer(min_df=10)
d_trainx_essay = vec_essay2.fit_transform(d_trainx['essay'])
d_testx_essay2 = vec_essay2.transform(d_testx['essay'])
# Read the shape from the sparse matrix itself; no dense copy is needed.
d_trainx_essay.shape, d_testx_essay2.shape

In [None]:
# Bag-of-words for the project titles (k-fold pipeline): vocabulary learned on
# the whole training split, then applied to the test split.
vec_title2 = CountVectorizer(min_df=10)
d_trainx_title = vec_title2.fit_transform(d_trainx['project_title'])
d_testx_title2 = vec_title2.transform(d_testx['project_title'])
# Read the shape from the sparse matrix itself; no dense copy is needed.
d_trainx_title.shape, d_testx_title2.shape

In [None]:
# ct_trn_data = per_data.drop(['essay','project_title'],axis=1)
# d_trainx.drop(['essay','project_title'],axis=1,inplace=True)
# ds_trainx.drop(['essay','project_title'],axis=1,inplace=True)
# d_testx.drop(['essay','project_title'],axis=1,inplace=True)
# d_cvx.drop(['essay','project_title'],axis=1,inplace=True)

In [None]:
# text_features = ['project_subject_subcategories','project_subject_categories','school_state','project_grade_category','teacher_prefix']
# text_transformer = Pipeline(steps=[
#     ('vect', CountVectorizer(binary=True))
# ])

# for x in text_features:
#     ct = ColumnTransformer(transformers=[('text', CountVectorizer(binary=True), x)])
#     ds_trainx_vec = preprocessor.fit_transform(ds_trainx)
#     print(ds_trainx_vec.toarray().shape)

# clf = Pipeline(steps=[('preprocessor', preprocessor)])
# clf.fit(ds_trainx)

# One-hot encode the categorical columns and pass every other remaining column
# through unchanged. The free-text columns are dropped first because they are
# vectorized separately.
# The encoder is fitted on the training split only, so no test rows take part in
# fitting. handle_unknown='ignore' makes a category that first appears at
# transform time encode as all zeros instead of raising.
text_cols = ['essay','project_title']
vec_ct = ColumnTransformer(
    transformers=[(
        'short',
        OneHotEncoder(handle_unknown='ignore'),
        ['project_subject_subcategories','project_subject_categories','school_state','project_grade_category','teacher_prefix'],
    )],
    remainder='passthrough',
)
vec_ct.fit(d_trainx.drop(text_cols,axis=1))
ds_trainx_vec = vec_ct.transform(ds_trainx.drop(text_cols,axis=1))
d_testx_vec = vec_ct.transform(d_testx.drop(text_cols,axis=1))
d_cvx_vec = vec_ct.transform(d_cvx.drop(text_cols,axis=1))
d_trainx_vec = vec_ct.transform(d_trainx.drop(text_cols,axis=1))

In [None]:
# Shapes of the encoded categorical/passthrough blocks, read straight from
# .shape so no dense copy of the matrices is created.
ds_trainx_vec.shape, d_testx_vec.shape, d_cvx_vec.shape, d_trainx_vec.shape

In [None]:
# Assemble the final design matrices column-wise: essay block, title block,
# then the one-hot/passthrough block, converted to CSR.
# d_testx_csr uses the hold-out vocabularies; d_testx2_csr uses the
# vocabularies fitted on the whole training split.
ds_trainx_csr = hstack([ds_trainx_essay, ds_trainx_title, ds_trainx_vec]).tocsr()
d_testx_csr = hstack([d_testx_essay, d_testx_title, d_testx_vec]).tocsr()
d_cvx_csr = hstack([d_cvx_essay, d_cvx_title, d_cvx_vec]).tocsr()
d_trainx_csr = hstack([d_trainx_essay, d_trainx_title, d_trainx_vec]).tocsr()
d_testx2_csr = hstack([d_testx_essay2, d_testx_title2, d_testx_vec]).tocsr()
(
    ds_trainx_csr.shape,
    d_testx_csr.shape,
    d_cvx_csr.shape,
    d_trainx_csr.shape,
    d_testx2_csr.shape,
)

In [None]:
# Cache the five feature matrices in one pickle stream; they must be read back
# in exactly this order.
with open('../datafiles/doners/donerschoose_pickle.pickle','wb') as pik:
    pickle.dump(ds_trainx_csr,pik)
    pickle.dump(d_testx_csr,pik)
    pickle.dump(d_cvx_csr,pik)
    pickle.dump(d_trainx_csr,pik)
    pickle.dump(d_testx2_csr,pik)
    # No explicit close(): the with-block closes the file on exit.

In [None]:
# Restore the five cached feature matrices, in the order they were dumped.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# wrote itself.
with open('../datafiles/doners/donerschoose_pickle.pickle','rb') as pik:
    ds_trainx_csr = pickle.load(pik)
    d_testx_csr = pickle.load(pik)
    d_cvx_csr = pickle.load(pik)
    d_trainx_csr = pickle.load(pik)
    d_testx2_csr = pickle.load(pik)
    # No explicit close(): the with-block closes the file on exit.

In [None]:
# Cache the label vectors in one pickle stream.
# d_testy is written twice on purpose-looking grounds: presumably so the five
# slots line up with the five feature matrices (d_testx_csr and d_testx2_csr
# are both built from the test rows) — NOTE(review): confirm before changing
# the layout.
with open('../datafiles/doners/donerschoose_pickle_y.pickle','wb') as pik:
    pickle.dump(ds_trainy,pik)
    pickle.dump(d_testy,pik)
    pickle.dump(d_cvy,pik)
    pickle.dump(d_trainy,pik)
    pickle.dump(d_testy,pik)
    # No explicit close(): the with-block closes the file on exit.

- # Naive Bayes

In [None]:
def Plot_roc(clf,X,Y,x,y,*cv):
    """Draw ROC curves for the test, train and (optionally) CV data on one figure.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict_proba``; column 1 of its output
        is used as the score.
    X, Y : features and labels for the curve labelled "Test" (green).
    x, y : features and labels for the curve labelled "Train" (blue).
    *cv : optional. When exactly two extra positionals are passed they are the
        features and labels for the curve labelled "CV" (yellow); any other
        count is ignored.

    The AUC of each curve is shown in the legend. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    def _curve(features, labels):
        # Score one dataset and return (fpr, tpr, auc).
        scores = clf.predict_proba(features)[:,1]
        false_pos, true_pos, _ = roc_curve(labels, scores)
        return false_pos, true_pos, roc_auc_score(labels, scores)

    test_fpr, test_tpr, test_auc = _curve(X, Y)
    plt.title('Receiver Operating Characteristic')
    plt.plot(test_fpr, test_tpr, 'g', label = 'AUC-Test = %0.2f' % test_auc)

    train_fpr, train_tpr, train_auc = _curve(x, y)
    plt.plot(train_fpr, train_tpr, 'b', label = 'AUC-Train = %0.2f' % train_auc)

    if len(cv) == 2:
        cv_features, cv_labels = cv
        cv_fpr, cv_tpr, cv_auc = _curve(cv_features, cv_labels)
        plt.plot(cv_fpr, cv_tpr, 'y', label = 'AUC-CV = %0.2f' % cv_auc)

    plt.legend(loc = 'lower right')
    # Dashed diagonal as the chance-level reference line.
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
def Confusion_mat(clf,X,Y,x,y,*cv):
    """Plot one confusion matrix per dataset: test, train and (optionally) CV.

    Parameters
    ----------
    clf : fitted classifier, handed to ``plot_confusion_matrix``.
    X, Y : features and labels for the matrix titled "Test Data".
    x, y : features and labels for the matrix titled "Train Data".
    *cv : optional. When exactly two extra positionals are passed they are the
        features and labels for the matrix titled "CV Data"; any other count is
        ignored.

    Each matrix is titled right after it is drawn; ``plt.show()`` is called
    once at the end. Nothing is returned.
    """
    panels = [
        (X, Y, 'Confusion Matrix on Test Data'),
        (x, y, 'Confusion Matrix on Train Data'),
    ]
    if len(cv) == 2:
        panels.append((cv[0], cv[1], 'Confusion Matrix on CV Data'))

    for features, labels, heading in panels:
        plot_confusion_matrix(clf, features, labels)
        plt.title(heading)
    plt.show()

In [None]:
# Hold-out tuning of the smoothing parameter: fit one MultinomialNB per
# candidate alpha on the sub-train split and score it by ROC-AUC on the CV split.
# NOTE(review): the grid steps by decades but has no 10**4 entry — confirm
# whether that gap is intended.
alpha = [10**exponent for exponent in (-5, -4, -3, -2, -1, 0, 1, 2, 3, 5, 6)]
auroc = []
for aph in alpha:
    clf = MultinomialNB(alpha=aph)
    clf.fit(ds_trainx_csr, ds_trainy)
    cv_scores = clf.predict_proba(d_cvx_csr)[:, 1]
    auroc.append(roc_auc_score(d_cvy, cv_scores))

# Refit with the alpha that scored highest on CV (the first one on ties).
best_position = auroc.index(max(auroc))
clf = MultinomialNB(alpha=alpha[best_position])
clf.fit(ds_trainx_csr, ds_trainy)
print('Best alpha:', clf.alpha)

# Argument order of both helpers: test data, then train data, then CV data.
Plot_roc(clf, d_testx_csr, d_testy, ds_trainx_csr, ds_trainy, d_cvx_csr, d_cvy)
Confusion_mat(clf, d_testx_csr, d_testy, ds_trainx_csr, ds_trainy, d_cvx_csr, d_cvy)

In [None]:
# K-fold alternative to the manual hold-out search: 2-fold cross-validated grid
# search over the same alpha grid on the whole training split, scored by ROC-AUC.
gs_clf = GridSearchCV(
    MultinomialNB(),
    {'alpha': alpha},
    scoring='roc_auc',
    cv=2,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
gs_clf.fit(d_trainx_csr, d_trainy)
print(gs_clf.best_estimator_)
print('=' * 150)
# The fitted search object is passed directly as the classifier; there is no
# separate CV set here, so only test and train curves/matrices are drawn.
Plot_roc(gs_clf, d_testx2_csr, d_testy, d_trainx_csr, d_trainy)
Confusion_mat(gs_clf, d_testx2_csr, d_testy, d_trainx_csr, d_trainy)

In [None]:
# Feature names in the same order the blocks were stacked: essay vocabulary,
# title vocabulary, then the one-hot/passthrough columns.
feat_set1 = [
    *vec_essay.get_feature_names(),
    *vec_title.get_feature_names(),
    *vec_ct.get_feature_names(),
]
feat_set2 = [
    *vec_essay2.get_feature_names(),
    *vec_title2.get_feature_names(),
    *vec_ct.get_feature_names(),
]

# Positions of the 20 features with the highest log-probability per class
# (row 1 and row 0 of feature_log_prob_), in descending order.
indices = clf.feature_log_prob_[1].argsort()[:-21:-1]
indices2 = clf.feature_log_prob_[0].argsort()[:-21:-1]
indices3 = gs_clf.best_estimator_.feature_log_prob_[1].argsort()[:-21:-1]
indices4 = gs_clf.best_estimator_.feature_log_prob_[0].argsort()[:-21:-1]

# Side-by-side table: hold-out model ("CV") vs. grid-search model ("K-CV").
names_holdout = np.array(feat_set1)
names_kfold = np.array(feat_set2)
pd.DataFrame({
    '(CV) positive': names_holdout[indices],
    '(K-CV) positive': names_kfold[indices3],
    '(CV) negative': names_holdout[indices2],
    '(K-CV) negative': names_kfold[indices4],
})

# TFIDF Naive-Bayes

In [None]:
# TF-IDF counterparts of the bag-of-words features. The vectorizer and matrix
# names from the BOW section are deliberately reused, so everything downstream
# now refers to the TF-IDF versions.

# Hold-out pipeline: fit on the sub-train split, apply to CV and test.
vec_essay = TfidfVectorizer(min_df=10)
ds_trainx_essay = vec_essay.fit_transform(ds_trainx['essay'])
d_cvx_essay = vec_essay.transform(d_cvx['essay'])
d_testx_essay = vec_essay.transform(d_testx['essay'])

vec_title = TfidfVectorizer(min_df=10)
ds_trainx_title = vec_title.fit_transform(ds_trainx['project_title'])
d_cvx_title = vec_title.transform(d_cvx['project_title'])
d_testx_title = vec_title.transform(d_testx['project_title'])

# K-fold pipeline: fit on the whole training split, apply to test.
vec_essay2 = TfidfVectorizer(min_df=10)
d_trainx_essay = vec_essay2.fit_transform(d_trainx['essay'])
d_testx_essay2 = vec_essay2.transform(d_testx['essay'])

vec_title2 = TfidfVectorizer(min_df=10)
d_trainx_title = vec_title2.fit_transform(d_trainx['project_title'])
d_testx_title2 = vec_title2.transform(d_testx['project_title'])
# Read the shape from the sparse matrices directly; .toarray() would build a
# full dense copy only to be discarded.
d_trainx_title.shape, d_testx_title2.shape

In [None]:
# Rebuild the design matrices from the current essay/title blocks plus the
# one-hot/passthrough block, converted to CSR.
# d_testx_csr uses the hold-out vocabularies; d_testx2_csr uses the
# vocabularies fitted on the whole training split.
ds_trainx_csr = hstack([ds_trainx_essay, ds_trainx_title, ds_trainx_vec]).tocsr()
d_testx_csr = hstack([d_testx_essay, d_testx_title, d_testx_vec]).tocsr()
d_cvx_csr = hstack([d_cvx_essay, d_cvx_title, d_cvx_vec]).tocsr()
d_trainx_csr = hstack([d_trainx_essay, d_trainx_title, d_trainx_vec]).tocsr()
d_testx2_csr = hstack([d_testx_essay2, d_testx_title2, d_testx_vec]).tocsr()
(
    ds_trainx_csr.shape,
    d_testx_csr.shape,
    d_cvx_csr.shape,
    d_trainx_csr.shape,
    d_testx2_csr.shape,
)

In [None]:
 with open('../datafiles/doners/donerschoose_pickle_tdifd.pickle','wb') as pik:
    pickle.dump(ds_trainx_csr,pik)
    pickle.dump(d_testx_csr,pik)
    pickle.dump(d_cvx_csr,pik)
    pickle.dump(d_trainx_csr,pik)
    pickle.dump(d_testx2_csr,pik)
    pik.close()

In [None]:
# Restore the five cached feature matrices, in the order they were dumped.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# wrote itself.
with open('../datafiles/doners/donerschoose_pickle_tdifd.pickle','rb') as pik:
    ds_trainx_csr = pickle.load(pik)
    d_testx_csr = pickle.load(pik)
    d_cvx_csr = pickle.load(pik)
    d_trainx_csr = pickle.load(pik)
    d_testx2_csr = pickle.load(pik)
    # No explicit close(): the with-block closes the file on exit.

In [None]:
# Hold-out tuning of the smoothing parameter: fit one MultinomialNB per
# candidate alpha on the sub-train split and score it by ROC-AUC on the CV split.
# NOTE(review): the grid steps by decades but has no 10**4 entry — confirm
# whether that gap is intended.
alpha = [10**exponent for exponent in (-5, -4, -3, -2, -1, 0, 1, 2, 3, 5, 6)]
auroc = []
for aph in alpha:
    clf = MultinomialNB(alpha=aph)
    clf.fit(ds_trainx_csr, ds_trainy)
    cv_scores = clf.predict_proba(d_cvx_csr)[:, 1]
    auroc.append(roc_auc_score(d_cvy, cv_scores))

# Refit with the alpha that scored highest on CV (the first one on ties).
best_position = auroc.index(max(auroc))
clf = MultinomialNB(alpha=alpha[best_position])
clf.fit(ds_trainx_csr, ds_trainy)
print('Best alpha:', clf.alpha)

# Argument order of both helpers: test data, then train data, then CV data.
Plot_roc(clf, d_testx_csr, d_testy, ds_trainx_csr, ds_trainy, d_cvx_csr, d_cvy)
Confusion_mat(clf, d_testx_csr, d_testy, ds_trainx_csr, ds_trainy, d_cvx_csr, d_cvy)

In [None]:
# K-fold alternative to the manual hold-out search: 2-fold cross-validated grid
# search over the same alpha grid on the whole training split, scored by ROC-AUC.
gs_clf = GridSearchCV(
    MultinomialNB(),
    {'alpha': alpha},
    scoring='roc_auc',
    cv=2,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
gs_clf.fit(d_trainx_csr, d_trainy)
print(gs_clf.best_estimator_)
print('=' * 150)
# The fitted search object is passed directly as the classifier; there is no
# separate CV set here, so only test and train curves/matrices are drawn.
Plot_roc(gs_clf, d_testx2_csr, d_testy, d_trainx_csr, d_trainy)
Confusion_mat(gs_clf, d_testx2_csr, d_testy, d_trainx_csr, d_trainy)

In [None]:
# Feature names in the same order the blocks were stacked: essay vocabulary,
# title vocabulary, then the one-hot/passthrough columns.
feat_set1 = [
    *vec_essay.get_feature_names(),
    *vec_title.get_feature_names(),
    *vec_ct.get_feature_names(),
]
feat_set2 = [
    *vec_essay2.get_feature_names(),
    *vec_title2.get_feature_names(),
    *vec_ct.get_feature_names(),
]

# Positions of the 20 features with the highest log-probability per class
# (row 1 and row 0 of feature_log_prob_), in descending order.
indices = clf.feature_log_prob_[1].argsort()[:-21:-1]
indices2 = clf.feature_log_prob_[0].argsort()[:-21:-1]
indices3 = gs_clf.best_estimator_.feature_log_prob_[1].argsort()[:-21:-1]
indices4 = gs_clf.best_estimator_.feature_log_prob_[0].argsort()[:-21:-1]

# Side-by-side table: hold-out model ("CV") vs. grid-search model ("K-CV").
names_holdout = np.array(feat_set1)
names_kfold = np.array(feat_set2)
pd.DataFrame({
    '(CV) positive': names_holdout[indices],
    '(K-CV) positive': names_kfold[indices3],
    '(CV) negative': names_holdout[indices2],
    '(K-CV) negative': names_kfold[indices4],
})

---

In [None]:
# Categorical columns to be response-coded:
# ['project_subject_subcategories','project_subject_categories','school_state','project_grade_category','teacher_prefix']

# Output frames that response_coding() fills column by column. The first
# assignment target had been lost (the line read " = pd.DataFrame()", a
# SyntaxError); response_coding() writes to res_d_train, so that is the name.
res_d_train = pd.DataFrame()
res_d_test = pd.DataFrame()
# Re-attach the labels to the features so rates can be grouped per category.
d_train = pd.concat([d_trainx,d_trainy],axis=1,ignore_index=False)
d_test = pd.concat([d_testx,d_testy],axis=1,ignore_index=False)

In [None]:
def response_coding(x,nm):
    """Response-code categorical column ``x`` into ``<nm>_1`` / ``<nm>_0`` columns.

    For every category of ``d_train[x]`` the mean of ``project_is_approved`` is
    computed on the training frame (the approval rate when the label is 0/1).
    Each train and test row then gets that rate as ``<nm>_1`` and its
    complement as ``<nm>_0``.

    Parameters
    ----------
    x : str
        Name of the categorical column present in both ``d_train`` and ``d_test``.
    nm : str
        Prefix for the two generated column names.

    Side effects
    ------------
    Adds/overwrites ``nm + '_1'`` and ``nm + '_0'`` in the module-level frames
    ``res_d_train`` and ``res_d_test``. Returns ``None``.

    Notes
    -----
    Test categories that do not occur in ``d_train`` get 0.5 for both columns.
    The rates are computed once with the pandas group mean and mapped onto both
    frames, so the train and test encodings cannot drift apart.
    """
    # One rate per category, learned from the training rows only.
    rate_by_category = d_train.groupby(x)['project_is_approved'].mean()

    res_d_train[nm+'_1'] = d_train[x].map(rate_by_category)
    res_d_train[nm+'_0'] = 1-res_d_train[nm+'_1']

    # Unseen categories have no rate; fall back to the neutral value 0.5.
    res_d_test[nm+'_1'] = d_test[x].map(rate_by_category).fillna(0.5)
    res_d_test[nm+'_0'] = 1-res_d_test[nm+'_1']


In [None]:
# Encode one column first; adds 'pss_1' / 'pss_0' to res_d_train and res_d_test.
response_coding('project_subject_subcategories','pss')

In [None]:
# Sanity check: each <name>_1/<name>_0 pair sums to 1 per row, so the grand
# total should equal the number of rows times the number of encoded features.
res_d_test,res_d_test.sum(axis=1).sum()

In [None]:
# Same sanity check for the training frame: each <name>_1/<name>_0 pair sums
# to 1 per row.
res_d_train,res_d_train.sum(axis=1).sum()

In [None]:
# 'project_subject_subcategories' ('pss') was already encoded in an earlier
# cell; encode the remaining categorical columns with their short prefixes.
for column, prefix in (
    ('teacher_prefix', 'tp'),
    ('project_grade_category', 'pgc'),
    ('school_state', 'ss'),
    ('project_subject_categories', 'psc'),
):
    response_coding(column, prefix)

In [None]:
# First rows and overall shape of the response-coded training frame.
res_d_train.head() , res_d_train.shape

In [None]:
# First rows and overall shape of the response-coded test frame.
res_d_test.head(), res_d_test.shape

In [None]:
# Response-coded variant of the design matrices: the essay and title blocks are
# kept, and the response-coded frames replace the one-hot block.
# Only the whole-training-split / test pair is rebuilt here; the hold-out
# (ds_* / d_cvx) matrices are left as they were.
d_trainx_csr = hstack([d_trainx_essay, d_trainx_title, res_d_train]).tocsr()
d_testx2_csr = hstack([d_testx_essay2, d_testx_title2, res_d_test]).tocsr()
(d_trainx_csr.shape, d_testx2_csr.shape)

In [None]:
# Cache the response-coded design matrices. Unlike the earlier feature pickles,
# this stream holds only two objects: the training matrix, then the test matrix.
with open('../datafiles/doners/donerschoose_pickle_tdifd_res.pickle','wb') as pik:
    pickle.dump(d_trainx_csr,pik)
    pickle.dump(d_testx2_csr,pik)
    # No explicit close(): the with-block closes the file on exit.