In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import math as m
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from tqdm import tqdm

In [None]:
# Read the preprocessed DonorsChoose table from the working directory.
dataset_path = 'Preprocessed_DonorsChoose_dataset.csv'
data = pd.read_csv(dataset_path)

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
data.info()

In [None]:
# Remove the 'id' column from the frame.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore' is set,
# so this cell is idempotent: running it a second time (when 'id' is already gone)
# no longer raises a KeyError.
data = data.drop(columns='id', errors='ignore')

In [None]:
# Show a single row to check the columns that remain.
data.head(n=1)

In [None]:
# Separate the label column from the feature columns.
target_column = 'project_is_approved'
y = data[target_column]
X = data.drop(columns=target_column)

In [None]:
# Count how many rows fall into each label value (class balance check).
y.value_counts()

In [None]:
# Hold out 20% of the rows for testing; stratify on the label so both splits keep
# the same class proportions, and fix the seed so the split is reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Bag-of-words over the cleaned summaries: 1- to 4-grams seen in at least 5 documents,
# capped at 5000 features. The vocabulary is learned from the training split only.
train_summaries = X_train['cleaned_summary']
test_summaries = X_test['cleaned_summary']

bow = CountVectorizer(ngram_range=(1, 4), min_df=5, max_features=5000)
bow.fit(train_summaries)

X_train_cleaned_sum_bow = bow.transform(train_summaries)
X_test_cleaned_sum_bow = bow.transform(test_summaries)

print("After vectorizations")
for encoded, target in ((X_train_cleaned_sum_bow, y_train),
                        (X_test_cleaned_sum_bow, y_test)):
    print(encoded.shape, target.shape)

In [None]:
# TF-IDF over the cleaned essays with the same n-gram / frequency limits as the BoW
# encoders; fitted on the training split and then applied to the test split.
tfidf = TfidfVectorizer(
    ngram_range=(1, 4),
    min_df=5,
    max_features=5000,
)

train_essays = X_train['cleaned_essays']
test_essays = X_test['cleaned_essays']
X_train_cleaned_essays_tfidf = tfidf.fit_transform(train_essays)
X_test_cleaned_essays_tfidf = tfidf.transform(test_essays)

print("After vectorizations")
for encoded, target in ((X_train_cleaned_essays_tfidf, y_train),
                        (X_test_cleaned_essays_tfidf, y_test)):
    print(encoded.shape, target.shape)

In [None]:
# Bag-of-words counts over the cleaned essays (1- to 4-grams, min_df=5, top 5000 features).
essay_bow = CountVectorizer(
    ngram_range=(1, 4),
    min_df=5,
    max_features=5000,
)

# Learn the vocabulary on train, reuse it unchanged on test.
X_train_essay_bow = essay_bow.fit_transform(X_train['cleaned_essays'])
X_test_essay_bow = essay_bow.transform(X_test['cleaned_essays'])

print("After vectorizations")
for encoded, target in ((X_train_essay_bow, y_train), (X_test_essay_bow, y_test)):
    print(encoded.shape, target.shape)

Encoding categorical features: School State

In [None]:
# Encode school_state as token counts; the vocabulary comes from the training split only.
state_train, state_test = X_train['school_state'], X_test['school_state']

school_state_bow = CountVectorizer()
X_train_school_state_bow = school_state_bow.fit_transform(state_train)
X_test_school_state_bow = school_state_bow.transform(state_test)

print("After vectorizations")
for encoded, target in ((X_train_school_state_bow, y_train),
                        (X_test_school_state_bow, y_test)):
    print(encoded.shape, target.shape)

In [None]:
# Encode teacher_prefix as token counts; fit on train, reuse the vocabulary on test.
prefix_train, prefix_test = X_train['teacher_prefix'], X_test['teacher_prefix']

teacher_prefix_bow = CountVectorizer()
X_train_teacher_prefix_bow = teacher_prefix_bow.fit_transform(prefix_train)
X_test_teacher_prefix_bow = teacher_prefix_bow.transform(prefix_test)

print("After vectorizations")
for encoded, target in ((X_train_teacher_prefix_bow, y_train),
                        (X_test_teacher_prefix_bow, y_test)):
    print(encoded.shape, target.shape)

In [None]:
# Encode project_grade_category as token counts; fit on train, reuse the vocabulary on test.
grade_train = X_train['project_grade_category']
grade_test = X_test['project_grade_category']

project_grade_category_bow = CountVectorizer()
X_train_project_grade_category_bow = project_grade_category_bow.fit_transform(grade_train)
X_test_project_grade_category_bow = project_grade_category_bow.transform(grade_test)

print("After vectorizations")
for encoded, target in ((X_train_project_grade_category_bow, y_train),
                        (X_test_project_grade_category_bow, y_test)):
    print(encoded.shape, target.shape)

In [None]:
# Encode project_subject_categories as token counts; fit on train, reuse the vocabulary on test.
categories_train = X_train['project_subject_categories']
categories_test = X_test['project_subject_categories']

project_subject_categories_bow = CountVectorizer()
X_train_project_subject_categories_bow = project_subject_categories_bow.fit_transform(categories_train)
X_test_project_subject_categories_bow = project_subject_categories_bow.transform(categories_test)

print("After vectorizations")
for encoded, target in ((X_train_project_subject_categories_bow, y_train),
                        (X_test_project_subject_categories_bow, y_test)):
    print(encoded.shape, target.shape)

In [None]:
# Encode project_subject_subcategories as token counts; fit on train, reuse the vocabulary on test.
subcategories_train = X_train['project_subject_subcategories']
subcategories_test = X_test['project_subject_subcategories']

project_subject_subcategories_bow = CountVectorizer()
X_train_project_subject_subcategories_bow = project_subject_subcategories_bow.fit_transform(subcategories_train)
X_test_project_subject_subcategories_bow = project_subject_subcategories_bow.transform(subcategories_test)

print("After vectorizations")
for encoded, target in ((X_train_project_subject_subcategories_bow, y_train),
                        (X_test_project_subject_subcategories_bow, y_test)):
    print(encoded.shape, target.shape)

In [None]:
# Show one row of the full frame as a reminder of the available columns.
data.head(n=1)

Encoding numerical features

In [None]:
# Scale every numerical column to the [0, 1] range using statistics learned on the
# training split only.
#
# Why not sklearn's `Normalizer`: it rescales each *row* (sample) to unit norm, so the four
# unrelated features of a project get mixed together and the largest one dominates the row;
# its `fit` also learns nothing from the training data. Per-column min-max scaling keeps
# each feature on its own scale and keeps the values non-negative, which the
# MultinomialNB models fitted on these features require.
numerical_columns = ['teacher_number_of_previously_posted_projects',
                     'price', 'quantity',
                     'isdigit_summary']

# The name `normalizer` is kept so any other cell that refers to it keeps working.
normalizer = MinMaxScaler()

X_train_numerical = normalizer.fit_transform(X_train[numerical_columns])
# Test rows outside the training min/max would land outside [0, 1]; clip them so no
# negative value reaches the downstream models.
X_test_numerical = np.clip(normalizer.transform(X_test[numerical_columns]), 0, 1)

X_train_numerical.shape, X_test_numerical.shape

In [None]:
# Force both numerical blocks into an explicit (n_rows, 4) layout, one column per
# numerical feature, before they are stacked with the sparse blocks.
X_train_numerical = np.reshape(X_train_numerical, (-1, 4))
X_test_numerical = np.reshape(X_test_numerical, (-1, 4))

(X_train_numerical.shape, X_test_numerical.shape)

Concatenating all features

Set-1

Categorical features + numerical features + essay BoW (preprocessed_essay_bow)

In [None]:
from scipy.sparse import hstack

# Set 1 feature matrix. Column order: the five categorical count blocks, the numerical
# block, then the essay bag-of-words block. Train and test use the same order.
train_blocks_set_one = (
    X_train_school_state_bow,
    X_train_teacher_prefix_bow,
    X_train_project_grade_category_bow,
    X_train_project_subject_categories_bow,
    X_train_project_subject_subcategories_bow,
    X_train_numerical,
    X_train_essay_bow,
)
test_blocks_set_one = (
    X_test_school_state_bow,
    X_test_teacher_prefix_bow,
    X_test_project_grade_category_bow,
    X_test_project_subject_categories_bow,
    X_test_project_subject_subcategories_bow,
    X_test_numerical,
    X_test_essay_bow,
)

# Stack side by side and convert to CSR.
X_train_set_one = hstack(train_blocks_set_one).tocsr()
X_test_set_one = hstack(test_blocks_set_one).tocsr()

In [None]:
# Sanity check: both splits must end up with the same number of columns.
(X_train_set_one.shape, X_test_set_one.shape)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Tune the smoothing parameter alpha of a Multinomial Naive Bayes model on Set 1
# with 5-fold cross-validation, scored by ROC AUC, using all available cores.
# NOTE(review): the grid jumps from 1e-3 straight to 0.1 (no 1e-2) — confirm this is intended.
param = {'alpha': [1e-5, 1e-4, 1e-3, 0.1, 1, 10, 100, 1000]}

# Equal class priors are passed explicitly instead of being estimated from the data.
model = MultinomialNB(class_prior=[0.5, 0.5])

grid_one = GridSearchCV(
    estimator=model,
    param_grid=param,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_one.fit(X_train_set_one, y_train)

print("Best parameters: ", grid_one.best_params_)
print("Best score ", grid_one.best_score_)

Set-2

Categorical features + numerical features + essay TF-IDF

In [None]:
from scipy.sparse import hstack

# Set 2 feature matrix. Same layout as Set 1, except that the essay block is the
# TF-IDF encoding instead of the bag-of-words counts.
train_blocks_set_two = (
    X_train_school_state_bow,
    X_train_teacher_prefix_bow,
    X_train_project_grade_category_bow,
    X_train_project_subject_categories_bow,
    X_train_project_subject_subcategories_bow,
    X_train_numerical,
    X_train_cleaned_essays_tfidf,
)
test_blocks_set_two = (
    X_test_school_state_bow,
    X_test_teacher_prefix_bow,
    X_test_project_grade_category_bow,
    X_test_project_subject_categories_bow,
    X_test_project_subject_subcategories_bow,
    X_test_numerical,
    X_test_cleaned_essays_tfidf,
)

# Stack side by side and convert to CSR.
X_train_set_two = hstack(train_blocks_set_two).tocsr()
X_test_set_two = hstack(test_blocks_set_two).tocsr()

In [None]:
# Tune the smoothing parameter alpha of a Multinomial Naive Bayes model on Set 2
# with 5-fold cross-validation, scored by ROC AUC, using all available cores.
# NOTE(review): the grid jumps from 1e-3 straight to 0.1 (no 1e-2) — confirm this is intended.
param = {'alpha': [1e-5, 1e-4, 1e-3, 0.1, 1, 10, 100, 1000]}

# Equal class priors are passed explicitly instead of being estimated from the data.
model = MultinomialNB(class_prior=[0.5, 0.5])

grid_two = GridSearchCV(
    estimator=model,
    param_grid=param,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_two.fit(X_train_set_two, y_train)

print("Best parameters: ", grid_two.best_params_)
print("Best score ", grid_two.best_score_)

In [None]:
# Confusion matrix on the training split at the ROC-derived decision threshold.
#
# Everything this cell needs is computed here, so it no longer depends on helper
# functions or variables that are not defined in the cells above (on a fresh
# "Restart & Run All" those names would raise a NameError).
# NOTE(review): assumes the matrix is meant for the Set-2 model tuned just above and
# that labels are encoded as 0/1 — confirm.
y_train_probs = grid_two.predict_proba(X_train_set_two)[:, 1]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_probs)

# Choose the threshold that maximizes TPR * (1 - FPR), i.e. high recall with few
# false positives.
# NOTE(review): presumed to match the original find_best_threshold helper — confirm.
best_t = tr_thresholds[np.argmax(train_tpr * (1 - train_fpr))]
y_train_pred = (y_train_probs >= best_t).astype(int)

cm = metrics.confusion_matrix(y_train, y_train_pred)
# https://stackoverflow.com/questions/35572000/how-can-i-plot-a-confusion-matrix

print("CONFUSION MATRIX OF TRAIN DATA")
print("\n")
print(cm)

# Label the axes so the figure can be read on its own.
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='GnBu')
ax.set(xlabel='Predicted label', ylabel='True label',
       title='Confusion matrix (train)')
plt.show()