# Vectorization on DonorsChoose dataset

In [3]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle
import os

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from tqdm import tqdm

In [31]:
# Read the preprocessed CSV into a DataFrame, report its shape, and
# leave a preview of the first rows as the cell's displayed output.
dataset = pd.read_csv('preprocessed_data.csv')
print('shape->', dataset.shape)
dataset.head()

shape-> (109248, 9)


Unnamed: 0,school_state,teacher_prefix,project_grade_category,teacher_number_of_previously_posted_projects,project_is_approved,clean_categories,clean_subcategories,essay,price
0,ca,mrs,grades_prek_2,53,1,math_science,appliedsciences health_lifescience,i fortunate enough use fairy tale stem kits cl...,725.05
1,ut,ms,grades_3_5,4,1,specialneeds,specialneeds,imagine 8 9 years old you third grade classroo...,213.03
2,ca,mrs,grades_prek_2,10,1,literacy_language,literacy,having class 24 students comes diverse learner...,329.0
3,ga,mrs,grades_prek_2,2,1,appliedlearning,earlydevelopment,i recently read article giving students choice...,481.04
4,wa,mrs,grades_3_5,2,1,literacy_language,literacy,my students crave challenge eat obstacles brea...,17.74


# Vectorizing Text data

## Bag of words

In [33]:
# Bag-of-words counts over the essay text. min_df=10 drops any term that
# appears in fewer than 10 essays, which keeps the vocabulary manageable.
preprocessed_essays = dataset['essay'].values

countvect = CountVectorizer(min_df=10)
text_bow = countvect.fit_transform(preprocessed_essays)

print('Shape of matrix after BOW vectorization ->', text_bow.shape)

Shape of matrix after BOW vectorization -> (109248, 16623)


## TFIDF Vectorizer

In [34]:
# TF-IDF representation of the same essays, using the same min_df=10
# document-frequency cut-off as the bag-of-words cell.
vect = TfidfVectorizer(min_df=10)
text_tfidf = vect.fit_transform(preprocessed_essays)

print('shape of matrix after TFIDF vectoriztion ->', text_tfidf.shape)

shape of matrix after TFIDF vectoriztion -> (109248, 16623)


## Average W2V

#### `glove_vectors` is a pickled lookup holding a vector for every unique word found in the corpus's project titles and project essays

In [35]:
# Load the pickled word -> vector lookup.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this against a 'glove_vectors' file from a trusted source.
with open('glove_vectors', 'rb') as f:
    model = pickle.load(f)

# Vocabulary of words that have a vector, as a set for fast membership tests.
glove_words = set(model.keys())

In [45]:
# Average word vector per essay: sum the vectors of every token that has
# an entry in `model`, then divide by the number of such tokens.
# Essays with no known token keep the all-zero vector.
average_w2v = []
for essay in preprocessed_essays:
    essay_vector = np.zeros(300)  # one 300-dimensional vector per essay
    # Counter deliberately NOT named `countvect`: that name holds the fitted
    # CountVectorizer from the bag-of-words cell and must not be clobbered.
    known_word_count = 0
    for word in essay.split():
        if word in glove_words:
            essay_vector += model[word]
            known_word_count += 1
    if known_word_count != 0:
        essay_vector /= known_word_count
    average_w2v.append(essay_vector)

print(len(average_w2v))
print(len(average_w2v[0]))

109248
300


## TFIDF weighted W2V

In [46]:
# Fit a TF-IDF model on the essays purely to obtain per-term IDF values.
tfidf_model = TfidfVectorizer()
tfidf_model.fit(preprocessed_essays)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# The names are fetched once and reused for both lookups below.
if hasattr(tfidf_model, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_model.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_model.get_feature_names()

# term -> IDF weight (idf_ is aligned with the feature-name order).
dictionary = dict(zip(tfidf_feature_names, list(tfidf_model.idf_)))
# Set of terms known to the TF-IDF model, for fast membership tests.
tfidf_words = set(tfidf_feature_names)

In [47]:
# TF-IDF weighted average word vector per essay.
# For every token that has both a word vector and an IDF value, add
# vector * tfidf to the running sum, then normalise by the summed tfidf
# weights. Essays with no usable token keep the all-zero vector.
tfidf_w2v_vectors = []
for sentence in tqdm(preprocessed_essays):
    vector = np.zeros(300)  # one 300-dimensional vector per essay
    tf_idf_weight = 0  # running sum of the tf-idf weights used in this essay

    # Tokenise once per essay instead of once per word.
    tokens = sentence.split()
    n_tokens = len(tokens)
    # Count whole tokens. str.count(word) on the raw essay counts substring
    # matches (e.g. "art" inside "start"), which inflates the term frequency.
    token_counts = Counter(tokens)

    for word in tokens:
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]  # word vector for this token
            # tf = occurrences of the token / tokens in the essay; idf from the fitted model
            tf_idf = dictionary[word] * (token_counts[word] / n_tokens)
            vector += (vec * tf_idf)
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_vectors.append(vector)

print(len(tfidf_w2v_vectors))
print(len(tfidf_w2v_vectors[0]))

100%|█████████████████████████████████████████████████████████████████████████| 109248/109248 [08:01<00:00, 226.97it/s]

109248
300





## Vectorizing categorical features

In [49]:
# Encode school_state as indicator columns: binary=True records only
# presence/absence of each token rather than its count.
countvect = CountVectorizer(binary=True)
state_values_ohe = countvect.fit_transform(
    dataset['school_state'].values
)
print('shape of school state categorical feature->', state_values_ohe.shape)

shape of school state categorical feature-> (109248, 51)


In [50]:
# Encode teacher_prefix as indicator columns (binary=True -> presence/absence).
countvect = CountVectorizer(binary=True)
teacher_prefix_ohe = countvect.fit_transform(
    dataset['teacher_prefix'].values
)
print('shape of teacher prefix categorical feature->', teacher_prefix_ohe.shape)

shape of teacher prefix categorical feature-> (109248, 5)


In [53]:
# Encode project_grade_category as indicator columns (binary=True -> presence/absence).
countvect = CountVectorizer(binary=True)
grade_ohe = countvect.fit_transform(
    dataset['project_grade_category'].values
)
print('shape of project_grade_category categorical feature->', grade_ohe.shape)

shape of project_grade_category categorical feature-> (109248, 4)


In [54]:
# Encode clean_categories as indicator columns; a row may contain several
# whitespace-separated tokens, each of which gets its own column.
countvect = CountVectorizer(binary=True)
clean_categories_ohe = countvect.fit_transform(
    dataset['clean_categories'].values
)
print('shape of clean categorical feature->', clean_categories_ohe.shape)

shape of clean categorical feature-> (109248, 9)


In [55]:
# Encode clean_subcategories as indicator columns; a row may contain several
# whitespace-separated tokens, each of which gets its own column.
countvect = CountVectorizer(binary=True)
clean_subcategories_ohe = countvect.fit_transform(
    dataset['clean_subcategories'].values
)
print('shape of clean subcategorical feature->', clean_subcategories_ohe.shape)

shape of clean subcategorical feature-> (109248, 30)
