# Data Preprocessing

The goal of this notebook is to preprocess the data for the DonorsChoose.org Kaggle challenge. The data exploration (EDA.ipynb) showed which columns need to be preprocessed and how they should be handled so that the data can be used as input to a machine learning model.

I will preprocess the data one dataset at a time and, at the end, merge all the resulting datasets.

In [53]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from wordcloud import WordCloud
from collections import Counter
import operator
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import time
from sklearn.preprocessing import LabelEncoder
from geopy.geocoders import Nominatim
from collections import Counter

from keras.models import Model, load_model
from keras import regularizers, losses

In [54]:
# Make the parent directory importable so that `src.aux_functions` (a package
# in a directory parallel to this notebook) can be imported below.
import os,sys,inspect
# Use the kernel's working directory rather than
# inspect.getfile(inspect.currentframe()): inside a notebook cell the frame's
# "file" is a synthetic name generated by the kernel (it may even point into a
# temporary directory), so it does not reliably resolve to the notebook's folder.
# Assumes the kernel was started in the notebook's own directory (the Jupyter
# default) - TODO confirm if the notebook is launched some other way.
current_dir = os.path.abspath(os.getcwd())
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

from src.aux_functions import *

## Outcomes.csv

From this dataset we will only use the column is_exciting. Since this dataset only contains information about the training set, none of its other columns can be used as inputs to the model.

Based on the findings in EDA.ipynb, I will only use the data from 01-2010 to 11-2013.

In [3]:
# Load dataset outcomes 
outcomes = pd.read_csv('outcomes.csv')

# Load dataset project to extract date when the project was posted
projects = pd.read_csv('projects.csv')

In [4]:
# Keep only the projects posted after 2009 (i.e. from 2010 onwards).
# The posting date comes from `projects` and the label from `outcomes`,
# so the two are joined on projectid first.
merged = projects[['projectid', 'date_posted']].merge(
    outcomes[['projectid', 'is_exciting']], on='projectid')
merged['year'] = pd.to_datetime(merged['date_posted']).dt.year
after2010 = merged.loc[merged['year'] > 2009]


In [5]:
after2010.head(n=1)

Unnamed: 0,projectid,date_posted,is_exciting,year
0,62526d85d2a1818432d03d600969e99c,2013-12-31,f,2013


In [6]:
train_y = after2010[['projectid', 'is_exciting', 'year']]

In [7]:
train_y = train_y.replace({'is_exciting':{'t':1, 'f':0}})


For now I will keep the year, because if I intend to hold out part of the data for validation, the validation year needs to be later than the years used for training.

In [8]:
# Save the data in file train_y.csv
# NOTE: to_pickle writes a pickle, not CSV text, so despite the .csv suffix this
# file has to be loaded with pd.read_pickle rather than pd.read_csv.
train_y.to_pickle('train_y.csv')

## Projects.csv



In [9]:
# load dataset
projects = pd.read_csv('projects.csv')
projects.head(n=1)


Unnamed: 0,projectid,teacher_acctid,schoolid,school_ncesid,school_latitude,school_longitude,school_city,school_state,school_zip,school_metro,...,resource_type,poverty_level,grade_level,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,eligible_almost_home_match,date_posted
0,316ed8fb3b81402ff6ac8f721bb31192,42d43fa6f37314365d08692e08680973,c0e6ce89b244764085691a1b8e28cb81,63627010000.0,36.57634,-119.608713,Selma,CA,93662.0,,...,Books,highest poverty,Grades 6-8,30.0,555.81,653.89,32.0,f,f,2014-05-12


In [10]:
projects.shape

(664098, 35)

In [11]:
# Keep only the projects posted after 2009 (i.e. from 2010 onwards).

projects['year'] = pd.DatetimeIndex(projects['date_posted']).year
after2010 = projects[projects['year'] > 2009]

# `after2010` already holds exactly the rows we want, so take it directly
# instead of joining `projects` back against its own ids (for unique projectids
# the inner join returned these same rows, at the cost of an extra merge).
# reset_index reproduces the fresh 0..n-1 index that the merge used to create,
# and the helper `year` column is dropped as before.
projects = after2010.drop(columns='year').reset_index(drop=True)

In [12]:
projects.shape

(484371, 35)

In [13]:
# columns with nan values, after excluding data before 2010
projects.columns[projects.isna().sum() > 0]

Index(['school_ncesid', 'school_zip', 'school_metro', 'school_district',
       'teacher_prefix', 'primary_focus_subject', 'primary_focus_area',
       'secondary_focus_subject', 'secondary_focus_area', 'resource_type',
       'grade_level', 'students_reached'],
      dtype='object')

#### Functions to clean Projects Dataset

In [14]:
def get_zipcode(projects):
    """Fill missing ``school_zip`` values by reverse-geocoding the school coordinates.

    For every row whose ``school_zip`` is null, the row's ``school_latitude``
    and ``school_longitude`` are sent to the Nominatim reverse geocoder and the
    ``postcode`` of the returned address is written to ``school_zip``.

    Parameters
    ----------
    projects : pandas.DataFrame
        Must contain ``school_zip``, ``school_latitude`` and
        ``school_longitude``. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``projects`` object that was passed in.

    Notes
    -----
    A row is left as NaN when the geocoder returns no result, when the result
    has no postcode, or when the part of the postcode before the first ``-``
    is not a number. One network request is made per missing row.
    """
    missing_idx = projects.index[projects['school_zip'].isnull()]

    # Only build the geocoder client when there is something to look up.
    locator = Nominatim(user_agent='myGeocoder') if len(missing_idx) else None

    for i in missing_idx:
        # Use school_latitude and school_longitude columns
        lat = projects.at[i, 'school_latitude']
        lon = projects.at[i, 'school_longitude']
        location = locator.reverse(f'{lat}, {lon}')

        # The geocoder may return nothing, or an address without a postcode.
        address = location.raw.get('address', {}) if location is not None else {}
        postcode = address.get('postcode')
        if postcode is None:
            continue

        # Keep only the leading part of ZIP+4 style codes ("93662-1234").
        try:
            zip_number = float(str(postcode).split('-')[0])
        except ValueError:
            continue

        # .at writes into the frame itself; chained indexing
        # (projects['school_zip'][i] = ...) may write to a temporary copy.
        projects.at[i, 'school_zip'] = zip_number

    print('Get_zipcode done')
    return projects


def get_metro(projects):
    """Fill missing ``school_metro`` values from other schools in the same city.

    A row whose ``school_metro`` is null receives the most frequent
    ``school_metro`` value among the rows that share its ``school_city``.
    When no row of that city has a known value (or the city itself is
    missing), the value ``'other'`` is used.

    Parameters
    ----------
    projects : pandas.DataFrame
        Must contain ``school_city`` and ``school_metro``. The frame is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``projects`` object that was passed in.
    """
    missing = projects['school_metro'].isnull()

    if missing.any():
        # Most frequent known metro type per city, computed once for all rows
        # instead of re-filtering the whole frame for every missing row.
        known = projects.loc[~missing, ['school_city', 'school_metro']]
        city_mode = {
            city: metros.value_counts().index[0]
            for city, metros in known.groupby('school_city')['school_metro']
        }

        # Cities without any known value map to NaN and fall back to 'other'.
        fill = projects.loc[missing, 'school_city'].map(city_mode).fillna('other')

        # .loc writes into the frame itself (chained indexing may not);
        # to_numpy() assigns positionally, so index labels do not matter.
        projects.loc[missing, 'school_metro'] = fill.to_numpy()

    print('Get_metro done')
    return projects


def projReplaceNan(projects):
    """Replace the missing values of the categorical project columns.

    * ``secondary_focus_subject`` / ``secondary_focus_area`` first fall back to
      the row's ``primary_focus_subject`` / ``primary_focus_area``.
    * ``teacher_prefix`` and the four focus columns are then filled with
      ``'Other'``.
    * ``resource_type`` and ``grade_level`` are filled with the string ``'NaN'``.

    Parameters
    ----------
    projects : pandas.DataFrame
        Must contain the seven columns listed above. The frame is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``projects`` object that was passed in.
    """
    # Fill NaN of the secondary focus columns with the matching primary column.
    # Assign the result back to the frame: calling fillna(..., inplace=True) on
    # projects[col] acts on a temporary object and is not guaranteed to reach
    # the frame (it does not under pandas copy-on-write).
    projects['secondary_focus_subject'] = projects['secondary_focus_subject'].fillna(
        projects['primary_focus_subject'])
    projects['secondary_focus_area'] = projects['secondary_focus_area'].fillna(
        projects['primary_focus_area'])

    # Placeholder used for whatever is still missing in each column.
    placeholders = {
        'teacher_prefix': 'Other',
        'secondary_focus_subject': 'Other',
        'secondary_focus_area': 'Other',
        'primary_focus_subject': 'Other',
        'primary_focus_area': 'Other',
        'resource_type': 'NaN',
        'grade_level': 'NaN',
    }
    for column, placeholder in placeholders.items():
        projects[column] = projects[column].fillna(placeholder)

    return projects


def cleanProjectsDataset(projects):
    """Run the complete cleaning pipeline on the projects table.

    Steps, in order:

    1. fill missing ``school_zip`` values (``get_zipcode``);
    2. fill missing ``school_metro`` values (``get_metro``);
    3. recode the 't'/'f' flag columns as 1/0;
    4. replace the remaining NaNs of the categorical columns (``projReplaceNan``);
    5. split ``date_posted`` into ``year_posted``, ``month_posted`` and
       ``day_posted``;
    6. remove ``date_posted``, ``school_district`` and ``school_county``.

    Parameters
    ----------
    projects : pandas.DataFrame
        The projects table.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    # Steps 1 and 2: location related gaps.
    projects = get_zipcode(projects)
    projects = get_metro(projects)

    # Step 3: every flag column uses the same 't' -> 1, 'f' -> 0 recoding.
    flag_columns = (
        'school_charter',
        'school_magnet',
        'school_year_round',
        'school_nlns',
        'school_kipp',
        'school_charter_ready_promise',
        'teacher_teach_for_america',
        'teacher_ny_teaching_fellow',
        'eligible_double_your_impact_match',
        'eligible_almost_home_match',
    )
    flag_codes = {column: {'t': 1, 'f': 0} for column in flag_columns}
    projects = projects.replace(flag_codes)

    # Step 4: categorical gaps.
    projects = projReplaceNan(projects)

    # Step 5: parse the posting date once and expose its components.
    posted = pd.DatetimeIndex(projects['date_posted'])
    projects['year_posted'] = posted.year
    projects['month_posted'] = posted.month
    projects['day_posted'] = posted.day

    # Step 6: remove the raw date and the two school columns that are not kept.
    for column in ('date_posted', 'school_district', 'school_county'):
        del projects[column]

    return projects


In [15]:
newProjects = cleanProjectsDataset(projects)

In [16]:
newProjects.dtypes

projectid                                  object
teacher_acctid                             object
schoolid                                   object
school_ncesid                             float64
school_latitude                           float64
school_longitude                          float64
school_city                                object
school_state                               object
school_zip                                float64
school_metro                               object
school_charter                              int64
school_magnet                               int64
school_year_round                           int64
school_nlns                                 int64
school_kipp                                 int64
school_charter_ready_promise                int64
teacher_prefix                             object
teacher_teach_for_america                   int64
teacher_ny_teaching_fellow                  int64
primary_focus_subject                      object


In [17]:
newProjects.head()

Unnamed: 0,projectid,teacher_acctid,schoolid,school_ncesid,school_latitude,school_longitude,school_city,school_state,school_zip,school_metro,...,grade_level,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,eligible_almost_home_match,year_posted,month_posted,day_posted
0,316ed8fb3b81402ff6ac8f721bb31192,42d43fa6f37314365d08692e08680973,c0e6ce89b244764085691a1b8e28cb81,63627010000.0,36.57634,-119.608713,Selma,CA,93662.0,rural,...,Grades 6-8,30.0,555.81,653.89,32.0,0,0,2014,5,12
1,90de744e368a7e4883223ca49318ae30,864eb466462bf704bf7a16a585ef296a,d711e47810900c96f26a5d0be30c446d,483702000000.0,32.911179,-96.72364,Dallas,TX,75243.0,urban,...,Grades PreK-2,30.0,296.47,348.79,22.0,0,0,2014,5,12
2,32943bb1063267de6ed19fc0ceb4b9a7,37f85135259ece793213aca9d8765542,665c3613013ba0a66e3a2a26b89f1b68,410327000000.0,45.166039,-122.414576,Colton,OR,97017.0,rural,...,Grades PreK-2,30.0,430.89,506.93,17.0,0,0,2014,5,11
3,bb18f409abda2f264d5acda8cab577a9,2133fc46f951f1e7d60645b0f9e48a6c,4f12c3fa0c1cce823c7ba1df57e90ccb,360015300000.0,40.641727,-73.965655,Brooklyn,NY,11226.0,urban,...,Grades 3-5,30.0,576.07,677.73,12.0,0,0,2014,5,11
4,24761b686e18e5eace634607acbcc19f,867ff478a63f5457eaf41049536c47cd,10179fd362d7b8cf0e89baa1ca3025bb,62271000000.0,34.043939,-118.288371,Los Angeles,CA,90006.0,urban,...,Grades PreK-2,30.0,408.4,480.47,24.0,0,0,2014,5,11


In [18]:
# Save dataset
newProjects.to_pickle('clean_projects.csv')

## Resources.csv

In [19]:
# load dataset
resources = pd.read_csv('resources.csv')
resources.head(n=1)


Unnamed: 0,resourceid,projectid,vendorid,vendor_name,project_resource_type,item_name,item_number,item_unit_price,item_quantity
0,8a1c1c45bc30d065061912fd9114fcf3,ffffc4f85b60efc5b52347df489d0238,430.0,Woodwind and Brasswind,Technology,iPod nano 4th Gen 8GB (Black),249995.001,149.0,4.0


In [20]:
resources.shape

(3667217, 9)

In [22]:
# Restrict the resources to projects posted after 2009 (2010 onwards).
# projects.csv is read again because it is the table that carries date_posted.
projects = pd.read_csv('projects.csv')
projects['year'] = pd.to_datetime(projects['date_posted']).dt.year
after2010 = projects.loc[projects['year'] > 2009]

# The inner join on projectid keeps only the resource rows of the kept projects.
resources = resources.merge(after2010['projectid'], on='projectid')


In [23]:
resources.shape

(2749806, 9)

In [24]:
def cleanResourcesDataset(resources, encoders_test=None):
    """Aggregate the per-item resources table to one row per project.

    Parameters
    ----------
    resources : pandas.DataFrame
        Must contain ``projectid``, ``vendor_name``, ``item_unit_price`` and
        ``item_quantity``. The frame is not modified.
    encoders_test : optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        One row per ``projectid`` with the columns

        * ``projectid``
        * ``item_quantity`` - sum of the item quantities of the project
        * ``item_total_quantity`` - sum of ``item_unit_price * item_quantity``
        * ``vendor_name`` - the vendor with the most rows for the project
          (missing vendors count as the empty string; ties go to the name that
          sorts first).
    """
    # Work on a copy of just the needed columns so the caller's frame is left
    # untouched and the aggregation never sees id or free-text columns.
    work = resources[['projectid', 'vendor_name', 'item_unit_price', 'item_quantity']].copy()

    # Get the amount spent by each project
    # multiply column item_unit_price by item_quantity -> item_total_quantity
    work['item_total_quantity'] = work['item_unit_price'] * work['item_quantity']

    # Sum only the two numeric columns; summing the whole frame would also try
    # to "add up" the string columns.
    totals = work.groupby('projectid', as_index=False)[
        ['item_quantity', 'item_total_quantity']].sum()

    # Get main vendor for each projectid
    # Count num of vendor occurrences for each projectid, get the max count
    work['vendor_name'] = work['vendor_name'].fillna('')
    vendor_counts = work.groupby(['projectid', 'vendor_name']).size()
    # idxmax yields one (projectid, vendor_name) pair per project.
    top_pairs = vendor_counts.groupby(level=0).idxmax()
    majorVendor = pd.DataFrame(top_pairs.tolist(), columns=['projectid', 'vendor_name'])

    newResources = pd.merge(totals, majorVendor, on='projectid')

    return newResources
    

In [25]:
newResources = cleanResourcesDataset(resources)

In [26]:
newResources.head(n=3)

Unnamed: 0,projectid,item_quantity,item_total_quantity,vendor_name
0,00001ccc0e81598c4bd86bacb94d7acb,55.0,1225.44,Lakeshore Learning Materials
1,00002bff514104264a6b798356fdd893,7.0,399.74,Lakeshore Learning Materials
2,00002d691c05c51a5fdfbb2baef0ba25,122.0,774.8,AKJ Books


In [27]:
newResources.dtypes

projectid               object
item_quantity          float64
item_total_quantity    float64
vendor_name             object
dtype: object

In [28]:
newResources.shape

(484371, 4)

In [29]:
# Save clean dataset 
newResources.to_pickle('clean_resources.csv')

## Essays.csv



In [30]:
# Load dataset essays
essays = pd.read_csv('essays.csv')
essays.head(n=1)

Unnamed: 0,projectid,teacher_acctid,title,short_description,need_statement,essay
0,ffffc4f85b60efc5b52347df489d0238,c24011b20fc161ed02248e85beb59a90,iMath,It is imperative that teachers bring technolog...,My students need four iPods.,I am a fourth year fifth grade math teacher. T...


In [31]:
essays.shape

(664098, 6)

In [32]:
# Restrict the essays to projects posted after 2009 (2010 onwards).
# projects.csv is read again because it is the table that carries date_posted.
projects = pd.read_csv('projects.csv')
projects['year'] = pd.to_datetime(projects['date_posted']).dt.year
after2010 = projects.loc[projects['year'] > 2009]

# The inner join on projectid keeps only the essays of the kept projects.
essays = essays.merge(after2010['projectid'], on='projectid')

essays.shape

(484371, 6)

In [34]:
def cleanEssayDataset(essays, model_path=None):
    """Turn the raw essays table into numeric per-project features.

    Parameters
    ----------
    essays : pandas.DataFrame
        Must contain ``projectid``, ``teacher_acctid``, ``title``,
        ``short_description``, ``need_statement`` and ``essay``.
    model_path : str, optional
        Location of the saved sentiment model passed to ``load_model``.
        Defaults to the path that used to be hard-coded here; that default is
        specific to one machine, so pass an explicit path anywhere else.

    Returns
    -------
    pandas.DataFrame
        ``projectid`` plus the features ``title_v``, ``title_a``,
        ``numb_words_short_description``, ``most_common_short_description``
        and ``numb_words_essay``.

    Notes
    -----
    ``custom_loss``, ``load_fastext``, ``tokenize_text``, ``predict_sentiment``,
    ``clean_stopwords`` and ``lematize`` are not defined in this notebook
    (presumably they come from ``src.aux_functions`` - TODO confirm). Their
    return values are not used here, so they presumably rewrite the given
    column of the frame in place; the statement order below relies on that.
    """
    if model_path is None:
        # Previous hard-coded location, kept as the default for compatibility.
        model_path = '/Users/anacosta/Desktop/KaggleComp_DonorsChoose/' + 'saved_models/mlp.h5'

    # Replace the NaN by empty strings
    essays = essays.replace(np.nan, '', regex=True)

    # Load pretrained MLP model
    mlp_model = load_model(model_path,
                           custom_objects={'custom_loss': custom_loss})

    fast = load_fastext()

    # --- title -------------------------------------------------------------
    # tokenize the text
    tokenize_text(essays, 'title')

    # clean, lemmatize, get embeddings and predict sentiment
    predict_sentiment(essays, 'title', mlp_model, fast)

    # fill NaN rows with [0, 0, 0] so every title holds three values
    no_title = essays['title'].isnull()
    essays.loc[no_title, ['title']] = essays.loc[no_title, 'title'].apply(
        lambda x: list([0, 0, 0]))

    # separate the three values of each title into three columns
    aux = pd.DataFrame(essays['title'].to_list(), columns=['title_v', 'title_a', 'title_d'])

    # Append title_v and title_a to dataset (title_d is not kept)
    newEssays = pd.merge(aux[['title_v', 'title_a']], essays, left_index=True, right_index=True)

    # Remove Title
    del newEssays['title']
    print('- Title column cleaned')

    # --- short_description -------------------------------------------------
    newEssays = newEssays.replace(np.nan, '', regex=True)

    # tokenize the text
    tokenize_text(newEssays, 'short_description')

    # Get column numb_words_short_description
    # count the number of words (taken before stop-word removal below)
    newEssays['numb_words_short_description'] = newEssays['short_description'].apply(lambda x: len(x))

    clean_stopwords(newEssays, 'short_description')
    lematize(newEssays, 'short_description')

    # Count how often each word occurs over all short descriptions
    counter = Counter()
    _ = newEssays['short_description'].apply(lambda x: counter.update(x))

    # Get column most_common_short_description
    # NOTE(review): the variable is named "50" but the 100 most common entries
    # are requested - confirm which size is intended.
    # NOTE(review): Counter.most_common returns (word, count) tuples, so the
    # isin() below tests each token against tuples rather than against the
    # words themselves. Left unchanged because the feature values (and the
    # single-column .squeeze() that follows) depend on the current behavior;
    # confirm the intended definition of this feature before changing it.
    most_common50 = counter.most_common(100)

    most_common_short_description = newEssays['short_description'].apply(
        lambda x: pd.Series(x).isin(most_common50).value_counts())

    newEssays['most_common_short_description'] = newEssays[
                                                     'numb_words_short_description'] - most_common_short_description.squeeze()
    print('- Short_description column cleaned')

    newEssays = newEssays.replace(np.nan, '', regex=True)

    # --- essay -------------------------------------------------------------
    # tokenize the text
    tokenize_text(newEssays, 'essay')

    # Get column numb_words_essay
    # count the number of words
    newEssays['numb_words_essay'] = newEssays['essay'].apply(lambda x: len(x))

    # Remove the raw text columns and the teacher id
    del newEssays['short_description']
    del newEssays['need_statement']
    del newEssays['essay']
    del newEssays['teacher_acctid']

    # Rows emptied by the NaN -> '' replacement above count as 0
    newEssays['most_common_short_description'] = newEssays['most_common_short_description'].replace('', 0, regex=True)

    return newEssays

In [35]:
newEssays = cleanEssayDataset(essays)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.




- Title column cleaned
- Short_description column cleaned


In [36]:
newEssays.dtypes

title_v                          float64
title_a                          float64
projectid                         object
numb_words_short_description       int64
most_common_short_description    float64
numb_words_essay                   int64
dtype: object

In [37]:
# Saving clean essays
newEssays.to_pickle('clean_essays.csv')


## Merge all datasets

In [38]:
clean_Essays = pd.read_pickle('./clean_essays.csv')

clean_Projects = pd.read_pickle('./clean_projects.csv')

clean_Resources = pd.read_pickle('./clean_resources.csv')


In [39]:
clean_Essays.shape


(484371, 6)

In [40]:
clean_Projects.shape


(484371, 35)

In [41]:
clean_Resources.shape

(484371, 4)

In [42]:
mergedX = pd.merge(clean_Essays, clean_Projects, on='projectid')


In [43]:
mergedX = pd.merge(mergedX, clean_Resources, on='projectid')


In [44]:
mergedX.shape

(484371, 43)

In [45]:
mergedX.columns

Index(['title_v', 'title_a', 'projectid', 'numb_words_short_description',
       'most_common_short_description', 'numb_words_essay', 'teacher_acctid',
       'schoolid', 'school_ncesid', 'school_latitude', 'school_longitude',
       'school_city', 'school_state', 'school_zip', 'school_metro',
       'school_charter', 'school_magnet', 'school_year_round', 'school_nlns',
       'school_kipp', 'school_charter_ready_promise', 'teacher_prefix',
       'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
       'primary_focus_subject', 'primary_focus_area',
       'secondary_focus_subject', 'secondary_focus_area', 'resource_type',
       'poverty_level', 'grade_level', 'fulfillment_labor_materials',
       'total_price_excluding_optional_support',
       'total_price_including_optional_support', 'students_reached',
       'eligible_double_your_impact_match', 'eligible_almost_home_match',
       'year_posted', 'month_posted', 'day_posted', 'item_quantity',
       'item_total_quantity',

### Saving Train Dataset

In [48]:
train_mergedX = mergedX[mergedX['year_posted'] < 2014]

train_mergedX.shape


(439599, 43)

In [49]:
# Saving train dataset
train_mergedX.to_pickle('train_x.csv')


### Saving Test Dataset

In [50]:
idProjTest = pd.read_csv('./sampleSubmission.csv')
idProjTest.shape


(44772, 2)

In [51]:
test_mergedX = pd.merge(mergedX, idProjTest['projectid'], on='projectid')
test_mergedX.shape


(44772, 43)

In [52]:
# Saving test dataset
# NOTE: to_pickle writes a pickle, not CSV text, so despite the .csv suffix this
# file has to be loaded with pd.read_pickle rather than pd.read_csv.
test_mergedX.to_pickle('test_x.csv')



In [57]:
idProjTest = pd.read_pickle('test_x.csv')

In [59]:
idProjTest.dtypes

title_v                                   float64
title_a                                   float64
projectid                                  object
numb_words_short_description                int64
most_common_short_description             float64
numb_words_essay                            int64
teacher_acctid                             object
schoolid                                   object
school_ncesid                             float64
school_latitude                           float64
school_longitude                          float64
school_city                                object
school_state                               object
school_zip                                float64
school_metro                               object
school_charter                              int64
school_magnet                               int64
school_year_round                           int64
school_nlns                                 int64
school_kipp                                 int64
