# DonorsChoose.org Project Submission

## Introduction

This notebook presents a machine learning solution that predicts the approval status of project proposals submitted to DonorsChoose.org. The goal is to automate approval prediction in order to reduce processing time, improve decision-making, enhance the user experience, and optimize resource allocation.

In [None]:
import os
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import shap
import xgboost as xgb
from ipywidgets import interactive
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from plotly.subplots import make_subplots
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_sample_weight

## Data Loading

Load the dataset and take a preliminary look at the data structure.

In [None]:
# Read the project table and the per-project resource table from disk.
odf = pd.read_csv('train_data.csv')
odf_r = pd.read_csv('resources.csv')

# Work on copies so the frames as originally loaded (odf, odf_r) stay unmodified.
df = odf.copy()
df_r = odf_r.copy()

# Print column dtypes and non-null counts for a first look at both tables.
df.info()
df_r.info()

## Exploratory Data Analysis

Perform exploratory data analysis to understand the dataset better.

In [None]:
# Join the four essay fields into a single comma-separated text column,
# skipping essays that are missing for a given project.
essay_columns = ['project_essay_1', 'project_essay_2', 'project_essay_3', 'project_essay_4']
df['project_essay'] = df[essay_columns].apply(lambda x: ','.join(x.dropna()), axis=1)

# 1 when project_essay_3 is present, 0 when it is missing.
df['project_essay_3_4_present'] = df['project_essay_3'].notna().astype(int)

# 3x2 grid; rows 2 and 3 each hold one chart spanning both columns.
# With start_cell='bottom-left', row 1 is the bottom row of the figure.
fig = make_subplots(
    rows=3,
    cols=2,
    start_cell='bottom-left',
    specs=[[{}, {}], [{'colspan': 2}, None], [{'colspan': 2}, None]],
    subplot_titles=(
        'Presence of additional project essays vs Project Approval',
        'Project Grade Category vs Project Approval',
        'State vs Project approval',
        'Teacher Prefix vs Project approval',
    ),
)


def create_bar_chart(colname, rowpos, colpos):
    """Add a bar chart of the mean approval rate per value of ``colname``.

    Args:
        colname: Column of ``df`` to group by.
        rowpos: Subplot row in ``fig`` that receives the trace.
        colpos: Subplot column in ``fig`` that receives the trace.

    The bars are sorted by approval rate, highest first, and labelled with
    the rate rounded to three decimals.
    """
    # The original read an undefined name `data`; the columns plotted here
    # (including project_essay_3_4_present) live on `df`.
    d1 = (
        df.groupby(colname)['project_is_approved']
        .mean()
        .round(3)
        .sort_values(ascending=False)
        .reset_index()
    )
    fig.add_trace(
        go.Bar(
            x=d1[colname],
            y=d1['project_is_approved'],
            text=d1['project_is_approved'],  # already rounded to 3 decimals above
            textposition='auto',
            name=colname,
        ),
        row=rowpos,
        col=colpos,
    )


# (column, subplot row, subplot column) for each panel of the figure.
panels = [
    ('teacher_prefix', 3, 1),
    ('school_state', 2, 1),
    ('project_grade_category', 1, 2),
    ('project_essay_3_4_present', 1, 1),
]
for panel_column, panel_row, panel_col in panels:
    create_bar_chart(panel_column, panel_row, panel_col)
    # Update xaxis properties
    fig.update_xaxes(title_text=panel_column, row=panel_row, col=panel_col)

fig.update_layout(height=750)
fig.show()

## Data Preprocessing and Feature Engineering

Prepare the data for modeling by handling missing data, encoding categorical variables, and creating new features.

In [None]:
# Merge resource data with project data
# Each resource row gets its line cost (quantity * price); rows are then summed
# per project id and left-joined onto the project table, so every project row
# is kept.
df_r['total_price'] = df_r['quantity'].mul(df_r['price'])
df_r_g = df_r.groupby('id', as_index=False)[['quantity', 'total_price']].sum()
dft = pd.merge(df, df_r_g, on='id', how='left')

# Feature Engineering: Split categories, extract date features, and one-hot encode categorical variables
# The three helpers used below are defined outside this cell; each takes the
# frame plus a column name and returns the frame used for the next step.
dfc = dft.copy()
for category_column in ('project_subject_categories', 'project_subject_subcategories'):
    dfc = split_categories(dfc, category_column)

dfc = extract_date_features(dfc, 'project_submitted_datetime')

for nominal_column in ('teacher_prefix', 'school_state', 'project_grade_category'):
    dfc = one_hot_encode_with_prefix(dfc, nominal_column)


## Model Training and Hyperparameter Search

Train various machine learning models and perform hyperparameter tuning to optimize model performance.

In [None]:
# Split data into features and target
target = 'project_is_approved'
X = dfc.drop(columns=[target])
y = dfc[target].copy()

# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply the same transform to
# the test split so no test-set statistics leak into training.
# NOTE(review): StandardScaler needs numeric input - confirm that the feature
# engineering step leaves no raw text/id columns in dfc.
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

# Define and train models
model_log = LogisticRegression(class_weight='balanced')
model_tree = DecisionTreeClassifier(class_weight='balanced')
model_gauss = GaussianNB()
model_sgd = SGDClassifier()
model_log.fit(X_train, y_train)
model_tree.fit(X_train, y_train)
model_gauss.fit(X_train, y_train)
model_sgd.fit(X_train, y_train)

# Hyperparameter tuning using GridSearchCV
from sklearn.model_selection import GridSearchCV
param_grid_log = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']}
grid_log = GridSearchCV(model_log, param_grid_log, cv=5, scoring='accuracy')
grid_log.fit(X_train, y_train)

# The original dict literal had a stray double comma here (syntax error).
param_grid_tree = {'max_depth': [10, 20, 30], 'min_samples_split': [2, 5, 10]}
grid_tree = GridSearchCV(model_tree, param_grid_tree, cv=5, scoring='accuracy')
grid_tree.fit(X_train, y_train)

# Evaluate the best models from hyperparameter tuning
# NOTE: evaluate_model is defined in the "Model Evaluation" cell further down;
# that cell must have been run before these two calls.
best_log = grid_log.best_estimator_
best_tree = grid_tree.best_estimator_
evaluate_model(best_log, X_test, y_test)
evaluate_model(best_tree, X_test, y_test)


## Model Evaluation
Evaluate the performance of the trained models using various metrics.

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print classification metrics for ``model`` and plot its confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        y_test: True labels. Precision and recall are computed with
            ``average='binary'``, so the labels must be binary.

    Returns:
        None. Accuracy, precision, recall and the classification report are
        printed; the confusion matrix is shown as a seaborn heatmap.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    # The original literal was split across physical lines (a syntax error);
    # the line breaks are written as \n escapes here.
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    # fmt='d' renders the cell counts as integers.
    sns.heatmap(cm, annot=True, fmt='d')
    plt.show()

# Report test-set metrics for each directly fitted model, in the order
# logistic regression, decision tree, Gaussian naive Bayes, SGD.
for fitted_model in (model_log, model_tree, model_gauss, model_sgd):
    evaluate_model(fitted_model, X_test, y_test)


## Conclusion
Summarize the findings and suggest next steps for improving model performance.

The analysis and modeling process revealed several insights into factors that influence project approval. Moving forward, further exploration into feature engineering and advanced modeling techniques could potentially improve the predictive performance. Additionally, deploying the model into a production environment to provide real-time predictions could be beneficial for strategic decision-making.