<a href="https://colab.research.google.com/github/Nourk1/eCornellfolder/blob/main/PredictionModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score

# Load and preprocess the data
def load_and_merge_data(donations_file, demographics_file, zipcode_file):
    # Load data
    donations_df = pd.read_csv(donations_file)
    demographics_df = pd.read_csv(demographics_file)
    zipcode_df = pd.read_csv(zipcode_file)

    # Convert date column to datetime
    donations_df['Close Date'] = pd.to_datetime(donations_df['Close Date'])

    # Rename columns for consistency
    donations_df.rename(columns={'Billing Zip/Postal Code': 'Billing_Zipcode'}, inplace=True)
    zipcode_df.rename(columns={'zip_code': 'Zip_Code', 'zcta': 'ZCTA'}, inplace=True)

    # Ensure ZIP codes are strings
    donations_df['Billing_Zipcode'] = donations_df['Billing_Zipcode'].astype(str)
    zipcode_df['Zip_Code'] = zipcode_df['Zip_Code'].astype(str)

    # Merge datasets
    donations_merged = donations_df.merge(zipcode_df, left_on='Billing_Zipcode', right_on='Zip_Code', how='left')
    donations_merged = donations_merged.merge(demographics_df, on='ZCTA', how='left')

    return donations_merged, demographics_df

# Create labels for donors
def create_labels(donations_merged, model_date):
    model_date = pd.to_datetime(model_date)
    training_donors = donations_merged[donations_merged['Close Date'] < model_date]['Masked Account ID'].unique()
    evaluation_donors = donations_merged[
        (donations_merged['Close Date'] >= model_date) &
        (donations_merged['Close Date'] < f'{model_date.year + 1}-01-01')
    ]['Masked Account ID'].unique()
    return training_donors, evaluation_donors

# Gather features for a donor
def gather_features(donor_id, donations_merged, selected_features):
    donor_data = donations_merged[donations_merged['Masked Account ID'] == donor_id]
    donor_data = donor_data[donor_data['Amount'] > 0]  # Exclude $0 donations

    # Donation statistics
    num_donations = donor_data.shape[0]
    avg_donation = donor_data['Amount'].mean()
    max_donation = donor_data['Amount'].max()

    # Extract selected demographic features
    feature_values = []
    for feature in selected_features:
        feature_values.append(donor_data[feature].mean() if feature in donor_data.columns else np.nan)

    # Combine into feature vector
    return [num_donations, avg_donation, max_donation] + feature_values

# Train and evaluate the model
# Train and evaluate the model
# Train and evaluate the model
def train_and_evaluate(X, y, donations_merged, training_donors, evaluation_donors, selected_features):

    # Balance the dataset using random oversampling
    ros = RandomOverSampler(random_state=42)
    X_res, y_res = ros.fit_resample(X, y)

    # Convert to numpy arrays
    X_res = np.array(X_res)
    y_res = np.array(y_res)

    # Initialize Stratified K-Fold
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    accuracies, f1_scores, precision_scores, recall_scores = [], [], [], []

    for train_idx, test_idx in skf.split(X_res, y_res):
        X_train, X_test = X_res[train_idx], X_res[test_idx]
        y_train, y_test = y_res[train_idx], y_res[test_idx]

        # Train the model
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X_train, y_train)

        # Make predictions
        y_pred = rf.predict(X_test)

        # Collect metrics for evaluation

        f1_scores.append(f1_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"F1 Score: {np.mean(f1_scores):.4f}")
    print(f"Precision: {np.mean(precision_scores):.4f}")
    print(f"Recall: {np.mean(recall_scores):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


    # Make predictions on the evaluation set
    # Make predictions on the evaluation set
    evaluation_features = []
    for donor in evaluation_donors:
      feature_vector = gather_features(donor, donations_merged, selected_features)
      evaluation_features.append(feature_vector)
    # Predictions for next year (evaluation period)
    y_pred = rf.predict(evaluation_features)


    # Calculate the percentage of donors predicted to repeat
    predicted_repeat_donors = np.sum(y_pred)
    total_donors = len(y_pred)
    predicted_repeat_percentage = (predicted_repeat_donors / total_donors) * 100
    print("\nPercentage of Donors Predicted to Repeat Again:", predicted_repeat_percentage)

    return rf

# Main workflow
def main():
    # File paths
    donations_file = 'DonationsC5LA.csv'
    demographics_file = 'ACSSociodemographics.csv'
    zipcode_file = 'Zipcode.csv'

    # Load and preprocess data
    donations_merged, demographics_df = load_and_merge_data(donations_file, demographics_file, zipcode_file)

    # Define the model date
    MODEL_DATE = '2022-01-01'

    # Create labels
    training_donors, evaluation_donors = create_labels(donations_merged, MODEL_DATE)

    # Extract all demographic feature names (excluding ZCTA)
    demographic_features = [
        col for col in demographics_df.columns
        if demographics_df[col].dtype.kind in 'bifc' and col != 'ZCTA'
    ]

    # Compute feature importance to select top 3 demographic features
    all_training_features = []
    all_training_labels = []

    for donor in training_donors:
        feature_vector = gather_features(donor, donations_merged, demographic_features)
        label = 1 if donor in evaluation_donors else 0
        all_training_features.append(feature_vector)
        all_training_labels.append(label)

    # Train a preliminary model to get feature importance
    rf_preliminary = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_preliminary.fit(all_training_features, all_training_labels)
    importances = rf_preliminary.feature_importances_

    # Get the top 3 demographic features
    all_features = ['num_donations', 'avg_donation', 'max_donation'] + demographic_features
    importance_df = pd.DataFrame({'Feature': all_features, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    top_demographic_features = importance_df[importance_df['Feature'].isin(demographic_features)].head(3)['Feature'].tolist()
    print("Top 3 Demographic Features:", top_demographic_features)


    # Final feature extraction with selected demographic features
    final_training_features = []
    final_training_labels = []
    for donor in training_donors:
        feature_vector = gather_features(donor, donations_merged, top_demographic_features)
        label = 1 if donor in evaluation_donors else 0
        final_training_features.append(feature_vector)
        final_training_labels.append(label)

    # Train and evaluate the final model
    rf_model = train_and_evaluate(final_training_features, final_training_labels, donations_merged, training_donors, evaluation_donors, top_demographic_features)

# Run the main workflow
if __name__ == "__main__":
    main()


Top 3 Demographic Features: ['HH Income', 'Poor', '$125-150k']
Accuracy: 0.9812206572769953
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       213
           1       0.96      1.00      0.98       213

    accuracy                           0.98       426
   macro avg       0.98      0.98      0.98       426
weighted avg       0.98      0.98      0.98       426

F1 Score: 0.9622
Precision: 0.9281
Recall: 0.9991
Confusion Matrix:
[[205   8]
 [  0 213]]

Percentage of Donors Predicted to Repeat Again: 68.29268292682927


In [67]:

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Load and preprocess the data
def load_and_merge_data(donations_file, demographics_file, zipcode_file):
    # Load data
    donations_df = pd.read_csv(donations_file)
    demographics_df = pd.read_csv(demographics_file)
    zipcode_df = pd.read_csv(zipcode_file)

    # Convert date column to datetime
    donations_df['Close Date'] = pd.to_datetime(donations_df['Close Date'])

    # Rename columns for consistency
    donations_df.rename(columns={'Billing Zip/Postal Code': 'Billing_Zipcode'}, inplace=True)
    zipcode_df.rename(columns={'zip_code': 'Zip_Code', 'zcta': 'ZCTA'}, inplace=True)

    # Ensure ZIP codes are strings
    donations_df['Billing_Zipcode'] = donations_df['Billing_Zipcode'].astype(str)
    zipcode_df['Zip_Code'] = zipcode_df['Zip_Code'].astype(str)

    # Merge datasets
    donations_merged = donations_df.merge(zipcode_df, left_on='Billing_Zipcode', right_on='Zip_Code', how='left')
    donations_merged = donations_merged.merge(demographics_df, on='ZCTA', how='left')

    return donations_merged, demographics_df

# Create labels for donors
def create_labels(donations_merged, model_date):
    training_donors = donations_merged[donations_merged['Close Date'] < model_date]['Masked Account ID'].unique()
    evaluation_donors = donations_merged[
        (donations_merged['Close Date'] >= model_date) &
        (donations_merged['Close Date'] < '2022-01-01')
    ]['Masked Account ID'].unique()
    return training_donors, evaluation_donors

# Gather features for a donor
def gather_features(donor_id, donations_merged, selected_features):
    donor_data = donations_merged[donations_merged['Masked Account ID'] == donor_id]
    donor_data = donor_data[donor_data['Amount'] > 0]  # Exclude $0 donations

    # Donation statistics
    num_donations = donor_data.shape[0]
    avg_donation = donor_data['Amount'].mean()
    max_donation = donor_data['Amount'].max()

    # Extract selected demographic features
    feature_values = []
    for feature in selected_features:
        feature_values.append(donor_data[feature].mean() if feature in donor_data.columns else np.nan)

    # Combine into feature vector
    return [num_donations, avg_donation, max_donation] + feature_values

# Train and evaluate the model
def train_and_evaluate(X, y):
    # Balance the dataset using random oversampling
    ros = RandomOverSampler(random_state=42)
    X_res, y_res = ros.fit_resample(X, y)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

    # Train the model
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Make predictions and evaluate
    y_pred = rf.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

      # Calculate the percentage of donors predicted to repeat again
    predicted_repeat_donors = np.sum(y_pred)
    total_donors = len(y_pred)
    predicted_repeat_percentage = (predicted_repeat_donors / total_donors) * 100
    print("\nPercentage of Donors Predicted to Repeat Again:", predicted_repeat_percentage)


    return rf

# Main workflow
def main():
    # File paths
    donations_file = 'DonationsC5LA.csv'
    demographics_file = 'ACSSociodemographics.csv'
    zipcode_file = 'Zipcode.csv'

    # Load and preprocess data
    donations_merged, demographics_df = load_and_merge_data(donations_file, demographics_file, zipcode_file)

    # Define the model date
    MODEL_DATE = '2021-01-01'

    # Create labels
    training_donors, evaluation_donors = create_labels(donations_merged, MODEL_DATE)

    # Extract all demographic feature names (excluding ZCTA)
    demographic_features = [
        col for col in demographics_df.columns
        if demographics_df[col].dtype.kind in 'bifc' and col != 'ZCTA'
    ]

    # Compute feature importance to select top 3 demographic features
    all_training_features = []
    all_training_labels = []

    for donor in training_donors:
        feature_vector = gather_features(donor, donations_merged, demographic_features)
        label = 1 if donor in evaluation_donors else 0
        all_training_features.append(feature_vector)
        all_training_labels.append(label)

    # Train a preliminary model to get feature importance
    rf_preliminary = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_preliminary.fit(all_training_features, all_training_labels)
    importances = rf_preliminary.feature_importances_

    # Get the top 3 demographic features
    all_features = ['num_donations', 'avg_donation', 'max_donation'] + demographic_features
    importance_df = pd.DataFrame({'Feature': all_features, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    top_demographic_features = importance_df[importance_df['Feature'].isin(demographic_features)].head(3)['Feature'].tolist()
    print("Top 3 Demographic Features:", top_demographic_features)

    # Final feature extraction with selected demographic features
    final_training_features = []
    final_training_labels = []
    for donor in training_donors:
        feature_vector = gather_features(donor, donations_merged, top_demographic_features)
        label = 1 if donor in evaluation_donors else 0
        final_training_features.append(feature_vector)
        final_training_labels.append(label)

    # Train and evaluate the final model
    train_and_evaluate(final_training_features, final_training_labels)

# Run the main workflow
if __name__ == "__main__":
    main()





Top 3 Demographic Features: ['Poor Family', 'HH Income', 'Minority']
Accuracy: 0.9600997506234414
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.93      0.96       206
           1       0.93      0.99      0.96       195

    accuracy                           0.96       401
   macro avg       0.96      0.96      0.96       401
weighted avg       0.96      0.96      0.96       401

Confusion Matrix:
[[192  14]
 [  2 193]]

Percentage of Donors Predicted to Repeat Again: 51.6209476309227


In [72]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold

# Load and preprocess the data
def load_and_merge_data(donations_file, demographics_file, zipcode_file):
    donations_df = pd.read_csv(donations_file)
    demographics_df = pd.read_csv(demographics_file)
    zipcode_df = pd.read_csv(zipcode_file)

    donations_df['Close Date'] = pd.to_datetime(donations_df['Close Date'])
    donations_df.rename(columns={'Billing Zip/Postal Code': 'Billing_Zipcode'}, inplace=True)
    zipcode_df.rename(columns={'zip_code': 'Zip_Code', 'zcta': 'ZCTA'}, inplace=True)

    donations_df['Billing_Zipcode'] = donations_df['Billing_Zipcode'].astype(str)
    zipcode_df['Zip_Code'] = zipcode_df['Zip_Code'].astype(str)

    donations_merged = donations_df.merge(zipcode_df, left_on='Billing_Zipcode', right_on='Zip_Code', how='left')
    donations_merged = donations_merged.merge(demographics_df, on='ZCTA', how='left')

    return donations_merged, demographics_df

# Create labels for donors
def create_labels(donations_merged, model_date):
    model_date = pd.to_datetime(model_date)
    training_donors = donations_merged[donations_merged['Close Date'] < model_date]['Masked Account ID'].unique()
    evaluation_donors = donations_merged[
        (donations_merged['Close Date'] >= model_date) &
        (donations_merged['Close Date'] < f'{model_date.year + 1}-01-01')
    ]['Masked Account ID'].unique()
    return training_donors, evaluation_donors

# Gather features for a donor
def gather_features(donor_id, donations_merged, selected_features):
    donor_data = donations_merged[donations_merged['Masked Account ID'] == donor_id]
    donor_data = donor_data[donor_data['Amount'] > 0]  # Exclude $0 donations

    # Donation statistics
    num_donations = donor_data.shape[0]
    avg_donation = donor_data['Amount'].mean()
    max_donation = donor_data['Amount'].max()

    # Extract selected demographic features
    feature_values = []
    for feature in selected_features:
        feature_values.append(donor_data[feature].mean() if feature in donor_data.columns else np.nan)

    # Combine into feature vector
    return [num_donations, avg_donation, max_donation] + feature_values

# Train and evaluate the model
def train_and_evaluate(X, y, donations_merged, training_donors, evaluation_donors, selected_features):

    # Balance the dataset using random oversampling
    ros = RandomOverSampler(random_state=42)
    X_res, y_res = ros.fit_resample(X, y)

    # Convert to numpy arrays
    X_res = np.array(X_res)
    y_res = np.array(y_res)

    # Initialize Stratified K-Fold
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    accuracies, f1_scores, precision_scores, recall_scores = [], [], [], []

    # Training phase: Evaluate the training model using cross-validation
    for train_idx, test_idx in skf.split(X_res, y_res):
        X_train, X_test = X_res[train_idx], X_res[test_idx]
        y_train, y_test = y_res[train_idx], y_res[test_idx]

        # Train the model
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X_train, y_train)

        # Make predictions
        y_pred = rf.predict(X_test)

        # Collect metrics for training evaluation
        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred))
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))

    # Print results for training run
    print("\nTraining Run Metrics:")
    print(f"Accuracy: {np.mean(accuracies):.4f}")
    print(f"F1 Score: {np.mean(f1_scores):.4f}")
    print(f"Precision: {np.mean(precision_scores):.4f}")
    print(f"Recall: {np.mean(recall_scores):.4f}")

    # Evaluation phase: Make predictions on the evaluation set (next year's data)
    evaluation_features = []
    for donor in evaluation_donors:
        feature_vector = gather_features(donor, donations_merged, selected_features)
        evaluation_features.append(feature_vector)

    # Predictions for next year (evaluation period)
    y_pred = rf.predict(evaluation_features)
    print("\nEvaluation Run Predictions (Next Year's Data):", y_pred)

    # Calculate the percentage of donors predicted to repeat
    predicted_repeat_donors = np.sum(y_pred)
    total_donors = len(y_pred)
    predicted_repeat_percentage = (predicted_repeat_donors / total_donors) * 100
    print("\nPercentage of Donors Predicted to Repeat Again:", predicted_repeat_percentage)

    return rf

# Main workflow
def main():
    # File paths
    donations_file = 'DonationsC5LA.csv'
    demographics_file = 'ACSSociodemographics.csv'
    zipcode_file = 'Zipcode.csv'

    # Load and preprocess data
    donations_merged, demographics_df = load_and_merge_data(donations_file, demographics_file, zipcode_file)

    # Define the model date
    MODEL_DATE = '2021-01-01'

    # Create labels
    training_donors, evaluation_donors = create_labels(donations_merged, MODEL_DATE)

    # Extract all demographic feature names (excluding ZCTA)
    demographic_features = [
        col for col in demographics_df.columns
        if demographics_df[col].dtype.kind in 'bifc' and col != 'ZCTA'
    ]

    # Compute feature importance to select top 3 demographic features
    all_training_features = []
    all_training_labels = []

    for donor in training_donors:
        feature_vector = gather_features(donor, donations_merged, demographic_features)
        label = 1 if donor in evaluation_donors else 0
        all_training_features.append(feature_vector)
        all_training_labels.append(label)

    # Train a preliminary model to get feature importance
    rf_preliminary = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_preliminary.fit(all_training_features, all_training_labels)
    importances = rf_preliminary.feature_importances_

    # Get the top 3 demographic features
    all_features = ['num_donations', 'avg_donation', 'max_donation'] + demographic_features
    importance_df = pd.DataFrame({'Feature': all_features, 'Importance': importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    top_demographic_features = importance_df[importance_df['Feature'].isin(demographic_features)].head(3)['Feature'].tolist()
    print("Top 3 Demographic Features:", top_demographic_features)

    # Final feature extraction with selected demographic features
    final_training_features = []
    final_training_labels = []
    for donor in training_donors:
        feature_vector = gather_features(donor, donations_merged, top_demographic_features)
        label = 1 if donor in evaluation_donors else 0
        final_training_features.append(feature_vector)
        final_training_labels.append(label)

    # Train and evaluate the final model
    rf_model = train_and_evaluate(final_training_features, final_training_labels, donations_merged, training_donors, evaluation_donors, top_demographic_features)

# Run the main workflow
if __name__ == "__main__":
    main()


Top 3 Demographic Features: ['Poor Family', 'HH Income', 'Minority']

Training Run Metrics:
Accuracy: 0.9621
F1 Score: 0.9635
Precision: 0.9295
Recall: 1.0000

Evaluation Run Predictions (Next Year's Data): [1 1 0 1 0 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 0 0 0 0 0 1 0 1 0 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 0 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 0 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 0 1 1 0 0 1 1 1 0 1 0 0
 0 1 0 0 0 0 0 0]

Percentage of Donors Predicted to Repeat Again: 65.21739130434783
