In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
categorical_data = pd.read_csv('categorical.csv')
numerical_data = pd.read_csv('numerical.csv')
target_data = pd.read_csv('target.csv')

In [5]:
categorical_data.columns

Index(['STATE', 'CLUSTER', 'HOMEOWNR', 'GENDER', 'DATASRCE', 'RFA_2R',
       'RFA_2A', 'GEOCODE2', 'DOMAIN_A', 'DOMAIN_B', 'ODATEW_YR', 'ODATEW_MM',
       'DOB_YR', 'DOB_MM', 'MINRDATE_YR', 'MINRDATE_MM', 'MAXRDATE_YR',
       'MAXRDATE_MM', 'LASTDATE_YR', 'LASTDATE_MM', 'FIRSTDATE_YR',
       'FIRSTDATE_MM'],
      dtype='object')

In [6]:
numerical_data.columns

Index(['TCODE', 'AGE', 'INCOME', 'WEALTH1', 'HIT', 'MALEMILI', 'MALEVET',
       'VIETVETS', 'WWIIVETS', 'LOCALGOV',
       ...
       'CARDGIFT', 'MINRAMNT', 'MAXRAMNT', 'LASTGIFT', 'TIMELAG', 'AVGGIFT',
       'CONTROLN', 'HPHONE_D', 'RFA_2F', 'CLUSTER2'],
      dtype='object', length=315)

In [7]:
target_data.columns

Index(['TARGET_B', 'TARGET_D'], dtype='object')

In [9]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = pd.concat([categorical_data, numerical_data], axis=1)

# One-hot encode categorical features
X = pd.get_dummies(X, columns=categorical_data.columns)

# Extracting target variable (y)
y = target_data['TARGET_B']

# Performing train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Using RandomOverSampler to upsample the minority class
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Initializing and training the Random Forest Classifier on the resampled data
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')




Accuracy: 0.9477545459309333
Confusion Matrix:
[[18084    21]
 [  976     2]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     18105
           1       0.09      0.00      0.00       978

    accuracy                           0.95     19083
   macro avg       0.52      0.50      0.49     19083
weighted avg       0.90      0.95      0.92     19083



The model has high accuracy of 94% but struggles with correctly identifying the minority class.
In a business scenario, this could lead to missing important positive instances, impacting decision-making.

In [14]:
#Evaluation matrix
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'AUC-ROC: {roc_auc}')


Precision: 0.08695652173913043
Recall: 0.002044989775051125
F1-Score: 0.003996003996003996
AUC-ROC: 0.5703642803934558


In [15]:
from sklearn.utils import resample

# Concatenate X_train and y_train to create a training dataset
train_data = pd.concat([X_train, y_train], axis=1)

majority_class = train_data[train_data['TARGET_B'] == 0]
minority_class = train_data[train_data['TARGET_B'] == 1]

# Upsampling the minority class
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)

# Combining majority class with upsampled minority class
upsampled_data = pd.concat([majority_class, minority_upsampled])

# Separated features (X_upsampled) and target variable (y_upsampled) in the upsampled data
X_upsampled = upsampled_data.drop('TARGET_B', axis=1)
y_upsampled = upsampled_data['TARGET_B']



In [None]:
#We used above Upsampling the minority class in order to predict better the TARGET_B

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the Random Forest Classifier on the upsampled data
clf_upsampled = RandomForestClassifier(random_state=42)
clf_upsampled.fit(X_upsampled, y_upsampled)

# Make predictions on the original test set
y_pred_upsampled = clf_upsampled.predict(X_test)

# Evaluate the model
accuracy_upsampled = accuracy_score(y_test, y_pred_upsampled)
conf_matrix_upsampled = confusion_matrix(y_test, y_pred_upsampled)
class_report_upsampled = classification_report(y_test, y_pred_upsampled)

# Display the results
print(f'Accuracy (Upsampled): {accuracy_upsampled}')
print(f'Confusion Matrix (Upsampled):\n{conf_matrix_upsampled}')
print(f'Classification Report (Upsampled):\n{class_report_upsampled}')


Accuracy (Upsampled): 0.947702143268878
Confusion Matrix (Upsampled):
[[18083    22]
 [  976     2]]
Classification Report (Upsampled):
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     18105
           1       0.08      0.00      0.00       978

    accuracy                           0.95     19083
   macro avg       0.52      0.50      0.49     19083
weighted avg       0.90      0.95      0.92     19083



#Model Predictions and Business Impact:

1.False Positives (Predicting a donor when they are not):

This could lead to unnecessary marketing expenses, efforts, and resources being allocated to individuals who are not interested in donating. The business might spend money on outreach or incentives that do not result in actual contributions.
Cost Consideration: If the cost associated with these false positives is high, it might have a significant impact on the budget and resource allocation.

2.False Negatives (Failing to identify an actual donor):

This could result in missed opportunities for fundraising. Potential donors who are not identified might not receive targeted campaigns or engagement efforts, leading to a loss in potential contributions.
Cost Consideration: If the business heavily relies on donations and the potential contribution from each donor is substantial, missing out on these opportunities could result in a significant revenue loss.
Equality of Costs:

The model's performance metrics indicate a low recall for the positive class, suggesting that it is struggling to identify actual donors. This means there is a higher likelihood of false negatives (missing actual donors).

If the cost of missing out on potential donors (false negatives) is deemed to be higher than the cost associated with targeting non-donors (false positives), the business may consider adjusting the model's threshold to increase recall, even at the expense of precision.

In [21]:
import pandas as pd

# Reset index for each dataset
X_reset = X.reset_index(drop=True)
X_upsampled_reset = X_upsampled.reset_index(drop=True)
categorical_data_reset = categorical_data.reset_index(drop=True)

# Concatenate along columns (axis=1)
concatenated_data = pd.concat([X_reset, X_upsampled_reset, categorical_data_reset], axis=1)


In [24]:
concatenated_data.head()



Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,0.0,60.0,5.0,9.0,0.0,0.0,39.0,34.0,18.0,10.0,...,37.0,12.0,92.0,8.0,94.0,2.0,95.0,12.0,89.0,11.0
1,1.0,46.0,6.0,9.0,16.0,0.0,15.0,55.0,11.0,6.0,...,52.0,2.0,93.0,10.0,95.0,12.0,95.0,12.0,93.0,10.0
2,1.0,61.611649,3.0,1.0,2.0,0.0,20.0,29.0,33.0,6.0,...,0.0,2.0,91.0,11.0,92.0,7.0,95.0,12.0,90.0,1.0
3,0.0,70.0,1.0,4.0,2.0,0.0,23.0,14.0,31.0,3.0,...,28.0,1.0,87.0,11.0,94.0,11.0,95.0,12.0,87.0,2.0
4,0.0,78.0,3.0,2.0,60.0,1.0,28.0,9.0,53.0,26.0,...,20.0,1.0,93.0,10.0,96.0,1.0,96.0,1.0,79.0,3.0


In [26]:
print(concatenated_data.columns)


Index(['TCODE', 'AGE', 'INCOME', 'WEALTH1', 'HIT', 'MALEMILI', 'MALEVET',
       'VIETVETS', 'WWIIVETS', 'LOCALGOV',
       ...
       'DOB_YR', 'DOB_MM', 'MINRDATE_YR', 'MINRDATE_MM', 'MAXRDATE_YR',
       'MAXRDATE_MM', 'LASTDATE_YR', 'LASTDATE_MM', 'FIRSTDATE_YR',
       'FIRSTDATE_MM'],
      dtype='object', length=1334)


In [19]:
# Identifying false negatives
false_negatives = (y_test == 1) & (y_pred == 0)

# Check if 'TARGET_D' is present in categorical_data
if 'TARGET_D' in categorical_data.columns:
    # Calculate potential loss (sum of TARGET_D for false negatives)
    potential_loss = categorical_data.loc[false_negatives, 'TARGET_D'].sum()
    print(f"Potential Loss: ${potential_loss:.2f}")
else:
    print("Column 'TARGET_D' not found in the dataset.")


Column 'TARGET_D' not found in the dataset.


In [None]:
print("Length of target_data['TARGET_B']:", len(target_data['TARGET_B']))
print("Length of y_pred:", len(y_pred))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_resampled, y_train_resampled)


In [None]:
print("Length of false_negatives:", len(false_negatives))
print("Length of y_pred:", len(y_pred))


In [None]:
print("Length of X_test:", len(X_test))
print("Length of y_pred:", len(y_pred))


In [None]:
print("Index of X_test:", X_test.index)
print("Index of target_data['TARGET_B']:", target_data['TARGET_B'].index)


In [None]:
#there are different lengths of the arrays from false_negatives and "Y-prediction"

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.ensemble import RandomForestClassifier


# Defining the classifier
clf = RandomForestClassifier(random_state=42)

# Defining the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Creating a scorer based on F1-score
scorer = make_scorer(f1_score)

# Initializing GridSearchCV
grid_search = GridSearchCV(clf, param_grid, scoring=scorer, cv=5)

# Fitting the model
grid_search.fit(X_train, y_train)

# Getting the best parameters
best_params = grid_search.best_params_

# Training the model with the best parameters
best_clf = RandomForestClassifier(random_state=42, **best_params)
best_clf.fit(X_train, y_train)


In imbalanced classification problems, metrics like precision, recall, and F1-score are more relevant than accuracy. 

We will train our model using GridSearchCV classification model in order to maximize the error metric.

A model can achieve high accuracy by simply predicting the majority class, without effectively capturing the minority class.

A more relevant error metric in such cases is often the F1-score, especially when you care about balancing precision and recall. The F1-score considers both false positives and false negatives, making it a suitable choice for imbalanced datasets.
