In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv')
target = pd.read_csv('target.csv')

data = pd.concat([categorical,numerical,target],axis=1)

In [3]:
data.head()

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B,TARGET_D
0,IL,36,H,F,3,L,E,C,T,2,...,12.0,10.0,4,7.741935,95515,0,4,39,0,0.0
1,CA,14,H,M,3,L,G,A,S,1,...,25.0,25.0,18,15.666667,148535,0,2,1,0,0.0
2,NC,43,U,M,3,L,E,C,R,2,...,16.0,5.0,12,7.481481,15078,1,4,60,0,0.0
3,CA,44,U,F,3,L,E,C,R,2,...,11.0,10.0,9,6.8125,172556,1,4,41,0,0.0
4,FL,16,H,F,3,L,F,A,S,2,...,15.0,15.0,14,6.864865,7112,1,2,26,0,0.0


In [4]:
data_reg = data[data['TARGET_B']==1]

In [5]:
data_reg.head()

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B,TARGET_D
20,other,12,H,F,3,L,D,A,S,1,...,7.0,5.0,12,4.066667,82943,1,3,3,1,4.0
30,TX,35,H,M,3,L,D,A,T,1,...,10.0,7.0,9,6.181818,190313,1,3,14,1,7.0
45,other,24,H,F,3,L,D,C,C,1,...,6.0,5.0,3,4.857143,76585,1,3,11,1,5.0
78,CA,13,H,F,2,L,F,A,S,1,...,17.0,10.0,21,11.0,156378,0,2,2,1,13.0
93,GA,18,H,M,3,L,E,A,S,2,...,12.0,12.0,6,9.4,25641,1,3,22,1,10.0


In [6]:
# X,y split
X = data_reg.drop(['TARGET_B','TARGET_D'],axis=1)
y = data_reg['TARGET_D']
                   

In [7]:
# train, test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
test_indices = X_test.index

In [8]:
# scaling and numerical variables and encoder categorical variables
X_train_num = X_train.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(object)
X_test_num = X_test.select_dtypes(include = np.number)
X_test_cat = X_test.select_dtypes(object)

#Scale X_train and X_test
from sklearn.preprocessing import MinMaxScaler
transformer = MinMaxScaler().fit(X_train_num)
X_scaled_train = pd.DataFrame(transformer.transform(X_train_num),columns=X_train_num.columns)
X_scaled_test = pd.DataFrame(transformer.transform(X_test_num),columns=X_test_num.columns)

#Onehot Encode X_train and X_test
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(X_train_cat)
encoded_train = encoder.transform(X_train_cat).toarray()
encoded_train = pd.DataFrame(encoded_train,columns=encoder.get_feature_names_out())
encoded_test = encoder.transform(X_test_cat).toarray()
encoded_test = pd.DataFrame(encoded_test,columns=encoder.get_feature_names_out())

In [9]:
train_scaled = pd.concat([encoded_train,X_scaled_train],axis=1)
test_scaled = pd.concat([encoded_test,X_scaled_test],axis=1)

In [10]:
train_scaled.head()

Unnamed: 0,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.275862,0.006667,0.008151,0.029133,0.045455,0.003731,0.979851,0.0,0.0,0.262295
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.137931,0.022222,0.01407,0.042222,0.068182,0.02909,0.676623,0.0,0.333333,0.065574
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.034483,0.055556,0.020101,0.055556,0.204545,0.050787,0.728942,1.0,0.0,0.245902
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.310345,0.011111,0.013065,0.04,0.295455,0.017285,0.960838,1.0,0.333333,0.557377
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.034483,0.011111,0.011055,0.035556,0.204545,0.015498,0.934796,1.0,0.0,0.295082


In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

DT = DecisionTreeRegressor()
LR = LinearRegression()
RF = RandomForestRegressor()

model_pipeline = [DT, LR, RF]
model_names = ['Regression Tree', 'Linear Regression','Random Forest']
scores = {}
i=0

for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, train_scaled, y_train, cv=5))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)


{'Regression Tree': -0.07849109770520415, 'Linear Regression': 0.3820665682282217, 'Random Forest': 0.501180119954465}


In [12]:
val_scores = {}
i=0

test_scaled.columns = test_scaled.columns.astype(str)
for model in model_pipeline:
    model.fit(train_scaled, y_train)
    val_scores[model_names[i]] = model.score(test_scaled,y_test)
    i = i+1
print(val_scores)

{'Regression Tree': 0.1666891626371021, 'Linear Regression': 0.2895529501343259, 'Random Forest': 0.41192597740823356}


In [13]:
# using the best performing model Random Forest
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

LR =RandomForestRegressor().fit(train_scaled, y_train)
pred = LR.predict(test_scaled)

In [14]:
# evaluation the score
r2 = r2_score(y_test,pred)
mse = mean_squared_error(y_test,pred)
mae = mean_absolute_error(y_test,pred)
rmse = sqrt(mse)
print("R-squared:", r2, "MSE:", mse, "MAE:", mae, "RMSE:", rmse)


R-squared: 0.4151128256377131 MSE: 111.45883348256966 MAE: 4.778733642930856 RMSE: 10.557406569919038


In [15]:
# Adding predictions to dataframe
data['Predictions'] = np.nan  # Initialize the column with NaN
data.loc[test_indices, 'Predictions'] = pred  # Add predictions


In [17]:
data

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B,TARGET_D,Predictions
0,IL,36,H,F,3,L,E,C,T,2,...,10.0,4,7.741935,95515,0,4,39,0,0.0,
1,CA,14,H,M,3,L,G,A,S,1,...,25.0,18,15.666667,148535,0,2,1,0,0.0,
2,NC,43,U,M,3,L,E,C,R,2,...,5.0,12,7.481481,15078,1,4,60,0,0.0,
3,CA,44,U,F,3,L,E,C,R,2,...,10.0,9,6.812500,172556,1,4,41,0,0.0,
4,FL,16,H,F,3,L,F,A,S,2,...,15.0,14,6.864865,7112,1,2,26,0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,other,27,H,M,3,L,G,C,C,2,...,25.0,9,25.000000,184568,0,1,12,0,0.0,
95408,TX,24,H,M,3,L,F,A,C,1,...,20.0,9,20.000000,122706,1,1,2,0,0.0,
95409,MI,30,H,M,3,L,E,B,C,3,...,10.0,3,8.285714,189641,1,3,34,0,0.0,
95410,CA,24,H,F,2,L,F,A,C,1,...,18.0,4,12.146341,4693,1,4,11,1,18.0,


In [16]:
# how much will the donators give?
data['Predictions'].mean()

16.2886737874097