In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
np.set_printoptions(threshold=sys.maxsize)
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix

In [3]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')
final_data = pd.read_csv("final_data.csv")

### Concatinating our data and keeping TARGET_B=1

In [4]:
data = pd.concat([numerical, categorical, targets], axis = 1)
dataD = data[data['TARGET_B']==1]
dataD

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
20,2,62.000000,3,8,10,2,25,40,27,11,...,88,1,94,4,96,3,87,1,1,4.0
30,0,61.611649,5,9,0,1,37,58,16,8,...,90,4,93,1,95,12,90,4,1,7.0
45,0,66.000000,5,9,5,0,33,24,39,6,...,93,12,94,4,96,2,87,4,1,5.0
78,0,69.000000,6,9,0,0,34,20,54,2,...,90,1,95,3,95,11,90,1,1,13.0
93,1,73.000000,1,7,10,0,21,53,8,5,...,92,9,95,9,95,9,92,9,1,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95298,2,45.000000,5,9,0,0,45,28,37,9,...,89,6,96,1,96,1,86,8,1,20.0
95309,0,51.000000,5,6,1,1,32,43,24,7,...,93,10,94,2,95,12,93,10,1,15.0
95398,0,86.000000,5,9,0,1,32,21,26,9,...,89,6,95,11,96,2,87,11,1,3.0
95403,0,58.000000,4,9,0,0,24,46,20,6,...,90,3,93,12,96,1,90,3,1,10.0


In [5]:
XD = dataD.drop(['TARGET_D', 'TARGET_B'], axis=1)
yD = dataD['TARGET_D']
display(XD)
display(yD)

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
20,2,62.000000,3,8,10,2,25,40,27,11,...,36,1,88,1,94,4,96,3,87,1
30,0,61.611649,5,9,0,1,37,58,16,8,...,0,2,90,4,93,1,95,12,90,4
45,0,66.000000,5,9,5,0,33,24,39,6,...,31,10,93,12,94,4,96,2,87,4
78,0,69.000000,6,9,0,0,34,20,54,2,...,28,7,90,1,95,3,95,11,90,1
93,1,73.000000,1,7,10,0,21,53,8,5,...,24,10,92,9,95,9,95,9,92,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95298,2,45.000000,5,9,0,0,45,28,37,9,...,53,4,89,6,96,1,96,1,86,8
95309,0,51.000000,5,6,1,1,32,43,24,7,...,47,1,93,10,94,2,95,12,93,10
95398,0,86.000000,5,9,0,1,32,21,26,9,...,11,10,89,6,95,11,96,2,87,11
95403,0,58.000000,4,9,0,0,24,46,20,6,...,40,1,90,3,93,12,96,1,90,3


20        4.0
30        7.0
45        5.0
78       13.0
93       10.0
         ... 
95298    20.0
95309    15.0
95398     3.0
95403    10.0
95410    18.0
Name: TARGET_D, Length: 4843, dtype: float64

### Splitting for test train

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XD, yD, test_size=0.2, random_state=0)

In [7]:
X_train_cat = X_train.select_dtypes(include = object)
X_train_num =X_train.select_dtypes(include = np.number)
X_test_cat = X_test.select_dtypes(include = object)
X_test_num =X_test.select_dtypes(include = np.number)

### Encoding

In [8]:
# we OneHotEncode the categoricals so we can use the same dataset to perform a regression later (in the lab).
# it is not needed for a DecisionTree or RandomForest model
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(X_train_cat)

cols1 = encoder.get_feature_names_out(input_features=X_train_cat.columns)

X_train_cat_encode = pd.DataFrame(encoder.transform(X_train_cat).toarray(),columns=cols1)

X_train_cat_encode.reset_index(drop = True, inplace = True)
X_train_cat_encode

Unnamed: 0,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3869,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3870,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [9]:
cols = encoder.get_feature_names_out(input_features=X_test_cat.columns)

X_test_cat_encode = pd.DataFrame(encoder.transform(X_test_cat).toarray(),columns=cols)

X_test_cat_encode.reset_index(drop = True, inplace = True)
X_test_cat_encode

Unnamed: 0,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,...,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
966,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Normalizing the train and test data

In [13]:
transformer = MinMaxScaler().fit(X_train_num)
X_train_num_norm = transformer.transform(X_train_num)
print(X_train_num_norm.shape)
X_train_num_scale = pd.DataFrame(X_train_num_norm, index = X_train_num.index, columns=X_train_num.columns)
X_train_num_scale.head()
X_train_num_scale.reset_index(drop = True, inplace = True)

(3874, 330)


In [15]:
X_test_num_norm = transformer.transform(X_test_num)
print(X_test_num_norm.shape)
X_test_num_scale = pd.DataFrame(X_test_num_norm, index = X_test_num.index, columns=X_test_num.columns)
X_test_num_scale.head()
X_test_num_scale.reset_index(drop = True, inplace=True)

(969, 330)


In [16]:
X_train = pd.concat([X_train_num_scale,X_train_cat_encode], axis=1)
X_train.index
y_train.reset_index(drop = True, inplace = True)

In [17]:
X_test = pd.concat([X_test_num_scale,X_test_cat_encode], axis=1)
X_test
y_test.reset_index(drop = True, inplace = True)

## Using Feature Selections 

In [18]:
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm

In [19]:
X_added_constant = sm.add_constant(X_train)
X_added_constant
model = sm.OLS(y_train,X_added_constant).fit()
model.summary()

0,1,2,3
Dep. Variable:,TARGET_D,R-squared:,0.61
Model:,OLS,Adj. R-squared:,0.57
Method:,Least Squares,F-statistic:,15.52
Date:,"Mon, 26 Jun 2023",Prob (F-statistic):,0.0
Time:,22:14:43,Log-Likelihood:,-13323.0
No. Observations:,3874,AIC:,27360.0
Df Residuals:,3519,BIC:,29580.0
Df Model:,354,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.6778,4.787,1.604,0.109,-1.708,17.064
TCODE,-0.7162,4.039,-0.177,0.859,-8.636,7.203
AGE,1.3216,1.048,1.261,0.208,-0.734,3.377
INCOME,0.5473,0.590,0.928,0.353,-0.609,1.704
WEALTH1,1.1014,0.600,1.837,0.066,-0.074,2.277
HIT,-5.6384,3.692,-1.527,0.127,-12.876,1.599
MALEMILI,-2.5166,4.747,-0.530,0.596,-11.823,6.790
MALEVET,2.0194,2.192,0.921,0.357,-2.279,6.318
VIETVETS,-0.4178,1.742,-0.240,0.811,-3.834,2.998

0,1,2,3
Omnibus:,3608.267,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,713281.786
Skew:,3.854,Prob(JB):,0.0
Kurtosis:,69.026,Cond. No.,44900.0


In [20]:
drop_list_sm = ['MALEVET', 'WEALTH2', 'POP90C5', 'ETH7', 'ETH11', 'CHIL2', 'CHIL3', 'CHILC1', 'CHILC2', 'HHN1', 'HHN5', 'DW2', 'DW7', 'DW9', 'GENDER_other', 'GENDER_M', 'MAXRAMNT', 'HC16', 'HC13', 'HC11', 'HC6', 'ANC3']

In [21]:
# Use variance threshold to remove low-variance features
vt = VarianceThreshold(threshold=0.02)
X_vt = vt.fit(X_train)
X_vt_transform = vt.transform(X_train)

var_list = list(X_vt.get_support())
list(zip(X_train.columns, var_list))
[col[0] for col in zip(X_train.columns, var_list) if col[1] == False]

removed_columns_vt = pd.DataFrame(data=(X_train.columns,X_vt.variances_,X_vt.get_support()), index=('column_name','variance','statement')).T
removed_columns_vt = removed_columns_vt.loc[(removed_columns_vt['statement'] == False),:]
drop_list_vt = list(removed_columns_vt['column_name'])

In [22]:
final_drop = drop_list_vt + drop_list_sm

In [23]:
#Removing from  axis the columns that the p-value is high
X_train_final= X_train.drop(columns = final_drop)

In [24]:
X_test_final = X_test.drop(columns = final_drop)

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

In [27]:
model1 = SGDRegressor()
model2 = LinearRegression()
model3 = KNeighborsRegressor()

model_pipeline = [model1, model2, model3]
model_names = ['SGDRegressor', 'Linear Regression', 'KNN']
scores_train = {}

for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=5))
    scores_train[model_name] = mean_score
print(scores_train)

{'SGDRegressor': 0.4013417005273029, 'Linear Regression': 0.3820665682282203, 'KNN': 0.14122230927936832}
