# From Previous Lesson

In [7]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model

In [8]:
#this data has been treated with some log transformations like we have seen yesterday
data = pd.read_csv('regression_data_treated.csv')
data.head()

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,gender,TARGET_D
0,15.5,9.510467,17.343389,4.181353,Male,21.0
1,3.08,6.45859,16.230984,4.150313,Male,3.0
2,7.5,8.780623,18.047227,4.205057,Female,20.0
3,6.7,6.200492,11.73711,4.055333,Male,5.0
4,8.785714,7.477123,12.494862,4.088969,Female,10.0


In [9]:
#separate the features from the labels
y = data['TARGET_D']
X = data.drop(['TARGET_D'], axis=1)

In [10]:
X.head()

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,gender
0,15.5,9.510467,17.343389,4.181353,Male
1,3.08,6.45859,16.230984,4.150313,Male
2,7.5,8.780623,18.047227,4.205057,Female
3,6.7,6.200492,11.73711,4.055333,Male
4,8.785714,7.477123,12.494862,4.088969,Female


In [11]:
#categorical features and numerical ones are going to be treated differently
X_num = X.select_dtypes(include = np.number)
X_cat = X.select_dtypes(include = object)

In [12]:
# Normalizing and Standardizing data - only for numerical variables

In [13]:
X_num.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AVGGIFT,4670.0,11.705325,10.097916,2.26087,7.125938,10.0,14.333333,450.0
HV1_log,4670.0,8.036391,1.269845,5.177302,7.233936,7.842845,8.60508,24.406461
IC1_transformed,4670.0,15.67538,2.239342,7.707891,14.07158,15.615699,17.238244,20.892798
IC5_transformed,4670.0,4.178176,0.052677,3.91319,4.143215,4.177919,4.211933,4.402811


In [14]:
from sklearn.preprocessing import MinMaxScaler
MinMaxtransformer = MinMaxScaler().fit(X_num)
X_normalized = MinMaxtransformer.transform(X_num)
print(type(X_normalized))
X_normalized = pd.DataFrame(X_normalized,columns=X_num.columns)
display(X_normalized.head())
print(type(X_normalized))

<class 'numpy.ndarray'>


Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed
0,0.029569,0.225343,0.730798,0.547694
1,0.001829,0.066633,0.646428,0.4843
2,0.011701,0.187388,0.78418,0.596107
3,0.009915,0.05321,0.305593,0.290313
4,0.014573,0.119601,0.363064,0.35901


<class 'pandas.core.frame.DataFrame'>


In [15]:
X_normalized.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AVGGIFT,4670.0,0.021094,0.022553,0.0,0.010866,0.017285,0.026963,1.0
HV1_log,4670.0,0.148685,0.066037,0.0,0.106954,0.13862,0.178259,1.0
IC1_transformed,4670.0,0.604289,0.169841,0.0,0.482649,0.599762,0.722823,1.0
IC5_transformed,4670.0,0.541205,0.107588,0.0,0.469801,0.54068,0.610151,1.0


In [16]:
## alternatively you could use StandardScaler:
# from sklearn.preprocessing import StandardScaler
# Standardtransformer = StandardScaler().fit(X_num)
# X_standardized = Standardtransformer.transform(X_num)
# print(X_standardized.shape)
# X_standardized = pd.DataFrame(X_standardized,columns=X_num.columns)
# X_standardized.head()
# X_standardized.describe().T

# Lesson 1.09

In [17]:
# Activity 1
# Discussion on categorical variables: how would you currently handle categories? Should you have many or few?

# Lesson 1 : categorical variables

In [18]:
X_cat = data.select_dtypes(include = object)

In [19]:
# in this case there is only one categorical variable
X_cat.head()

Unnamed: 0,gender
0,Male
1,Male
2,Female
3,Male
4,Female


In [20]:
X_cat['gender'].value_counts()

Female    2664
Male      1895
U          111
Name: gender, dtype: int64

In [21]:
#one hot encoding is a way to turn categorical variables into multiple numerical columns
from sklearn.preprocessing import OneHotEncoder
#encoder = OneHotEncoder().fit(X_cat)
encoder = OneHotEncoder(drop='first').fit(X_cat)
print(encoder.categories_)
encoded = encoder.transform(X_cat).toarray()
print(encoded)
#onehot_encoded = pd.DataFrame(encoded,columns=['Female', 'Male','U'])
onehot_encoded = pd.DataFrame(encoded,columns=['Male', 'U'])
onehot_encoded.head(20)

[array(['Female', 'Male', 'U'], dtype=object)]
[[1. 0.]
 [1. 0.]
 [0. 0.]
 ...
 [0. 0.]
 [1. 0.]
 [1. 0.]]


Unnamed: 0,Male,U
0,1.0,0.0
1,1.0,0.0
2,0.0,0.0
3,1.0,0.0
4,0.0,0.0
5,1.0,0.0
6,0.0,0.0
7,1.0,0.0
8,0.0,0.0
9,0.0,0.0


In [22]:
#because one of the variables can be deduced from the others, no point in keeping all these columns around
# onehot_encoded = onehot_encoded.drop(['Female'],axis=1)
# onehot_encoded.head()

In [23]:
#label encoding keeps just one column and makes it numerical, but watch out: you may be introducing unintended semantics
#if you want to control which number gets assigned to which category, you can use OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
label_encoded = LabelEncoder().fit(X_cat).transform(X_cat) # ordered wrt value counts
label_encoded = pd.DataFrame(label_encoded,columns=X_cat.columns)
display(label_encoded.head(20))
label_encoded['gender'].value_counts()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Unnamed: 0,gender
0,1
1,1
2,0
3,1
4,0
5,1
6,0
7,1
8,0
9,0


0    2664
1    1895
2     111
Name: gender, dtype: int64

In [24]:
# also look at OrdinalEncoder

In [25]:
# Activity 2
# do it yourself. Which one do you think is more appropriate here?

# Lesson 2 : train-test split

In [26]:
# let's merge all this information together into a single dataset with all features, now numerical
X = pd.concat([X_normalized, onehot_encoded], axis=1)  # np.concatenate()

In [27]:
X.head()

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,Male,U
0,0.029569,0.225343,0.730798,0.547694,1.0,0.0
1,0.001829,0.066633,0.646428,0.4843,1.0,0.0
2,0.011701,0.187388,0.78418,0.596107,0.0,0.0
3,0.009915,0.05321,0.305593,0.290313,1.0,0.0
4,0.014573,0.119601,0.363064,0.35901,0.0,0.0


In [28]:
y = data['TARGET_D']
y.head()

0    21.0
1     3.0
2    20.0
3     5.0
4    10.0
Name: TARGET_D, dtype: float64

In [29]:
# train test split is the way ML generates its claim to fame: 
# we build the model on a portion of the data but we then validate it in 
# another "fresh" portion
# our model has no opportunity to "cheat": it must accurately guess the values 
# in the "fresh" dataset that it never saw before
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3736, 6)
(934, 6)
(3736,)
(934,)


In [31]:
X_train.head()

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,Male,U
2858,0.013711,0.122383,0.78418,0.596107,0.0,0.0
4238,0.012499,0.126146,0.63344,0.5625,1.0,0.0
831,0.011887,0.147172,0.826925,0.776779,1.0,0.0
2285,0.013252,0.112634,0.504109,0.570522,0.0,0.0
1417,0.009468,0.175662,0.800766,0.604974,1.0,0.0


In [32]:
y_train.head()

2858     5.0
4238    10.0
831     10.0
2285    11.0
1417    10.0
Name: TARGET_D, dtype: float64

In [33]:
#we train/fit our model like yesterday
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)



In [34]:
from sklearn.metrics import r2_score
predictions = lm.predict(X_train)
r2_score(y_train, predictions)

0.8617916442966341

In [35]:
# But now we evaluate it in the TEST portion of the data, that we did not use for training.
# This way we know our model is genuinely guessing our donations, not just repeating the values it has seen in the training data


predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test)

0.7835732034742453

In [36]:
y_test[:5]

1714    11.0
2481    12.0
3646    21.0
4139     6.0
4062    30.0
Name: TARGET_D, dtype: float64

In [37]:
predictions_test[:5]

array([15.04394174, 11.45137521, 20.78477349,  9.73320677, 32.5016748 ])

In [38]:
from sklearn.metrics import mean_squared_error
mse=mean_squared_error(y_test,predictions_test)
mse

20.79009922350801

In [39]:
rmse = np.sqrt(mean_squared_error(y_test,predictions_test))
rmse

4.559616126770763

In [40]:
y_test.mean()

14.877537473233403

In [41]:
# to make predictions on the new data, 
# we have to process the data (X features) in the same way. 

In [42]:
# Activity 3a
# take the file "data_to_predict" and find out how much those donors could be worth
# watch out: is the data in the same format as what your training data currently looks like?
data_for_p = pd.read_csv('data_to_predict.csv')
display(data_for_p.head())

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,gender
0,8.714286,7.209011,17.666103,4.166268,Male
1,16.5,9.317831,17.35831,4.22462,Male
2,15.1,8.402447,14.627447,4.186322,Male
3,4.848485,6.440114,12.270634,4.144954,Female
4,19.0,8.607769,17.977822,4.280958,Female


In [52]:
X_num1 = X.select_dtypes(include = np.number)
X_cat1 = X.select_dtypes(include = object)

In [54]:
X_num1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AVGGIFT,4670.0,0.021094,0.022553,0.0,0.010866,0.017285,0.026963,1.0
HV1_log,4670.0,0.148685,0.066037,0.0,0.106954,0.13862,0.178259,1.0
IC1_transformed,4670.0,0.604289,0.169841,0.0,0.482649,0.599762,0.722823,1.0
IC5_transformed,4670.0,0.541205,0.107588,0.0,0.469801,0.54068,0.610151,1.0
Male,4670.0,0.405782,0.491095,0.0,0.0,0.0,1.0,1.0
U,4670.0,0.023769,0.152344,0.0,0.0,0.0,0.0,1.0


In [63]:
from sklearn.preprocessing import MinMaxScaler
MinMaxtransformer1 = MinMaxScaler().fit(X_num1)
X_normalized1 = MinMaxtransformer.transform(X_num1)
print(type(X_normalized1))
X_normalized1 = pd.DataFrame(X_normalized1,columns=X_num1.columns)
display(X_normalized1.head())
print(type(X_normalized1))

<class 'numpy.ndarray'>


Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,Male,U
0,0.029569,0.225343,0.730798,0.547694,1.0,0.0
1,0.001829,0.066633,0.646428,0.4843,1.0,0.0
2,0.011701,0.187388,0.78418,0.596107,0.0,0.0
3,0.009915,0.05321,0.305593,0.290313,1.0,0.0
4,0.014573,0.119601,0.363064,0.35901,0.0,0.0


<class 'pandas.core.frame.DataFrame'>


In [62]:
X_normalized1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AVGGIFT,4670.0,0.021094,0.022553,0.0,0.010866,0.017285,0.026963,1.0
HV1_log,4670.0,0.148685,0.066037,0.0,0.106954,0.13862,0.178259,1.0
IC1_transformed,4670.0,0.604289,0.169841,0.0,0.482649,0.599762,0.722823,1.0
IC5_transformed,4670.0,0.541205,0.107588,0.0,0.469801,0.54068,0.610151,1.0
Male,4670.0,0.405782,0.491095,0.0,0.0,0.0,1.0,1.0
U,4670.0,0.023769,0.152344,0.0,0.0,0.0,0.0,1.0


In [65]:
X_cat1 = data_for_p.select_dtypes(include = object)
X_cat1.head(20)

Unnamed: 0,gender
0,Male
1,Male
2,Male
3,Female
4,Female
5,Male
6,Female
7,Female
8,Female
9,Male


In [66]:
X_cat1['gender'].value_counts()

Male      16
Female    14
U          1
Name: gender, dtype: int64

In [70]:
data1 = data_for_p.drop(labels=['U'], axis=0)

KeyError: "['U'] not found in axis"

In [44]:
#making the new data look like the transformed one may not be the most immediate exercise...

#we make the same separation into numerical and categorical
X_for_p_num = data_for_p.select_dtypes(include = np.number)
X_for_p_cat = data_for_p.select_dtypes(include = object)



In [45]:
X_for_p_cat.head()

Unnamed: 0,gender
0,Male
1,Male
2,Male
3,Female
4,Female


In [46]:
X_for_p_cat.shape

(31, 1)

In [47]:
X_for_p_cat['gender'].value_counts()

Male      16
Female    14
U          1
Name: gender, dtype: int64

In [48]:
# for normalization and one hot encoding we need to make sure we remain consisten with the training data:
# If we MinMax scale just this piece of data, a "1" in this dataset means something very different from a 1 in the original dataset
# If we one-hot-encode in this dataset, the order of the columns (Unknown, Male, Female) may turn out different
#to avoid this, we use the same transformers we had already defined before, we do not fit them again.
encoded_for_p = encoder.transform(X_for_p_cat).toarray()
encoded_for_p
encoder.categories_
onehot_encoded_for_p = pd.DataFrame(encoded_for_p)
# onehot_encoded_for_p = onehot_encoded_for_p.drop(['Female'],axis=1)
onehot_encoded_for_p.head()

Unnamed: 0,0,1
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,0.0,0.0
4,0.0,0.0


In [49]:
X_for_p_normalized = MinMaxtransformer.transform(X_for_p_num)
X_for_p_normalized = pd.DataFrame(X_for_p_normalized,columns=X_for_p_num.columns)

#merge back all of our labels
X_for_p = pd.concat([X_for_p_normalized, onehot_encoded_for_p], axis=1)

X_for_p.head()

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,0,1
0,0.014413,0.105658,0.755274,0.516885,1.0,0.0
1,0.031802,0.215326,0.731929,0.636063,1.0,0.0
2,0.028675,0.167722,0.524809,0.557843,1.0,0.0
3,0.005779,0.065672,0.346058,0.473354,0.0,0.0
4,0.037386,0.178399,0.778916,0.751127,0.0,0.0


In [50]:
X_for_p.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AVGGIFT,31.0,0.020204,0.012278,0.004443,0.010554,0.018029,0.025102,0.050787
HV1_log,31.0,0.134827,0.04695,0.065672,0.099711,0.122678,0.161922,0.280329
IC1_transformed,31.0,0.596869,0.13713,0.343892,0.49345,0.578653,0.69355,0.967862
IC5_transformed,31.0,0.538942,0.101441,0.387075,0.454441,0.526007,0.618455,0.751127
0,31.0,0.516129,0.508001,0.0,0.0,1.0,1.0,1.0
1,31.0,0.032258,0.179605,0.0,0.0,0.0,0.0,1.0


In [51]:
#predict and inspect results
results_for_p = lm.predict(X_for_p)

pd.concat([data_for_p,pd.Series(results_for_p, name='estimate')],axis=1).head()

TypeError: Feature names are only supported if all input features have string names, but your input has ['int', 'str'] as feature name / column name types. If you want feature names to be stored and validated, you must convert them all to strings, by using X.columns = X.columns.astype(str) for example. Otherwise you can remove feature / column names from your input data, or convert them all to a non-string data type.

# Lesson 3 : metrics

In [71]:
from sklearn.metrics import mean_absolute_error, r2_score

In [72]:
mae = mean_absolute_error(y_test, predictions_test)
print(mae)

3.434956878623416


In [73]:
rmse = math.sqrt(mse)
print(rmse)

4.559616126770763


In [74]:
r2 = r2_score(y_test, predictions_test)
r2

0.7835732034742453

In [None]:
################# ----------------------- ##################
# below are some code snippets that I have used in the past to demonstrate some functions 
# related to encoder. We'll probably not use these today.
df = pd.DataFrame([['a','c'],['b','d'],['a','e']],columns=['col1','col2'])

In [None]:
df

In [None]:
encoder = OneHotEncoder().fit(df)
encoded_for_p = encoder.transform(df).toarray()
encoded_for_p
display(encoder.categories_)
cols = [colname for row in encoder.categories_ for colname in row]
display(cols)
onehot_encoded_for_p = pd.DataFrame(encoded_for_p, columns=cols)
onehot_encoded_for_p
cols_to_drop = [row[0] for row in encoder.categories_]
cols_to_drop
onehot_encoded_for_p = onehot_encoded_for_p.drop(cols_to_drop,axis=1)
onehot_encoded_for_p.head()

In [None]:
cols=[]
for row in encoder.categories_:
    for colname in row:
        cols.append(colname)
cols

In [None]:
# even simpler, without the list comprehensions:
encoder2 = OneHotEncoder(drop='first').fit(df)
encoded_for_p2 = encoder2.transform(df).toarray()
encoded_for_p2
cols = encoder2.get_feature_names(input_features=df.columns)
cols
# # Note: in version 1.0 and higher of sklearn this method is called 'get_feature_names_out()'
# #cols
onehot_encoded_for_p2 = pd.DataFrame(encoded_for_p2, columns=cols)
onehot_encoded_for_p2.head()