# From Previous Lesson

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import linear_model

In [2]:
# This data has already been treated with some log transformations, as seen in the previous lesson.
# NOTE(review): the preview below shows an 'Unnamed: 0' column that counts 0, 1, 2, ... —
# it looks like a row index saved with the CSV rather than a real feature; confirm.
data = pd.read_csv('regression_data_treated.csv')
data.head()

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,gender,TARGET_D
0,15.5,9.510467,17.343389,4.181353,Male,21.0
1,3.08,6.45859,16.230984,4.150313,Male,3.0
2,7.5,8.780623,18.047227,4.205057,Female,20.0
3,6.7,6.200492,11.73711,4.055333,Male,5.0
4,8.785714,7.477123,12.494862,4.088969,Female,10.0


In [None]:
# Separate the label (TARGET_D, the value we want to predict) from the features.
y = data['TARGET_D']
# 'Unnamed: 0' is the row counter that was written out together with the CSV
# (0, 1, 2, ... in the preview above). It says nothing about the donor, so it
# must not be fed to the model as a numerical feature.
# errors='ignore' keeps this cell working when that index column is not present.
X = data.drop(columns=['TARGET_D', 'Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first rows of the feature table to check that the label column is gone.
X.head()

In [None]:
# Numerical and categorical features need different preprocessing, so split X by dtype:
# object (string) columns will be encoded, numeric columns will be scaled.
X_cat = X.select_dtypes(include=object)
X_num = X.select_dtypes(include=np.number)

In [None]:
# List the distinct values found in the 'gender' column.
X_cat['gender'].unique()

In [None]:
# Normalizing and Standardizing data - only for numerical variables

In [None]:
# Summary statistics of the numerical features, transposed so that each feature is one row.
# Useful for comparing the ranges of the columns before scaling.
X_num.describe().T

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Normalizing: learn each numerical column's min and max, then rescale the column with them.
# The fitted scaler is kept in MinMaxtransformer so the same scaling can be re-applied later.
MinMaxtransformer = MinMaxScaler()
MinMaxtransformer.fit(X_num)
# transform() returns a plain array, so wrap it in a DataFrame to restore the column names.
X_normalized = pd.DataFrame(MinMaxtransformer.transform(X_num), columns=X_num.columns)
print(X_normalized.shape)
X_normalized.head()

In [None]:
# Summary statistics after min-max scaling, one row per feature.
# With the scaler's default settings every column is expected to have min 0 and max 1.
X_normalized.describe().T

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardizing: learn each numerical column's mean and standard deviation, then rescale with them.
# The fitted scaler is kept in Standardtransformer so the same scaling can be re-applied later.
Standardtransformer = StandardScaler()
Standardtransformer.fit(X_num)
# transform() returns a plain array, so wrap it in a DataFrame to restore the column names.
X_standardized = pd.DataFrame(Standardtransformer.transform(X_num), columns=X_num.columns)
print(X_standardized.shape)
X_standardized.head()

In [None]:
# Summary statistics after standardization, one row per feature.
# Every column is expected to have a mean of about 0 and a standard deviation of about 1.
X_standardized.describe().T

# Lesson 1.08

In [None]:
# Activity 1
# Discussion on categorical variables: how would you currently handle categories? Should you have many or few?

# Lesson 1 : categorical variables

In [None]:
# Select the categorical (object dtype) columns again, this time straight from `data`.
X_cat = data.select_dtypes(include = object)

In [None]:
# In this case there is only one categorical variable ('gender'); preview its first rows.
X_cat.head()

In [None]:
# Count how many rows fall into each gender category.
X_cat['gender'].value_counts()

In [None]:
# One-hot encoding turns a categorical variable into several 0/1 numerical columns.
from sklearn.preprocessing import OneHotEncoder
# drop='first' leaves out the first category of every feature: that column can be deduced
# from the remaining ones, so keeping it would only add a redundant feature.
# (Use OneHotEncoder() without `drop` to keep one column per category.)
encoder = OneHotEncoder(drop='first').fit(X_cat)
print(encoder.categories_)
encoded = encoder.transform(X_cat).toarray()
print(encoded)
# Derive the column labels from the fitted categories instead of hard-coding ['Male', 'U'],
# so the labels always match the encoded columns even if the data contains other values.
# cats[1:] skips the first category of each feature, which is the one drop='first' removed.
onehot_columns = [cat for cats in encoder.categories_ for cat in cats[1:]]
onehot_encoded = pd.DataFrame(encoded, columns=onehot_columns)
onehot_encoded.head(20)

In [None]:
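# Because one of the dummy columns can always be deduced from the others, there is no point in keeping all of them.
# OneHotEncoder(drop='first') above already leaves out the first category ('Female'),
# so the manual drop below is only needed when encoding without `drop`:
# onehot_encoded = onehot_encoded.drop(['Female'],axis=1)
# onehot_encoded.head()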

In [None]:
# Label encoding keeps just one column and replaces every category with an integer code.
# Watch out: the codes introduce an ordering the categories may not really have.
# If you want to control which number gets assigned to which category, use OrdinalEncoder.
from sklearn.preprocessing import LabelEncoder
# LabelEncoder is a 1-D transformer, so pass the 'gender' column (a Series), not the whole frame.
# The codes follow the sorted order of the labels, NOT their value counts.
label_encoded = LabelEncoder().fit_transform(X_cat['gender'])
label_encoded = pd.DataFrame(label_encoded, columns=['gender'])
display(label_encoded.head(20))
label_encoded['gender'].value_counts()

In [None]:
# Activity 2
# do it yourself. Which one do you think is more appropriate here?

# Lesson 2 : train-test split

In [None]:
# Put the scaled numerical columns and the one-hot columns side by side, giving a single
# feature table in which every column is numerical.
# (np.concatenate() would do the same on plain arrays.)
X = pd.concat((X_normalized, onehot_encoded), axis='columns')

In [None]:
# Preview the merged feature table (scaled numerical columns followed by the one-hot columns).
X.head()

In [None]:
# The label is the TARGET_D column of the original data.
y = data.loc[:, 'TARGET_D']
y.head()

In [None]:
# The train/test split is what makes an ML result credible:
# the model is built on one portion of the data and then validated on another,
# "fresh" portion that it never saw during training.
# The model therefore has no opportunity to "cheat": it has to predict the values
# of the fresh rows accurately.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so reruns give the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Print the shape of each piece of the split, in the order X_train, X_test, y_train, y_test.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
# Preview the training features; the index shows which original rows landed in the training set.
X_train.head()

In [None]:
# Preview the training labels; their index should match the rows of X_train shown above.
y_train.head()

In [None]:
# Train (fit) a linear regression as in the previous lesson, using only the training rows.
lm = linear_model.LinearRegression()
lm.fit(X=X_train, y=y_train)



In [None]:
from sklearn.metrics import r2_score

# R2 on the rows the model was trained on: how well it reproduces data it has already seen.
predictions = lm.predict(X_train)
r2_score(y_true=y_train, y_pred=predictions)

In [None]:
# Now score the model on the TEST portion of the data, which was not used for fitting.
# A good score here means the model is genuinely predicting donations,
# not just repeating the values it saw in the training data.
predictions_test = lm.predict(X_test)
r2_score(y_true=y_test, y_pred=predictions_test)

In [None]:
# First five true donation values of the test set.
y_test[:5]

In [None]:
# First five predicted values for the test set, to compare with the true values.
predictions_test[:5]

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test set (in squared units of the target).
mse = mean_squared_error(y_true=y_test, y_pred=predictions_test)
mse

In [None]:
# Root mean squared error: the square root brings the error back to the units of the target.
test_mse = mean_squared_error(y_test, predictions_test)
rmse = np.sqrt(test_mse)
rmse

In [None]:
# Average true donation in the test set — a reference point for judging the size of the RMSE.
y_test.mean()

In [None]:
# to make predictions on the new data, 
# we have to process the data (X features) in the same way. 

In [None]:
# Activity 3a
# take the file "data_to_predict" and find out how much those donors could be worth
# watch out: is the data in the same format as what your training data currently looks like?


In [None]:
#making the new data look like the transformed one may not be the most immediate exercise...




# Lesson 3 : metrics

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# Mean absolute error on the test set: the average size of the prediction error.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions_test)
print(mae)

In [None]:
# RMSE is the square root of the MSE; this reuses `mse` computed in an earlier cell.
rmse = math.sqrt(mse)
print(rmse)

In [None]:
# R2 on the test set, stored so it can be reported next to the other metrics.
r2 = r2_score(y_true=y_test, y_pred=predictions_test)
r2

In [None]:

# Small toy frame with two categorical columns, used to practise one-hot encoding.
df = pd.DataFrame({
    'col1': ['a', 'b', 'a'],
    'col2': ['c', 'd', 'e'],
})

In [None]:
# Display the toy frame.
df

In [None]:
# Fit a plain one-hot encoder on the toy frame and drop the first category of every column by hand.
encoder = OneHotEncoder()
encoder.fit(df)
encoded_for_p = encoder.transform(df).toarray()
display(encoder.categories_)
# Flatten the per-column category arrays into one list of output column names.
# NOTE(review): the names are the bare category values, so two input columns sharing
# a category label would produce duplicate column names — fine for this toy frame.
cols = [label for categories in encoder.categories_ for label in categories]
display(cols)
onehot_encoded_for_p = pd.DataFrame(encoded_for_p, columns=cols)
# The first category of each input column is the one that gets dropped.
cols_to_drop = [categories[0] for categories in encoder.categories_]
onehot_encoded_for_p = onehot_encoded_for_p.drop(columns=cols_to_drop)
onehot_encoded_for_p.head()

In [None]:
# Same flattening as the list comprehension, written as an explicit loop:
# walk over the category array of each input column and collect every label.
cols = []
for categories in encoder.categories_:
    cols.extend(categories)
cols

In [None]:
# Even simpler, without the list comprehensions: let the encoder drop the first category
# of every column and generate the output column names itself.
encoder2 = OneHotEncoder(drop='first').fit(df)
encoded_for_p2 = encoder2.transform(df).toarray()
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed the old
# get_feature_names(), so prefer the new method and fall back only on old installations.
if hasattr(encoder2, 'get_feature_names_out'):
    cols = encoder2.get_feature_names_out(df.columns)
else:
    cols = encoder2.get_feature_names(input_features=df.columns)
onehot_encoded_for_p2 = pd.DataFrame(encoded_for_p2, columns=cols)
onehot_encoded_for_p2.head()