## INTRO TO MACHINE LEARNING

In [1]:
import pandas as pd

melb_file_path = '/home/vahid/Documents/ML_Learning/melb_data.csv'
melb_data = pd.read_csv(melb_file_path)
melb_data.describe()
list(melb_data.columns)
filtered_melb_data = melb_data.dropna(axis=0)

In [2]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

y = filtered_melb_data.Price
melb_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 
                        'YearBuilt', 'Lattitude', 'Longtitude']
X = filtered_melb_data[melb_features]

melb_model = DecisionTreeRegressor(random_state=1)
melb_model.fit(X,y)

print("Making prediction for the following 5 houses:")
print(X.head())
print("\nthe prediction are: ")
print(melb_model.predict(X.head()))

predicted_home_prices = melb_model.predict(X)
mean_absolute_error(y, predicted_home_prices)

# this kind of modeling has a problem. because we've used the same data 
# for fitting our model and evaluating our model. 
# we need to split our datas to two different part, one for fitting or 
# trainnig data and another part for evaluating how good our model works. 
# in the next cell we go through that. 

Making prediction for the following 5 houses:
   Rooms  Bathroom  Landsize  BuildingArea  YearBuilt  Lattitude  Longtitude
1      2       1.0     156.0          79.0     1900.0   -37.8079    144.9934
2      3       2.0     134.0         150.0     1900.0   -37.8093    144.9944
4      4       1.0     120.0         142.0     2014.0   -37.8072    144.9941
6      3       2.0     245.0         210.0     1910.0   -37.8024    144.9993
7      2       1.0     256.0         107.0     1890.0   -37.8060    144.9954

the prediction are: 
[1035000. 1465000. 1600000. 1876000. 1636000.]


434.71594577146544

In [3]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y =train_test_split(X, y, random_state= 0)
melb_model.fit(train_X, train_y)

val_predictions = melb_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

# we splitted our data and we can see how much our mae changed. 
# it's true that the last one has really low mae but it is not true
# this one say us more trues about how our model is good. 

262494.3027759845


In [6]:
# Overfitting and underfitting: 
# these two terms related to how deep our tree is. 
# and the deepest tree has more leaf, so we need to choose the best 
# number for the leaves: 

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y,preds_val)
    return(mae)

my_mae = list()
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae.append(get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y))




[347380.33833344496,
 258171.21202406782,
 243495.96361790417,
 254983.64299548094]

In [7]:
# Randon Forest: 
# in decission tree we use only one tree and that tree may be over 
# or under fitted. but here we use a lot of tree and make prediction 
# based on that. 

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))

191669.7536453626


## Intermediate Machine Learning

In [16]:
# reading the data 
#and defineing trainning and val and test data: 

import pandas as pd
from sklearn.model_selection import train_test_split

# reading the data: 
file_path = '/home/vahid/Documents/ML_Learning/train.csv'
X_full = pd.read_csv(file_path, index_col='Id')
X_test_full = pd.read_csv(file_path, index_col='Id')

# obtain target(dependent variable) and predictors(independent variables):
y = X_full.SalePrice
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = X_full[features].copy()
X_test = X_test_full[features].copy()

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

X_train.head()

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
619,11694,2007,1828,0,2,3,9
871,6600,1962,894,0,1,2,5
93,13360,1921,964,0,1,2,5
818,13265,2002,1689,0,2,3,7
303,13704,2001,1541,0,2,3,6


In [17]:
# here we use randomForest model
# to has prediction about future datas. 

from sklearn.ensemble import RandomForestRegressor

# we defining several model of RandomForest with different parameters
# to find the best sets of parameters: 

# like decission tree, here we may face over and under fitting
# so we change our parameters to find the best. 
# n_stimators give us the number of tree in the forest. 

model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]


from sklearn.metrics import mean_absolute_error

# Function for comparing different models
# notice: here in the parameters of the function, we put special 
# amount of data to each of the X_t, X_v , ... 
# so, in the future for calling the function we dont need to define 
# these values, we only call function with parameters of model. 

def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))
    
    
# so, untill now we find the best parameters for randomforest model. 
# now, we have the best model to predict the future data. 
# now, we fit our model with the whole data, X, y 
# here I have a question??? do we need fit our model with the whole data? 
# ????????????????????????????????????????
# in my_model2 I fit the model with training data.
#it give the same results. 

my_model = model_3
my_model.fit(X, y)

my_model2 = model_3
my_model2.fit(X_train, y_train)

# Generate test predictions
preds_test = my_model.predict(X_test)
preds_test2 = my_model2.predict(X_test)

# finding out how good our model is by calculating R2
# but I have the previous problem yet, 
# is it correct to fit model with the whole data???????????
# ?????????????????????????????????????????????????????????
from sklearn.metrics import r2_score

r2 = r2_score(y, preds_test)
r22 = r2_score(y, preds_test2)

print('R2 for fitting model with the whole data = ',r2)
print('R2 for fitting model with training data = ',r22)


Model 1 MAE: 24015
Model 2 MAE: 23740


  warn(


Model 3 MAE: 23528
Model 4 MAE: 23996
Model 5 MAE: 23706


  warn(
  warn(


R2 for fitting model with the whole data =  0.9332040782440739
R2 for fitting model with training data =  0.9332040782440739


In [49]:
## Missing values: 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Load the data
file_path = '/home/vahid/Documents/ML_Learning/melb_data.csv'
data = pd.read_csv(file_path)

# Select target
y = data.Price

# To keep things simple, we'll use only numerical predictors
melb_predictors = data.drop(['Price'], axis=1)
X = melb_predictors.select_dtypes(exclude=['object'])

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)


# Function for comparing different approaches;
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)


## Approach 1 (Drop Columns with Missing Values):

# Get names of columns with missing values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]
print('columns with missing values: ',cols_with_missing) 

# dropping columns in training and validation data: 
# be carefull to drop columns in both training and validation data: 
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

print("\nMAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))



## Approach 2 (Imputation):
from sklearn.impute import SimpleImputer

# Imputation:
# we use SimpleImputer to replace missing values..
# with the mean value along each column.
my_imputer = SimpleImputer()

# SimpleImputer() give us result in numpy.ndarray
# so we use pd.DataFrame to have pandas format of datas. 
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
# now we need transform, because we've fitted our data in previous step.
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print("\nMAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))



## Approach 3 (An Extension to Imputation):
# we impute the missing values,...
# while also keeping track of which values were imputed.

# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Imputation:
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

print("\nMAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))





columns with missing values:  ['Car', 'BuildingArea', 'YearBuilt']

MAE from Approach 1 (Drop columns with missing values):
183550.22137772635

MAE from Approach 2 (Imputation):
178166.46269899711

MAE from Approach 3 (An Extension to Imputation):
178927.503183954


In [57]:
## Categorical Variables: 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

## preaparing data: 
# Load the data
file_path = '/home/vahid/Documents/ML_Learning/melb_data.csv'
data = pd.read_csv(file_path)
# Separate target from predictors
y = data.Price
X = data.drop(['Price'], axis=1)
# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
# Drop columns with missing values (simplest approach)
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()] 
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique()<10 and X_train_full[cname].dtype=="object"]
# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
# Get list of categorical variables
s = (X_train.dtypes =='object')
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)


## Three Aproach of dealing with Categorical Variables: 

#Approach 1 (Drop Categorical Variables):
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
print("\nMAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))


#Approach 2 (Ordinal Encoding):
from sklearn.preprocessing import OrdinalEncoder
# Make copy to avoid changing original data 
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()
# Apply ordinal encoder to each column with categorical data:
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])
print("\nMAE from Approach 2 (Ordinal Encoding):") 
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))


#Approach 3 (One-Hot Encoding):
from sklearn.preprocessing import OneHotEncoder
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))
# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))



Categorical variables:
['Type', 'Method', 'Regionname']

MAE from Approach 1 (Drop categorical variables):
175703.48185157913

MAE from Approach 2 (Ordinal Encoding):
165936.40548390493
MAE from Approach 3 (One-Hot Encoding):




166089.4893009678




In [59]:
## Pipelines:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

## preaparing data: 
# Load the data
file_path = '/home/vahid/Documents/ML_Learning/melb_data.csv'
data = pd.read_csv(file_path)
# Separate target from predictors
y = data.Price
X = data.drop(['Price'], axis=1)
# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]
# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

## Step 1: Define Preprocessing Steps:
# Preprocessing for numerical data: 
numerical_transformer = SimpleImputer(strategy='constant')
# Preprocessing for categorical data: 
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore'))])
# Bundle preprocessing for numerical and categorical data:
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols),('cat', categorical_transformer, categorical_cols)])
## Step 2: Define the Model: 
model = RandomForestRegressor(n_estimators=100, random_state=0)
## Step 3: Create and Evaluate the Pipeline: 
# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)
# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 160679.18917034855


In [3]:
## Cross-Validation:

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## preaparing data: 
# Load the data
file_path = '/home/vahid/Documents/ML_Learning/melb_data.csv'
data = pd.read_csv(file_path)
# Select subset of predictors
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]
# Select target
y = data.Price

# defining a pipeline for preprocessing and modeling: 
my_pipeline = Pipeline(steps=[('preprocessor', SimpleImputer()), ('model', RandomForestRegressor(n_estimators=50, random_state=0))])

# cross-validation process: 
from sklearn.model_selection import cross_val_score
# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
print("MAE scores:\n", scores)
### NOTICE: 
# Scikit-learn has a convention where all metrics are defined...
# so a high number is better. Using negatives here allows them...
# to be consistent with that convention,...
# though negative MAE is almost unheard of elsewhere....

# we want a single measure of model quality so: 
print("\nAverage MAE score (across experiments):")
print(scores.mean())

### IMPORTANT NOTICE: 
# Using cross-validation yields a much better measure of model quality,
# with the added benefit of cleaning up our code:
## note that we no longer need to keep track of separate
# training and validation sets.
# So, especially for small datasets, it's a good improvement!


MAE scores:
 [301628.7893587  303164.4782723  287298.331666   236061.84754543
 260383.45111427]

Average MAE score (across experiments):
277707.3795913405


In [2]:
## XGBoost: 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

## preaparing data: 
# Load the data
file_path = '/home/vahid/Documents/ML_Learning/melb_data.csv'
data = pd.read_csv(file_path)
# Select subset of predictors
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]
# Select target
y = data.Price
# Separate data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

##XGBoost model: 
from xgboost import XGBRegressor
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4)
my_model.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)], 
             verbose=False)
predictions = my_model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))




Mean Absolute Error: 234738.34581645802


In [4]:
## Data Leakage:
# There are two main types of leakage:
# target leakage and train-test contamination.
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Read the data
data_path = '/home/vahid/Documents/ML_Learning/AER_credit_card_data.csv'
data = pd.read_csv(data_path, true_values=['yes'], false_values=['no'])
# Select target
y = data.card
# Select predictors
X = data.drop(['card'], axis=1)
print("Number of rows in the dataset:", X.shape[0])
X.head()

# Since there is no preprocessing, we don't need a pipeline
# (used anyway as best practice!)
my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100))
cv_scores = cross_val_score(my_pipeline, X, y, cv=5, scoring='accuracy')
print("Cross-validation accuracy: %f" % cv_scores.mean())

expenditures_cardholders = X.expenditure[y]
expenditures_noncardholders = X.expenditure[~y]

print('Fraction of those who did not receive a card and had no expenditures: %.2f' \
      %((expenditures_noncardholders == 0).mean()))
print('Fraction of those who received a card and had no expenditures: %.2f' \
      %(( expenditures_cardholders == 0).mean()))

# Drop leaky predictors from dataset
potential_leaks = ['expenditure', 'share', 'active', 'majorcards']
X2 = X.drop(potential_leaks, axis=1)

# Evaluate the model with leaky predictors removed
cv_scores = cross_val_score(my_pipeline, X2, y, 
                            cv=5,
                            scoring='accuracy')

print("Cross-val accuracy: %f" % cv_scores.mean())

Number of rows in the dataset: 1319
Cross-validation accuracy: 0.980294
Fraction of those who did not receive a card and had no expenditures: 1.00
Fraction of those who received a card and had no expenditures: 0.02
Cross-val accuracy: 0.832446


In [2]:
from sklearn.metrics import mean_absolute_error

y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
mean_absolute_error(y_true, y_true)

0.0