# Explore and prepare the data

In [None]:
# This part comes after the data-wrangling lesson.
# Goal: transform the dataset so that it is ready for the algorithms that are going to learn from the data.
# Goal of the dataset: predict whether a passenger survives or not based on the other variables.

    # We focus on a classification problem, but all of this is also valid when working with other algorithms.
    
# We use the Titanic dataset: https://moodle.upm.es/titulaciones/oficiales/course/view.php?id=9326
    # train.csv: training dataset.
    # test.csv: testing dataset.
    
# For today we will work only with train.csv.


# 1. Libraries:
import pandas as pd

# 2. Read the CSV file, using the PassengerId column as the row index:
df = pd.read_csv('train.csv', index_col='PassengerId')
# The file is expected in the working directory, so the path is just 'train.csv'.
df

# 3. Understand the attributes/variables of the dataset.
    # SibSp: siblings and spouses on board (understand it from the data or from the dataset description).
    # Parch: presumably parents and children on board (confirm against the dataset description).
       # Example: in row 889 Age is NaN, so from SibSp and Parch alone we cannot tell whether the passenger
       # is a child travelling with two parents or an adult travelling with two children.
    # Simply removing rows with missing values is not always the best option
       # (we can lose information if we delete too many rows), so the missing values are imputed later on.
    
    # Ticket: ticket number.
    # Fare: how much the passenger paid.
    # Cabin: cabin of the passenger.
    # Embarked: port where the passenger embarked.
    
    # Keep in mind the distinction between categorical variables and numerical variables.
    
    
    


    
    

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# First step towards imputing missing values: find out where they are.
# The element-wise mask has one boolean per cell, which is too verbose to be a useful summary...
missing_mask = df.isna()
# ...so collapse it with any(): one boolean per column, True when the column has at least one missing value.
missing_mask.any()

Survived    False
Pclass      False
Name        False
Sex         False
Age          True
SibSp       False
Parch       False
Ticket      False
Fare        False
Cabin        True
Embarked     True
dtype: bool

In [None]:
# Age, Cabin and Embarked contain missing values, so they need some treatment.

# Which variables are not relevant?
    # To discard a variable we must be sure that it is irrelevant for our goal
      # (predict whether the passenger survives or not based on the other variables).

        # Is the name relevant? It is free text, so exploiting it would require NLP;
          # the effort does not compensate for the information we could extract from it in this case.
          # Categorical values can only take a few values (like an enum). Columns that hold text but are
          # not categorical (free text, or a mix of letters and numbers) are text values and would need NLP.

        # Therefore we remove Name, Ticket and Cabin.

# drop() returns a new dataframe (the alternative, pop(), removes one column in place and returns it).
# errors='ignore' keeps this cell re-runnable: running it a second time does not raise a KeyError.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')
df.head()

PassengerId
1       NaN
2       C85
3       NaN
4      C123
5       NaN
       ... 
887     NaN
888     B42
889     NaN
890    C148
891     NaN
Name: Cabin, Length: 891, dtype: object

In [None]:
# Split the dataframe into categorical (text) columns and numerical columns.
# Columns are classified by "is the dtype numeric?" instead of `dtype == object`:
# depending on the pandas version, text columns may be given a dedicated string dtype
# that does not compare equal to `object`, which would leave them in the numerical part.
cat_mask = ~df.dtypes.map(pd.api.types.is_numeric_dtype).astype(bool)
# Names of the columns that fulfil the mask (the non-numerical ones).
cat_cols = df.columns[cat_mask].tolist()
# Only the categorical columns.
df_cat = df[cat_cols]

# Everything else is numerical.
df_num = df.drop(columns=cat_cols)
# df_cat now holds the categorical columns and df_num the numerical ones.



In [None]:

from sklearn.impute import SimpleImputer
# A SimpleImputer fills missing values with a statistical summary of each variable.
# What if we do not know the port where a person embarked?
    # A more elaborate option is a classification method that predicts it from the other variables.
    # Here we use the simplest approach: the most frequent value of the column.

imp_cat = SimpleImputer(strategy='most_frequent')

# fit_transform learns the most frequent value of each column and fills the gaps with it.
# The result is a plain array, so it is wrapped again to keep the column names and the index.
cat_imputed = imp_cat.fit_transform(df_cat)
df_cat = pd.DataFrame(cat_imputed, index=df_cat.index, columns=df_cat.columns)
df_cat

Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,male,S
2,female,C
3,female,S
4,female,S
5,male,S
...,...,...
887,male,S
888,female,S
889,female,S
890,male,C


In [None]:
# Sanity check: after imputation no categorical column should report missing values.
df_cat.isna().any()


Sex         False
Embarked    False
dtype: bool

In [None]:
# Most algorithms cannot work with text values, so we transform them into numerical values.
# We use one LabelEncoder per column (each column gets its own transformation),
# stored in a dictionary keyed by column name.

from sklearn import preprocessing
from collections import defaultdict

# The defaultdict creates a fresh LabelEncoder the first time a column name is requested
# (here: one entry for Sex and one for Embarked).
d = defaultdict(preprocessing.LabelEncoder)


def _fit_encode(col):
    """Fit the encoder registered under the column's name and return the encoded column."""
    encoder = d[col.name]
    return encoder.fit_transform(col)


# apply() calls the helper once per column, so every column is encoded by its own encoder.
df_cat_le = df_cat.apply(_fit_encode)

# This is enough to transform our data.
df_cat_le


Unnamed: 0_level_0,Sex,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
2,0,0
3,0,2
4,0,2
5,1,2
...,...,...
887,1,2
888,0,2
889,0,2
890,1,0


In [None]:
# we transform text and categorical values in numerical values. 
    # but which is the problem? 
# we are introducing an order that before not existed. 

'''
If we consider: 
port A codified as 0
port B codified as 1
port C codifed as 2

If we treat them as numerical values we are introducing an order that was not desired. 
For solve this we need one-hot encoding. 

C  M  S
1  0  0
0  1  0
0  0  1

Doing this we are not introducing an order. 

It depends on the data if the data have an order decided then is correct to use label encoding: 
Example: 

First. 0
Second. 1
Third. 2

Here the order is desired so it makes sense to use label encoding rather than one-hot encoding. 

'''

# To avoid the artificial order we will switch to a different encoder, the "one-hot encoder".
# First undo the label encoding, using the encoder that was fitted for each column.
def _decode(col):
    """Map an encoded column back to its original labels with the encoder stored under its name."""
    return d[col.name].inverse_transform(col)


aux = df_cat_le.apply(_decode)

In [None]:
# One-hot encoding.
# draw 1 OneNote.
# Disadvantage: each variable becomes several columns of which only one can be 1, but that is better
   # than introducing an order into a variable that has none (the previous transformation).

# Strictly we could split this df in two: label encoding for Sex and one-hot encoding for Embarked.
# For simplicity we apply the one-hot encoder to every categorical column.
# The encoder is left with its default (sparse) output and converted with .toarray(), and the column
# names come from get_feature_names_out(); the older `sparse=` argument and `get_feature_names()`
# method are not available in recent scikit-learn releases.
ohe = preprocessing.OneHotEncoder()
df_cat_ohe = pd.DataFrame(ohe.fit_transform(df_cat).toarray(),
            columns=ohe.get_feature_names_out(df_cat.columns.tolist()),
            index=df_cat.index) # keep the original row index; the columns become <variable>_<category>.
# get_feature_names_out: returns the names of the generated columns.



In [None]:
# Display the one-hot encoded categorical features (one indicator column per category).
df_cat_ohe


Unnamed: 0_level_0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0
5,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...
887,0.0,1.0,0.0,0.0,1.0
888,1.0,0.0,0.0,0.0,1.0
889,1.0,0.0,0.0,0.0,1.0
890,0.0,1.0,1.0,0.0,0.0


In [None]:
# The categorical variables are transformed; now the numerical ones are treated.
# More elaborate options exist (e.g. predicting the missing value from the other variables),
# but here each missing value is simply replaced by the mean of its column.
imp_num = SimpleImputer(strategy='mean')
# The imputer returns a plain array, so rebuild the dataframe with the original labels.
num_imputed = imp_num.fit_transform(df_num)
df_num = pd.DataFrame(num_imputed, index=df_num.index, columns=df_num.columns)
df_num



Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,3.0,22.000000,1.0,0.0,7.2500
2,1.0,1.0,38.000000,1.0,0.0,71.2833
3,1.0,3.0,26.000000,0.0,0.0,7.9250
4,1.0,1.0,35.000000,1.0,0.0,53.1000
5,0.0,3.0,35.000000,0.0,0.0,8.0500
...,...,...,...,...,...,...
887,0.0,2.0,27.000000,0.0,0.0,13.0000
888,1.0,1.0,19.000000,0.0,0.0,30.0000
889,0.0,3.0,29.699118,1.0,2.0,23.4500
890,1.0,1.0,26.000000,0.0,0.0,30.0000


In [None]:
# In row 889 the missing Age has been replaced by the column mean.
# Sanity check: no numerical column should report missing values any more.
df_num.isna().any()


Survived    False
Pclass      False
Age         False
SibSp       False
Parch       False
Fare        False
dtype: bool

In [None]:
# The dataframe was split into categorical and numerical parts; merge them back into one.
# A merge needs to know which key pairs up the rows: here it is PassengerId, the index of both
# frames (i.e. the values of PassengerId=1 on the left are combined with those of PassengerId=1
# on the right).
df_preprocessed = pd.merge(df_cat_ohe, df_num, on='PassengerId')
df_preprocessed

Unnamed: 0_level_0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,1.0,0.0,0.0,1.0,0.0,3.0,22.000000,1.0,0.0,7.2500
2,1.0,0.0,1.0,0.0,0.0,1.0,1.0,38.000000,1.0,0.0,71.2833
3,1.0,0.0,0.0,0.0,1.0,1.0,3.0,26.000000,0.0,0.0,7.9250
4,1.0,0.0,0.0,0.0,1.0,1.0,1.0,35.000000,1.0,0.0,53.1000
5,0.0,1.0,0.0,0.0,1.0,0.0,3.0,35.000000,0.0,0.0,8.0500
...,...,...,...,...,...,...,...,...,...,...,...
887,0.0,1.0,0.0,0.0,1.0,0.0,2.0,27.000000,0.0,0.0,13.0000
888,1.0,0.0,0.0,0.0,1.0,1.0,1.0,19.000000,0.0,0.0,30.0000
889,1.0,0.0,0.0,0.0,1.0,0.0,3.0,29.699118,1.0,2.0,23.4500
890,0.0,1.0,1.0,0.0,0.0,1.0,1.0,26.000000,0.0,0.0,30.0000


In [None]:
# then we have the same df we had at the first step but without missing values. 

In [None]:
# Feature selection: keep the variables that are most related to the target (Survived).
# There are methods that score every feature against the target for us.

# Separate the target from the features. pop() removes the column from df_preprocessed in place,
# so it is guarded: re-running this cell must not raise a KeyError nor lose class_col.
if 'Survived' in df_preprocessed.columns:
    class_col = df_preprocessed.pop('Survived')

from sklearn.feature_selection import SelectKBest # keeps the k features with the best score.
# chi2 is the scoring function: a chi-square test between each feature and the target.
from sklearn.feature_selection import chi2

fs_k_best_chi2 = SelectKBest(chi2, k=4) # select the four most relevant variables for the problem
# class_col is the target variable (supervised learning). This only fits the selector.
fs_k_best_chi2.fit(df_preprocessed, class_col)
# Boolean mask over the columns: True for the columns that are kept.
col_filter = fs_k_best_chi2.get_support()
print(col_filter)

[ True  True False False False  True False False False  True]


In [None]:
# Apply the boolean support mask to keep only the selected columns:
# these are the most relevant columns according to the chi-square test.
df_k_best_chi2 = df_preprocessed.loc[:, col_filter]
df_k_best_chi2

Unnamed: 0_level_0,Sex_female,Sex_male,Pclass,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,1.0,3.0,7.2500
2,1.0,0.0,1.0,71.2833
3,1.0,0.0,3.0,7.9250
4,1.0,0.0,1.0,53.1000
5,0.0,1.0,3.0,8.0500
...,...,...,...,...
887,0.0,1.0,2.0,13.0000
888,1.0,0.0,1.0,30.0000
889,1.0,0.0,3.0,23.4500
890,0.0,1.0,1.0,30.0000


In [None]:
# Select a set of relevant features with a second scoring method: mutual information.
from functools import partial
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# mutual_info_classif uses randomness internally, so without a fixed seed the selected columns
# can change from run to run. Fixing random_state makes the selection reproducible.
mi_score = partial(mutual_info_classif, random_state=1)
fs_k_best_mi = SelectKBest(mi_score, k=4) # select the four most relevant variables for the problem
# class_col is the target variable (supervised learning). This only fits the selector.
fs_k_best_mi.fit(df_preprocessed, class_col)
# Boolean mask over the columns: True for the columns that are kept.
col_filter = fs_k_best_mi.get_support()
print(col_filter)

# Keep only the selected columns:
df_k_best_mi = df_preprocessed.iloc[:, col_filter]
df_k_best_mi
# the most relevant columns according to mutual information

# TODO: investigate wrapper methods as a broader approach to feature selection.


[ True  True False False False  True False False False  True]


Unnamed: 0_level_0,Sex_female,Sex_male,Pclass,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,1.0,3.0,7.2500
2,1.0,0.0,1.0,71.2833
3,1.0,0.0,3.0,7.9250
4,1.0,0.0,1.0,53.1000
5,0.0,1.0,3.0,8.0500
...,...,...,...,...
887,0.0,1.0,2.0,13.0000
888,1.0,0.0,1.0,30.0000
889,1.0,0.0,3.0,23.4500
890,0.0,1.0,1.0,30.0000


In [None]:
'''
DOUBTS: 
How do you select the number or varables? trial and error.

The columns may be random selected (sklearn introduce a randomization) example: 
sometimes are "sex_female"
sex_male", "Pclass", "Fare" and other times are other variables. 
'''

# Learn the model:

From Slide 17. 

Hands on from slide 1 hands-on part


In [None]:
# Starting point for the modelling part: the preprocessed feature matrix.
df_preprocessed

Unnamed: 0_level_0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,1.0,0.0,0.0,1.0,3.0,22.000000,1.0,0.0,7.2500
2,1.0,0.0,1.0,0.0,0.0,1.0,38.000000,1.0,0.0,71.2833
3,1.0,0.0,0.0,0.0,1.0,3.0,26.000000,0.0,0.0,7.9250
4,1.0,0.0,0.0,0.0,1.0,1.0,35.000000,1.0,0.0,53.1000
5,0.0,1.0,0.0,0.0,1.0,3.0,35.000000,0.0,0.0,8.0500
...,...,...,...,...,...,...,...,...,...,...
887,0.0,1.0,0.0,0.0,1.0,2.0,27.000000,0.0,0.0,13.0000
888,1.0,0.0,0.0,0.0,1.0,1.0,19.000000,0.0,0.0,30.0000
889,1.0,0.0,0.0,0.0,1.0,3.0,29.699118,1.0,2.0,23.4500
890,0.0,1.0,1.0,0.0,0.0,1.0,26.000000,0.0,0.0,30.0000


In [None]:
# Slide 2: hold out 30% of the rows for testing; random_state fixes the shuffle so the split is reproducible.

from sklearn.model_selection import train_test_split

split = train_test_split(df_preprocessed, class_col, test_size=0.3, random_state=1)
x_train, x_test, y_train, y_test = split
x_train


Unnamed: 0_level_0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
115,1.0,0.0,1.0,0.0,0.0,3.0,17.000000,0.0,0.0,14.4583
875,1.0,0.0,1.0,0.0,0.0,2.0,28.000000,1.0,0.0,24.0000
77,0.0,1.0,0.0,0.0,1.0,3.0,29.699118,0.0,0.0,7.8958
877,0.0,1.0,0.0,0.0,1.0,3.0,20.000000,0.0,0.0,9.8458
675,0.0,1.0,0.0,0.0,1.0,2.0,29.699118,0.0,0.0,0.0000
...,...,...,...,...,...,...,...,...,...,...
716,0.0,1.0,0.0,0.0,1.0,3.0,19.000000,0.0,0.0,7.6500
768,1.0,0.0,0.0,1.0,0.0,3.0,30.500000,0.0,0.0,7.7500
73,0.0,1.0,0.0,0.0,1.0,2.0,21.000000,0.0,0.0,73.5000
236,1.0,0.0,0.0,0.0,1.0,3.0,29.699118,0.0,0.0,7.5500


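The rows come out in random (shuffled) order. 
The training set is a subset of the original dataset: with test_size=0.3 the test set gets 891 * 0.3 = 267.3, rounded up to 268 rows, which leaves 891 - 268 = 623 rows for training (891 * 0.7 = 623.7, rounded down).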

In [None]:
# Learn a decision tree (slide 3).

from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so construction and training can be chained.
tree = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)
# The tree is learnt, so we are ready to make predictions.
tree

DecisionTreeClassifier(random_state=1)

In [None]:
# Making predictions: ask the fitted tree for the class of every row of the test set.
y_pred = tree.predict(X=x_test)
y_pred

array([1., 0., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1.,
       0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0.,
       0., 1., 1., 0., 0.

In [None]:
# Show the expected classes followed by the predicted ones,
# to inspect where the algorithm makes mistakes.

print(y_test, y_pred, sep='\n')

PassengerId
863    1.0
224    0.0
85     1.0
681    0.0
536    1.0
      ... 
248    1.0
552    0.0
240    0.0
485    1.0
93     0.0
Name: Survived, Length: 268, dtype: float64
[1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0.
 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0.
 1. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0.
 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0.
 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1.
 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0.
 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0.
 0. 0. 0. 1.]


In [None]:
# Confusion matrix: rows are the true classes, columns are the predicted classes.

from sklearn.metrics import confusion_matrix

conf_counts = confusion_matrix(y_test, y_pred)
conf = pd.DataFrame(conf_counts,
                    index=["True 0", "True 1"],
                    columns=["Predicted 0", "Predicted 1"])
conf

Unnamed: 0,Predicted 0,Predicted 1
True 0,127,26
True 1,43,72


In [None]:
# Accuracy metrics: precision, recall and f1-score per class, plus the overall accuracy.

from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)

'''
    The accuracy give the % of samples that are good predicted. 
    Review how interpret the results. 

'''

              precision    recall  f1-score   support

         0.0       0.75      0.83      0.79       153
         1.0       0.73      0.63      0.68       115

    accuracy                           0.74       268
   macro avg       0.74      0.73      0.73       268
weighted avg       0.74      0.74      0.74       268



In [None]:
# The accuracy is not good, so we modify some parameters.
# Decision trees tend to overfit the training data;
# limiting the depth of the tree is one way to try to improve this.

tree = DecisionTreeClassifier(max_depth=4, random_state=1).fit(x_train, y_train)
y_pred = tree.predict(x_test)

conf_counts = confusion_matrix(y_test, y_pred)
conf2 = pd.DataFrame(conf_counts,
                     index=["True 0", "True 1"],
                     columns=["Predicted 0", "Predicted 1"])
conf2


Unnamed: 0,Predicted 0,Predicted 1
True 0,142,11
True 1,49,66


In [None]:
# Expected classes followed by the predictions of the depth-limited tree.
print(y_test, y_pred, sep='\n')

PassengerId
863    1.0
224    0.0
85     1.0
681    0.0
536    1.0
      ... 
248    1.0
552    0.0
240    0.0
485    1.0
93     0.0
Name: Survived, Length: 268, dtype: float64
[1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0.
 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0.
 1. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0.
 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1.
 0. 0. 0. 0.]


In [None]:
# Metrics of the depth-limited tree.
report = classification_report(y_test, y_pred)
print(report)

# Next: use the parameters of slide 3.

              precision    recall  f1-score   support

         0.0       0.74      0.93      0.83       153
         1.0       0.86      0.57      0.69       115

    accuracy                           0.78       268
   macro avg       0.80      0.75      0.76       268
weighted avg       0.79      0.78      0.77       268



In [None]:
# Same depth limit as before, plus minimum sample counts for splitting a node and for a leaf.
tree = DecisionTreeClassifier(
    max_depth=4,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=1,
).fit(x_train, y_train)
y_pred = tree.predict(x_test)

report = classification_report(y_test, y_pred)
print(report)



              precision    recall  f1-score   support

         0.0       0.74      0.93      0.83       153
         1.0       0.86      0.57      0.69       115

    accuracy                           0.78       268
   macro avg       0.80      0.75      0.76       268
weighted avg       0.79      0.78      0.77       268



Doubts: 
    
    Is it good to use decision trees (DT) for regression? 
    It is better to use regression trees, which are not the same as the classification decision trees used here. 

## Other models: 
### Ensemble learning: 

It consists of learning multiple models to solve a problem. 

Slide 8

In [None]:
from sklearn.ensemble import RandomForestClassifier
# An ensemble of 100 trees; fit() returns the estimator, so training is chained onto construction.
rf = RandomForestClassifier(n_estimators=100, random_state=1).fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

# TODO: follow the code of the slide to evaluate y_pred_rf.


In [None]:
from sklearn.svm import SVC

# Support vector classifier with C=10, trained on the unscaled features.
svm = SVC(C=10).fit(x_train, y_train)


'''
    Why the accuracy decrease? 
    The features are not in the same scale.

'''

In [None]:
# Scale the numerical features to the [0, 1] range so that they are all on the same scale.
# Remember that only numerical variables can be scaled (the one-hot columns already hold 0/1 values).

from sklearn.preprocessing import MinMaxScaler

# df_num still contains the target column 'Survived' (it was only popped from df_preprocessed).
# It must not be part of the features: otherwise the model sees the answer and the accuracy is 1.
num_features = df_num.drop(columns='Survived', errors='ignore')

mms = MinMaxScaler()
# NOTE(review): the scaler is fitted on all rows before the split; fitting it on the training rows
# only would avoid using test-set statistics.
df_num_sc = pd.DataFrame(mms.fit_transform(num_features), columns = num_features.columns,
                         index = num_features.index)
# Wrapped in a dataframe to keep the column names and the row index.

df_preprocessed_sc = pd.merge(left=df_cat_ohe, right=df_num_sc, on='PassengerId')

x_train_sc, x_test_sc, y_train_sc, y_test_sc = train_test_split(df_preprocessed_sc, class_col,
                                                               test_size=0.3, random_state=1)

# Train the SVM on the scaled features and predict the test set
# (y_pred_sc was used below without ever being computed).
svm_sc = SVC(C=10)
svm_sc.fit(x_train_sc, y_train_sc)
y_pred_sc = svm_sc.predict(x_test_sc)

print(classification_report(y_test_sc, y_pred_sc))

# DataFrame labels its rows through `index=` (there is no `rows=` argument).
conf3 = pd.DataFrame(confusion_matrix(y_test_sc, y_pred_sc),
                    columns=['Predicted 0', 'Predicted 1'],
                    index=['True 0', 'True 1'])
conf3


# The results should be something like those in slide 15.
# The accuracy cannot be 1 (if it is, there is an error such as the target leaking into the features).