In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from imblearn.over_sampling import SMOTE

In [27]:
titanic = pd.read_csv('titanic_train.csv')

In [28]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
1,734,0,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
2,383,0,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
3,705,0,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
4,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S


In [None]:
## Check for null values
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            140
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          553
Embarked         2
dtype: int64

In [13]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          572 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        159 non-null    object 
 11  Embarked     710 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 66.9+ KB


In [29]:
## Handling null values 
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode())
#titanic['Cabin'] = titanic['Cabin'].fillna(titanic['Cabin'].mode())

In [30]:
titanic['Embarked'].value_counts()

Embarked
S    525
C    125
Q     60
Name: count, dtype: int64

In [31]:
## Encoding
titanic['Sex'] = titanic['Sex'].map({'male':0,'female':1})
titanic['Embarked'] = titanic['Embarked'].astype('category').cat.codes

In [None]:
titanic['Deck'] = titanic['Cabin'].str[0]


In [23]:
titanic['Deck'].fillna('U', inplace=True)  # U = Unknown
titanic.drop('Cabin', axis=1, inplace=True)

In [25]:
titanic['Deck'].value_counts()

Deck
U    553
C     52
B     38
E     24
D     19
F     12
A     10
G      3
T      1
Name: count, dtype: int64

In [32]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    int64  
 5   Age          712 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        159 non-null    object 
 11  Embarked     712 non-null    int8   
dtypes: float64(2), int64(6), int8(1), object(3)
memory usage: 62.0+ KB


In [72]:
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])

titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].median())

titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})

titanic['Embarked'] = titanic['Embarked'].astype('category').cat.codes

In [73]:
titanic['Familysize'] = titanic['SibSp'] + titanic['Parch'] + 1

titanic['Isalone'] = (titanic['Familysize'] == 1).astype(int)

titanic['HasCabin'] = titanic['Cabin'].notnull().astype(int)

titanic['Title'] = titanic['Name'].str.extract(' ([A-Za-z]+)\.', expand=False).map(
    {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4}
).fillna(4)

titanic['Pclass_Fare'] = titanic['Pclass'] * titanic['Fare']

titanic['Age_Fare'] = titanic['Age'] * titanic['Fare']

In [37]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Familysize,Isalone,HasCabin,Title,Pclass_Fare
0,332,0,1,"Partner, Mr. Austen",0,45.5,0,0,113043,28.5,C124,2,1,1,1,0.0,28.5
1,734,0,2,"Berriman, Mr. William John",0,23.0,0,0,28425,13.0,,2,1,1,0,0.0,26.0
2,383,0,3,"Tikkanen, Mr. Juho",0,32.0,0,0,STON/O 2. 3101293,7.925,,2,1,1,0,0.0,23.775
3,705,0,3,"Hansen, Mr. Henrik Juul",0,26.0,1,0,350025,7.8542,,2,2,0,0,0.0,23.5626
4,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",1,6.0,4,2,347082,31.275,,2,7,0,0,1.0,93.825


In [38]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    int64  
 5   Age          712 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        159 non-null    object 
 11  Embarked     712 non-null    int8   
 12  Familysize   712 non-null    int64  
 13  Isalone      712 non-null    int64  
 14  HasCabin     712 non-null    int64  
 15  Title        712 non-null    float64
 16  Pclass_Fare  712 non-null    float64
dtypes: float64(4), int64(9), int8(1), object(3)
memory usage: 89.8+ KB


In [41]:
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Familysize', 'Isalone',
       'HasCabin', 'Title', 'Pclass_Fare'],
      dtype='object')

In [74]:
X = titanic[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Familysize', 'Isalone', 'HasCabin', 'Title', 'Pclass_Fare', 'Age_Fare']]
y = titanic['Survived']


In [76]:
X.isnull().sum()

Pclass           0
Sex            712
Age              0
Fare             0
Embarked         0
Familysize       0
Isalone          0
HasCabin         0
Title            0
Pclass_Fare      0
Age_Fare         0
dtype: int64

In [75]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values