In [34]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Now let's split our dataset into X_train, X_valid, y_train, and y_valid.

In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Titanic train and test CSVs, using PassengerId as the row index
X_full = pd.read_csv('/kaggle/input/titanic/train.csv', index_col='PassengerId')
X_test_full = pd.read_csv('/kaggle/input/titanic/test.csv', index_col='PassengerId')

# Discard rows with a missing target, then separate the target from the predictors
X_full = X_full.dropna(axis=0, subset=['Survived'])
y = X_full['Survived']
X_full = X_full.drop(columns=['Survived'])

# Split into training (80%) and validation (20%) sets with a fixed seed
X_full_train, X_full_valid, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=0
)

# Column dtypes of the training split, looked up once for both selections below
train_dtypes = X_full_train.dtypes

# Categorical columns: object dtype with fewer than 10 distinct values
categorical_cols = [
    cname for cname, dtype in train_dtypes.items()
    if dtype in ['object'] and X_full_train[cname].nunique() < 10
]

# Numerical columns: float64 or int64 dtype
numerical_cols = [
    cname for cname, dtype in train_dtypes.items()
    if dtype in ['float64', 'int64']
]

# Keep only the selected columns, categorical first, then numerical
my_cols = categorical_cols + numerical_cols

X_train = X_full_train[my_cols].copy()
X_valid = X_full_valid[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# Preview the full predictor frame (target column already removed)
X_full.head()

In [76]:
# Show which columns were selected as categorical
print(categorical_cols)

In [77]:
# Preview the first rows of the training features
X_train.head()

Now it is time to import our libraries and build our model.

In [78]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Preprocessing for numerical data: fill missing values with the column median
numerical_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical data: fill missing values with the most frequent
# category, then one-hot encode (categories not seen during fit are ignored)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Bundle preprocessing for numerical and categorical data.
# The 'cat' branch must receive categorical_cols: passing numerical_cols here
# would one-hot encode the numeric features a second time and leave the
# categorical columns out of the model entirely (ColumnTransformer drops
# columns that are not listed in any transformer by default).
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Define the model (fixed random_state for reproducible results)
model = RandomForestRegressor(n_estimators=200, random_state=1)

# Chain preprocessing and the model so both are fitted and applied together
clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit on the training split; the preprocessing is learned from X_train only
clf.fit(X_train, y_train)

# Predict on the validation split
preds = clf.predict(X_valid)

# Report the mean absolute error on the validation split (lower is better)
print('MAE', mean_absolute_error(y_valid, preds))


In [79]:
# Generate predictions for the test set and round each one to the nearest integer
preds_test = clf.predict(X_test)
predictions = [round(pred) for pred in preds_test]
print(predictions)