# In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
import statsmodels.imputation.mice as mice
import os 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
import pickle

# Path of the pickle file the fitted model is written to
trained_model = "model.pkl"


# Load the training CSV and preprocess it into features and target
def data_pre(path='train.csv'):
    """Read a CSV file and split it into a feature frame and a target series.

    The identifier-like columns ``PassengerId``, ``Name``, ``Cabin`` and
    ``Ticket`` are dropped, ``Sex`` and ``Embarked`` are one-hot encoded
    (first level dropped), and ``Survived`` is separated out as the target.

    Args:
        path: CSV file to read. Defaults to ``'train.csv'``, the only file
            the previous implementation ever got as far as reading.

    Returns:
        A two-element list ``[X, y]`` where ``X`` is the feature DataFrame
        and ``y`` is the ``Survived`` column.

    Raises:
        FileNotFoundError: if ``path`` does not exist (a message is printed
            first). Other read/parse errors propagate unchanged.
        KeyError: if an expected column is missing from the file.
    """
    # Read the file; a missing file is reported and re-raised instead of
    # falling through to code that would fail on an undefined ``df``.
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(path + " does not exist")
        raise

    # Remove the columns that do not have predictive power
    df = df.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)

    # Create dummy variables for Sex and Embarked
    df = pd.get_dummies(df, columns=["Sex", "Embarked"], drop_first=True)

    # Create X and y (target) sets
    y = df['Survived']
    X = df.drop('Survived', axis=1)

    return [X, y]

# Train the model using a RandomForestClassifier pipeline
def rf(X, y):
    """Fit an imputation + random-forest pipeline on a train split of the data.

    The data is split 75/25 with a fixed seed. The held-out part is written
    to ``x_test.csv`` and ``y_test.csv`` in the working directory, and the
    pipeline is fitted on the remaining 75%.

    Args:
        X: feature table accepted by ``train_test_split`` and exposing
            ``to_csv`` (e.g. a pandas DataFrame).
        y: target values aligned with ``X`` (e.g. a pandas Series).

    Returns:
        The fitted ``Pipeline`` (median imputation followed by the forest).

    Raises:
        Exception: whatever ``Pipeline.fit`` raises is re-raised after a
            short message is printed, so the real cause is not masked.
    """
    # Median imputation of missing values.
    # NOTE(review): ``sklearn.preprocessing.Imputer`` was removed in newer
    # scikit-learn releases in favour of ``sklearn.impute.SimpleImputer``;
    # confirm the pinned scikit-learn version before upgrading.
    miss = Imputer(missing_values='NaN', strategy='median', axis=0)

    # Random forest with fixed hyper-parameters and seed for reproducibility
    rf_class = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=150, random_state=123)

    # Chain imputation and the classifier into a single estimator
    pl = Pipeline([('imputation', miss), ('random_forest', rf_class)])

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

    # Save X_test & y_test to csv. This export is best-effort: a failure is
    # reported but must not prevent training.
    try:
        X_test.to_csv('x_test.csv')
        y_test.to_csv('y_test.csv')
    except Exception:
        print("Error while saving the file")

    # Fit the model; on failure report and propagate the original error
    # rather than continuing to a ``return`` of an unbound variable.
    try:
        model = pl.fit(X_train, y_train)
    except Exception:
        print("Can't fit model, check code")
        raise

    return model



# Saving the trained model
def model_save(model, trained_model):
    """Pickle ``model`` to the file at ``trained_model`` (best effort).

    The model is serialised in memory first and only then written, so a
    pickling failure never leaves a truncated file behind or clobbers an
    existing one. Any failure is reported on stdout and otherwise ignored.

    Args:
        model: the object to pickle.
        trained_model: destination file path.

    Returns:
        None.
    """
    try:
        payload = pickle.dumps(model)
        # The context manager closes the file even if the write fails, and
        # avoids touching an unbound handle when ``open`` itself fails.
        with open(trained_model, 'wb') as sv:
            sv.write(payload)
    except Exception as exc:
        print("Error, please check the code:", repr(exc))


def main():
    """Run the pipeline end to end: preprocess, train, then save the model."""
    features, target = data_pre()
    fitted = rf(features, target)
    model_save(fitted, trained_model)


# Run the pipeline only when executed as a script, not when imported
if __name__ == "__main__":
    main()