In [2]:
import os

# Directory that holds the Titanic dataset files.
TITANIC_PATH = '/cxldata/datasets/project/titanic'

# The directory contains two files: train.csv and test.csv

In [3]:
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    """Load one CSV file of the Titanic dataset into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, e.g. ``"train.csv"``.
    titanic_path : str, optional
        Directory containing the file; defaults to ``TITANIC_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(os.path.join(titanic_path, filename))


# Read both dataset splits up front so every later cell can use them.
train_data, test_data = (
    load_titanic_data(split_file) for split_file in ("train.csv", "test.csv")
)

In [4]:
# STEP 2 ) Exploring the data

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Checking if there are missing values

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
 # Find the number of female passengers on-board from the training dataset.
    
# Look the count up by label rather than by position: value_counts() is indexed
# by the category values, so a positional [1] is only right while 'female'
# happens to be the second most frequent value, and integer-position fallback
# on a label-indexed Series is deprecated in recent pandas releases.
train_data['Sex'].value_counts()['female']

314

In [8]:
# STEP 3 ) Creating pre-processing Pipelines

from sklearn.base import BaseEstimator,TransformerMixin

# Custom transformer that picks a subset of DataFrame columns

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the given columns of its input.

    Parameters
    ----------
    attribute_names : list
        Column labels to keep; stored as-is and used to index the input
        in ``transform``.
    """
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        """No-op fit: nothing is learned from ``X`` or ``y``; returns ``self``."""
        return self
    def transform(self, X):
        """Return ``X[self.attribute_names]``, i.e. the selected columns of ``X``."""
        return X[self.attribute_names]


**BaseEstimator**: This class provides implementations for the get_params() and set_params() methods, which are essential for parameter tuning and grid search. When you inherit from BaseEstimator, your custom transformer gains these methods automatically. They allow you to access and modify the parameters of your transformer as part of a pipeline.

**TransformerMixin**: This class provides implementations for common transformer methods like fit_transform(). When you inherit from TransformerMixin, your custom transformer gains these methods automatically. This mixin ensures that your transformer behaves like a proper transformer, which can fit to data and transform it.

**.fit() method in the above transformer**

Input Data Usage (Optional):

- The fit method takes the input DataFrame X, but it doesn't actually use the data for any learning or computation.
- The y parameter, which represents the target variable, defaults to None, indicating that this transformer doesn't require the target variable for fitting.

No Computation:

- Inside the fit method, there's no computation or learning involved. It simply returns self, which is a common practice in scikit-learn transformers.

Adherence to Interface:

- The purpose of this dummy fit method is to adhere to the scikit-learn transformer interface, which requires every transformer to have a fit method.
- By implementing this method, the DataFrameSelector class can be used seamlessly in scikit-learn pipelines alongside other transformers and models.






**.transform() method in the above transformer**

Input Data Selection:

- The transform method takes the input DataFrame X.
- It selects only the columns specified in self.attribute_names from the input DataFrame.

Output Data:

- The function returns a new DataFrame containing only the selected columns.
- Essentially, it transforms the input DataFrame by selecting only the specified columns.

For example, suppose self.attribute_names is ['age', 'gender', 'income'], and X is a DataFrame containing multiple columns such as ['age', 'gender', 'income', 'education', 'marital_status']. The transform method will return a new DataFrame containing only the columns ['age', 'gender', 'income'].

In [9]:
# Building the Pipeline for the Numerical Attributes

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Numerical branch: select the numeric columns, then fill missing entries
# with the per-column median.
num_pipeline = Pipeline(
    steps=[
        ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

# Fit on the training data and display the transformed array.
num_pipeline.fit_transform(train_data)

array([[22.    ,  1.    ,  0.    ,  7.25  ],
       [38.    ,  1.    ,  0.    , 71.2833],
       [26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [28.    ,  1.    ,  2.    , 23.45  ],
       [26.    ,  0.    ,  0.    , 30.    ],
       [32.    ,  0.    ,  0.    ,  7.75  ]])


In the num_pipeline, a sequence of transformation steps is defined to preprocess numerical attributes. Let's break down each step:

1) **DataFrameSelector**:

The first step is named "select_numeric" and involves a DataFrameSelector transformer.
This transformer selects specific columns from the input DataFrame (train_data) based on the provided attribute names: "Age", "SibSp", "Parch", and "Fare".
It extracts only the numerical attributes from the input DataFrame.

2) **SimpleImputer**:

The second step is named "imputer" and utilizes the SimpleImputer transformer.
This transformer handles missing values in the selected numerical attributes using a strategy specified by the strategy parameter.
In this case, the strategy is set to "median", meaning missing values are replaced with the median value of each attribute.

3) **Pipeline**:

These transformation steps are combined into a pipeline using the Pipeline class from scikit-learn.
Pipelines allow chaining multiple transformers together, providing a convenient way to apply preprocessing steps sequentially.

4) **Fitting and Transformation**:

Once the num_pipeline is defined, the fit_transform method is called on it with the training data (train_data) as input.
This triggers the execution of each transformation step in sequence:

- The DataFrameSelector selects the specified numerical attributes.
- The SimpleImputer fills missing values in these attributes with their median values.

In [10]:
# Imputer for the string categorical columns (the regular SimpleImputer does not work on those)


class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most frequent value."""

    def fit(self, X, y=None):
        """Record the most frequent value of every column of ``X``.

        ``value_counts()`` orders values by descending frequency, so the first
        entry of its index is the most common non-missing value of the column.
        The result is stored in ``self.most_frequent_`` as a Series keyed by
        column label. ``y`` is accepted but not used.
        """
        top_values = []
        for column in X:
            frequencies = X[column].value_counts()
            top_values.append(frequencies.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the values learned in ``fit``."""
        filled = X.fillna(self.most_frequent_)
        return filled
    








## Notes

The code above is for creating a custom imputer class called `MostFrequentImputer` for handling missing values in categorical columns. This custom imputer is designed to replace missing values with the most frequent value (mode) for each categorical column in your dataset. It is intended to be used as part of a data preprocessing pipeline in our project.

Let's break down the code and its purpose step by step:

1. **Custom Imputer Class Definition:**

   `class MostFrequentImputer(BaseEstimator, TransformerMixin):`
   - `MostFrequentImputer` is a custom imputer class that is defined by inheriting from two classes: `BaseEstimator` and `TransformerMixin`. These are common base classes used when creating custom transformers in scikit-learn.

2. **`fit` Method:**   

   `def fit(self, X, y=None):`
   - The `fit` method is used to compute and store the most frequent value (mode) for each categorical column in the input dataset `X`. It takes two arguments, `X` (the input data) and `y` (which is not used in this case).

   `self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X], index=X.columns)`
   - Inside the `fit` method, it calculates the mode for each column in `X` and stores these modes in a Pandas Series called `self.most_frequent_`. The Pandas Series has column names as indices and the corresponding most frequent values as values.

3. **`transform` Method:**

   ```python
   def transform(self, X, y=None):
       return X.fillna(self.most_frequent_)
   ```
   - The `transform` method takes an input dataset `X` and replaces missing values with the most frequent values computed during the `fit` step. It uses the Pandas `fillna` method to fill missing values in the input dataset `X` with the most frequent values stored in `self.most_frequent_`. This transformed dataset is then returned.

4. **Overall Purpose:**

The purpose of this custom imputer class is to handle missing values in categorical columns by imputing them with the most frequent value observed in each column during the `fit` step. This can be useful in data preprocessing when preparing data for machine learning models, as many machine learning algorithms cannot handle missing values directly and require them to be imputed.

In [11]:
# Build the Pipeline for the Categorical Attributes


from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# The `sparse` keyword of OneHotEncoder was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4. Try the new keyword
# first and fall back to the old one, so the encoder returns a dense array on
# both old and new releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

# Categorical branch: select the categorical columns, fill missing entries
# with the most frequent value, then one-hot encode.
cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", cat_encoder),
    ])



In [12]:
# Will join both the Pipelines now:


from sklearn.pipeline import FeatureUnion

# Apply the numerical and categorical branches to the same input and stack
# their outputs side by side.
preprocess_pipeline = FeatureUnion(
    transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ],
)

# Labels come straight from the raw frame; features come from fitting the
# combined pipeline on the training data.
y_train = train_data["Survived"]
X_train = preprocess_pipeline.fit_transform(train_data)

In [13]:
# Step 4 : Train SVC Classifier

from sklearn.svm import SVC

svm_clf =SVC(gamma="auto", random_state=42)
svm_clf.fit(X_train, y_train)



SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [14]:
# Predict using Test Set



X_test = preprocess_pipeline.transform(test_data)
y_pred = svm_clf.predict(X_test)


In [15]:
#  Evaluate our SVC Model

from sklearn.model_selection import cross_val_score

# Generate cross validation score for the model

svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()



0.7329588014981274

Cross-validation is a crucial technique to assess the performance of machine learning models, including SVM and Random Forest, without relying solely on a single train-test split. It helps in estimating how well a model generalizes to new data by dividing the dataset into multiple subsets (folds) for training and testing.

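We choose performance metrics according to our needs. For example, precision is the right metric when false positives are costly, i.e. when we want the passengers we predict as survivors to really be survivors, while recall is the right metric when we want to capture as many true positives as possible, minimizing false negatives. The cross_val_score call above reports accuracy, the default score for classifiers; passing scoring="precision" or scoring="recall" lets us explore these metrics on the same data and see how it works out.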

In [16]:
# Cross-validate a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()




0.8126466916354558

In [18]:
os.makedirs('models', exist_ok=True)

In [19]:
import joblib

# `svm_clf` was fitted earlier, but `forest_clf` has only been passed to
# cross_val_score, which fits clones and leaves the original estimator
# unfitted. Fit it on the full training set so the saved model can predict.
forest_clf.fit(X_train, y_train)

# Save the SVC model
joblib.dump(svm_clf, 'models/svm_clf.pkl')

# Save the RandomForest model
joblib.dump(forest_clf, 'models/forest_clf.pkl')


['models/forest_clf.pkl']

In [20]:
# Load the SVC model
loaded_svc_model = joblib.load('models/svm_clf.pkl')

# Load the RandomForest model
loaded_rf_model = joblib.load('models/forest_clf.pkl')
