# Project Name

[Project Description]

# Import Packages

# 1. Load the Data

In [None]:
# Read the training and test splits from the local data/ directory.
train_df, test_df = (
    pd.read_csv(path) for path in ("data/train.csv", "data/test.csv")
)

# 2. Understand the Data

In [None]:
# Render the first five rows of each split, training set first.
for frame in (train_df, test_df):
    display(frame.head(5))

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" after each
# report.
train_df.info()
test_df.info()

In [None]:
# Descriptive statistics for the training set; as the cell's last expression,
# the resulting table is rendered by the notebook.
train_df.describe()

In [None]:
# Descriptive statistics for the test set; as the cell's last expression,
# the resulting table is rendered by the notebook.
test_df.describe()

# 3. Data Cleaning

## 3.1. Check for duplicates

In [None]:
# Check whether the training set contains any fully duplicated rows.
train_df.duplicated().any()

In [None]:
# Check whether the test set contains any fully duplicated rows.
test_df.duplicated().any()

## 3.2. Check for missing data
Let's check for 0, blank, NaN or None values.

In [None]:
# Per-column counts of the values that commonly stand in for "missing" in the
# training set: zeros, empty strings, NaN and None.
# `train_df == None` is an element-wise comparison that pandas evaluates to
# False everywhere, so the None count uses an explicit identity check instead.
# None entries are also included in the 'nan' count, because isna() treats
# None as missing.
pd.concat([
    (train_df == 0).sum().rename('zeros'),
    (train_df == '').sum().rename('blanks'),
    train_df.isna().sum().rename('nan'),
    train_df.apply(lambda col: col.map(lambda value: value is None)).sum().rename('none'),
], axis=1)

In [None]:
# Per-column counts of the values that commonly stand in for "missing" in the
# test set: zeros, empty strings, NaN and None.
# `test_df == None` is an element-wise comparison that pandas evaluates to
# False everywhere, so the None count uses an explicit identity check instead.
# None entries are also included in the 'nan' count, because isna() treats
# None as missing.
pd.concat([
    (test_df == 0).sum().rename('zeros'),
    (test_df == '').sum().rename('blanks'),
    test_df.isna().sum().rename('nan'),
    test_df.apply(lambda col: col.map(lambda value: value is None)).sum().rename('none'),
], axis=1)

# 4. Exploratory Data Analysis

## 4.1. Univariate analysis

For each categorical variable, display the bar plot.

For each numerical variable, show histograms, measures of central tendency (mean, median, mode), and measures of dispersion (range, standard deviation, skewness, kurtosis). 

## 4.2. Bivariate analysis

Understand the relationships between features and the target variable using scatterplots, correlation coefficients / matrix.

## 4.3. Handle Outliers

## 4.4. Handle missing values

# 5. Feature engineering

## 5.1. Create new features

Let's engineer relevant features that might improve predictive performance.

## 5.2. Remove irrelevant features
Let's eliminate features that do not contribute much to the prediction. Many ML algorithms, such as Random Forest, provide a feature importance score.

In [None]:
# Pair each importance score with its feature name, then list the features
# from most to least important.
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns)
feature_importance.sort_values(ascending=False)

## 5.3. Feature Normalization

Many models work best when features are roughly normally distributed. Let's reduce the skew of skewed features by applying a log transform (`log1p`, i.e. log(1 + x), which also handles zeros).

In [None]:
# Apply log(1 + x) to every quantitative column of both splits.
# The frames are named train_df / test_df throughout this notebook; the
# previous `train` / `test` names were never defined.
# NOTE(review): log1p yields NaN/-inf for values <= -1 -- confirm these
# columns are non-negative before running.
for col in quantitative:
    train_df[col] = np.log1p(train_df[col])
    test_df[col] = np.log1p(test_df[col])

## 5.4. Feature Scaling

Scaling numerical features improves distance-based calculations (for KNN, SVM classifiers) and prevents feature dominance.

In [None]:
def scale_features(df, columns=None, mean=None, std=None):
    """
    Standardize numerical features: subtract the mean, divide by the
    standard deviation.

    The selected columns are overwritten in place on ``df``, and the same
    ``df`` object is returned.

    df: dataframe holding the columns to scale
    columns: labels of the columns to scale; defaults to the notebook-level
        ``quantitative`` list
    mean: optional pandas Series of per-column means, indexed by column
        label; defaults to the means of ``df``. Pass the training-set means
        when scaling the test set so both splits share one scale.
    std: optional pandas Series of per-column standard deviations, indexed
        by column label; defaults to the standard deviations of ``df``
    """
    if columns is None:
        columns = quantitative
    columns = list(columns)

    values = df[columns]
    if mean is None:
        mean = values.mean(axis=0)
    if std is None:
        std = values.std(axis=0)

    # A constant column has zero spread; dividing by 1 instead leaves it at 0
    # rather than turning the whole column into NaN (0 / 0).
    std = std.replace(0, 1)

    df[columns] = (values - mean) / std

    return df

## 5.5. Remove collinear features

This improves the model's stability and interpretability, and reduces overfitting.

## 5.6. Encode categorical features

Let's convert categorical features into numerical form using techniques like one-hot encoding or label encoding.

Note: using `drop_first=True` creates k-1 columns for a feature with k categories (e.g. 1 column instead of 2 for a binary feature). This avoids the dummy variable trap, i.e. perfect multicollinearity between the encoded columns.

In [None]:
def one_hot_encoding(df):
    """
    One-hot encode the Sex, Pclass, Embarked, Deck and Title columns.

    The indicator columns are appended to the right of a copy of the input
    dataframe, which keeps its original columns. The first category is
    dropped for Sex, Pclass and Embarked; Deck and Title keep every category.
    """
    # (source column, prefix of the new columns, drop the first category?)
    encodings = (
        ('Sex', 'sex', True),
        ('Pclass', 'class', True),
        ('Embarked', 'Embarked', True),
        ('Deck', 'Deck', False),
        ('Title', 'Title', False),
    )

    indicator_frames = [
        pd.get_dummies(df[column], prefix=prefix, drop_first=drop_first, dtype=int)
        for column, prefix, drop_first in encodings
    ]

    return pd.concat([df, *indicator_frames], axis=1)

# 6. Choose an Evaluation Metric
For regression problems, common metrics include:
- Mean Absolute Error (MAE)
- Mean Squared Error (MSE)
- Root Mean Squared Error (RMSE)
- R-squared

For classification problems, common metrics include:
- Accuracy
- Precision
- Recall
- F1-score
- AUC

The confusion matrix and ROC curve can also provide useful insights.

# 7. Select Algorithms
- Start with simple regression algorithms like Linear Regression and gradually explore more complex models like Random Forest, Gradient Boosting, or XGBoost.
- Consider simple ensemble methods, such as simple average, weighted average, or voting ensembles, to combine multiple models for potentially better results.
- The model chosen depends on the data. A more complex model does not always constitute a better model.

# 8. Model Validation
- Split the data into training and validation sets. A common split is 70-30 or 80-20 for training and validation, respectively. This method is computationally less intensive and often used for initial model exploration or when dealing with very large datasets.
- K-Fold Cross Validation. This method provides a more reliable evaluation, especially with smaller datasets.
- Model validation is important to assess the model's generalization performance (i.e. assess how well the model performs on unseen data). This helps prevent overfitting and gives you a more reliable estimate of your model's performance.

# 9. Hyperparameter Tuning
- Tune the hyperparameters of your chosen algorithms on the validation dataset using techniques like grid search or random search to find the best combination.
- Optuna is an efficient and effective way to search for optimal hyperparameters.

# 10. Regularization
- Implement regularization techniques like L1 (Lasso) or L2 (Ridge) regularization to prevent overfitting.
- Many ML algorithms include regularization parameters, including L1 and L2, sometimes called reg_alpha or reg_lambda. Read up on your chosen algorithm's regularization parameters and tune them accordingly on your validation set.

# 11. Train the final model

- Fit the best model using the optimal hyperparameters found on the whole training set (including the validation set)
- Model persistence: save the model weights for future use.

In [None]:
def classification_model(model, X_train, X_test, y_train):
    """
    Fit a classifier, score it, and predict on the train and test sets.

    The model is fitted on the full training data, scored with 5-fold
    cross-validation on that same data, and then used to predict both splits.
    Accuracy, precision, recall and F1 in the results dictionary are computed
    from the training-set predictions (precision, recall and F1 use the
    'weighted' average); only CV_Mean and CV_Std come from cross-validation.

    model: eg. model = LogisticRegression()
    X_train: train dataframe without the target column
    X_test: test dataframe
    y_train: target column

    Returns the tuple (y_pred_test, y_pred_train, results_dict).
    """
    # The estimator's class name doubles as its display name.
    model_name = model.__class__.__name__
    print(f"Training: {model_name}")

    model.fit(X_train, y_train)

    # 5-fold cross-validation scores on the training data.
    fold_scores = cross_val_score(model, X_train, y_train, cv=5)

    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    # Shared options for the averaged metrics.
    weighted = {'average': 'weighted', 'zero_division': 0}

    summary = {
        'Model_Name': model_name,
        'Train_Accuracy': metrics.accuracy_score(y_train, train_predictions),
        'Precision': metrics.precision_score(y_train, train_predictions, **weighted),
        'Recall': metrics.recall_score(y_train, train_predictions, **weighted),
        'F1_Score': metrics.f1_score(y_train, train_predictions, **weighted),
        'CV_Mean': np.mean(fold_scores),
        'CV_Std': np.std(fold_scores),
    }

    return test_predictions, train_predictions, summary

# 12. Generate predictions on the test set