In [1]:
# pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# featuretools for automated feature engineering
import featuretools as ft

# matplotlit and seaborn for visualizations
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22
import seaborn as sns

# Suppress warnings from pandas
import warnings
warnings.filterwarnings('ignore')

# modeling
import lightgbm as lgb

# utilities
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# memory management
import gc

In [2]:
train = pd.read_csv("/home/ericjiang/workStation/temporary/titanic/train.csv")
test = pd.read_csv("/home/ericjiang/workStation/temporary/titanic/test.csv")

In [3]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Remove Collinear Variables

In [4]:
# Threshold for removing correlated variables
threshold = 0.9
# Absolute value correlation matrix
corr_matrix = train.corr().abs()
corr_matrix

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,0.005007,0.035144,0.036847,0.057527,0.001652,0.012658
Survived,0.005007,1.0,0.338481,0.077221,0.035322,0.081629,0.257307
Pclass,0.035144,0.338481,1.0,0.369226,0.083081,0.018443,0.5495
Age,0.036847,0.077221,0.369226,1.0,0.308247,0.189119,0.096067
SibSp,0.057527,0.035322,0.083081,0.308247,1.0,0.414838,0.159651
Parch,0.001652,0.081629,0.018443,0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,0.5495,0.096067,0.159651,0.216225,1.0


In [5]:
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,,0.005007,0.035144,0.036847,0.057527,0.001652,0.012658
Survived,,,0.338481,0.077221,0.035322,0.081629,0.257307
Pclass,,,,0.369226,0.083081,0.018443,0.5495
Age,,,,,0.308247,0.189119,0.096067
SibSp,,,,,,0.414838,0.159651
Parch,,,,,,,0.216225
Fare,,,,,,,


In [6]:
# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
print('There are %d columns to remove.' % (len(to_drop)))

There are 0 columns to remove.


# Remove Missing Values

In [7]:
train_missing = (train.isnull().sum() / len(train)).sort_values(ascending = False)
train_missing

Cabin          0.771044
Age            0.198653
Embarked       0.002245
Fare           0.000000
Ticket         0.000000
Parch          0.000000
SibSp          0.000000
Sex            0.000000
Name           0.000000
Pclass         0.000000
Survived       0.000000
PassengerId    0.000000
dtype: float64

In [8]:
# Test missing values (in percent)
test_missing = (test.isnull().sum() / len(test)).sort_values(ascending = False)
test_missing

Cabin          0.782297
Age            0.205742
Fare           0.002392
Embarked       0.000000
Ticket         0.000000
Parch          0.000000
SibSp          0.000000
Sex            0.000000
Name           0.000000
Pclass         0.000000
PassengerId    0.000000
dtype: float64

In [9]:
# Identify missing values above threshold
train_missing = train_missing.index[train_missing > 0.75]
test_missing = test_missing.index[test_missing > 0.75]
all_missing = list(set(set(train_missing) | set(test_missing)))
print('There are %d columns with more than 75%% missing values' % len(all_missing))

There are 1 columns with more than 75% missing values


In [10]:
train = train.drop(columns=all_missing)
test = test.drop(columns=all_missing)

In [11]:
print(train.shape)
print(test.shape)

(891, 11)
(418, 10)


# 填補NULL

# feature selection

In [12]:
train_labels = train['Survived']

In [13]:
# Initialize an empty array to hold feature importances
feature_importances = np.zeros(train.shape[1])
# Create the model with several hyperparameters
model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')

In [14]:
# Fit the model twice to avoid overfitting
for i in range(2):
    
    # Split into training and validation set
    train_features, valid_features, train_y, valid_y = train_test_split(train, train_labels, test_size = 0.25, random_state = i)
    
    # Train using early stopping
    model.fit(train_features, train_y, early_stopping_rounds=100, eval_set = [(valid_features, valid_y)], 
              eval_metric = 'auc', verbose = 200)
    
    # Record the feature importances
    feature_importances += model.feature_importances_

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: Name, Sex, Ticket, Embarked