In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [3]:
# Read the Titanic training split and preview its first five rows as text.
train_csv_path='/kaggle/input/titanic/train.csv'
train=pd.read_csv(train_csv_path)
print(train.head(n=5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [None]:
# Summary statistics for every column of `train`, numeric and non-numeric alike.
train.describe(include="all")

In [4]:
import joblib

In [7]:
def clean_transform_df(df):
    """Turn a raw Titanic passenger frame into a standardized feature frame.

    The input is copied, so the caller's frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``PassengerId``, ``Name``, ``Sex``, ``Age``,
        ``Pclass``, ``SibSp``, ``Parch``, ``Ticket``, ``Fare``, ``Cabin`` and
        ``Embarked``. It must not contain the target column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``PassengerId``. Holds engineered features (title, family,
        age/fare bands, interaction terms, log fare, one-hot encodings); the
        raw source columns are dropped. Columns of dtype ``float64``/``int64``
        are standardized to zero mean and unit standard deviation.

    Notes
    -----
    Every data-driven statistic used here (the ``Embarked`` mode, the ``Fare``
    median and quartiles, which titles count as rare, and the mean/std used
    for scaling) is computed from the frame passed in. Calling this separately
    on two frames therefore scales and bins each with its own statistics, and
    the set of one-hot columns depends on the categories present in each frame.

    ``pd.qcut`` raises ``ValueError`` when the ``Fare`` quartile edges are not
    unique (for example on very small frames).
    """
    df=df.copy()
    df=df.set_index('PassengerId')

    #Extract titles: the run of letters immediately before a '.' in the name
    df['Title']=df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

    # Fold spelling variants into their common equivalents.
    title_mapping={'Mlle':'Miss','Ms':'Miss','Mme':'Mrs'}
    df['Title']=df['Title'].replace(title_mapping)

    # Titles occurring fewer than 10 times in this frame are grouped as 'Rare'.
    title_counts=df['Title'].value_counts()
    rare_titles=title_counts[title_counts<10].index
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')

    # Anything still outside the known set (including a missing title) falls
    # back to a sex-based default.
    title_mask=~df['Title'].isin(['Mr', 'Miss', 'Mrs', 'Master', 'Rare'])
    df.loc[title_mask, 'Title']= df.loc[title_mask, 'Sex'].map({'male': 'Mr', 'female': 'Mrs'})

    # Hard-coded per-title ages used to impute missing Age values.
    # NOTE(review): presumably derived from the training split - confirm.
    title_age_medians={
        'Mr': 32.32,
        'Miss': 21.68,
        'Mrs': 35.86,
        'Master': 4.57,
        'Rare': 48.5
    }

    for title, median_age in title_age_medians.items():
        age_mask=(df['Age'].isnull())&(df['Title']==title)
        df.loc[age_mask, 'Age']=median_age

    # Fill remaining gaps by assignment rather than with inplace fillna.
    df['Embarked']=df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare']=df['Fare'].fillna(df['Fare'].median())

    # Interaction terms
    df['Age*Class']=df['Age']*df['Pclass']
    df['Age*Fare']=df['Age']*df['Fare']

    df_sex=pd.get_dummies(df['Sex'], prefix='sex', drop_first=True, dtype=int)
    df_Pclass=pd.get_dummies(df['Pclass'], prefix='class', drop_first=True, dtype=int)
    df_Embarked=pd.get_dummies(df['Embarked'], prefix='Embarked', drop_first=True, dtype=int)
    df_Title=pd.get_dummies(df['Title'], prefix='Title', drop_first=False, dtype=int)

    df=pd.concat([df, df_sex, df_Pclass, df_Embarked, df_Title], axis=1)

    #Family features
    df['FamilySize']=df['SibSp']+df['Parch']+1
    df['IsAlone']=(df['FamilySize']==1).astype(int)
    df['Fare_Per_Person'] = df['Fare'] / df['FamilySize']

    # Fare and Age bands
    # include_lowest=True puts an age of exactly 0 into the first band; without
    # it that row would get a NaN band and the integer cast below would raise.
    df['AgeBand'] = pd.cut(
        df['Age'],
        bins=[0, 12, 20, 40, 60, np.inf],
        labels=[0, 1, 2, 3, 4],
        include_lowest=True,
    )
    df['AgeBand'] = df['AgeBand'].astype(int)

    df['FareBand']=pd.qcut(df['Fare'], q=4, labels=[0,1,2,3])
    df['FareBand']=df['FareBand'].astype(int)

    #Log transformation
    df['Fare_log']=np.log1p(df['Fare'])

    # Deck is the first character of the cabin code; missing cabins become 'U'.
    df['Cabin'] = df['Cabin'].fillna('U0')
    df['Deck'] = df['Cabin'].str[0]
    df = pd.get_dummies(df, columns=['Deck'], drop_first=True)

    df=df.drop(['Sex', 'Pclass', 'Name', 'Ticket', 'Embarked', 'Cabin', 'Title', 'Fare', 'SibSp', 'Parch'], axis=1)

    # Standardize only the float64/int64 columns.
    numeric_columns=df.select_dtypes(include=['float64', 'int64']).columns

    mew=df[numeric_columns].mean(axis=0)
    std=df[numeric_columns].std(axis=0)
    # A constant column has std == 0; dividing by it would turn the whole
    # column into NaN. Use 1 instead so such a column is simply centred to 0.
    std=std.mask(std==0, 1.0)
    df[numeric_columns]=(df[numeric_columns]-mew)/std

    return df

In [8]:
'''
#Load model
model=joblib.load("/kaggle/input/titanic-random-forest-0.794-score/scikitlearn/default/2/titanic_model.pkl")
'''
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Separate the target from the raw training frame and engineer the features.
y_train=train['Survived']
X_train=clean_transform_df(train.drop(columns=['Survived']))

model = lgb.LGBMClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

#Load and preprocess test data
test_data=pd.read_csv('/kaggle/input/titanic/test.csv')
test_features=clean_transform_df(test_data)

# Align the test features to the exact training schema. The one-hot columns
# depend on which categories occur in each file, so the two frames can differ
# in both the set and the order of columns. reindex adds any missing column
# (filled with 0), drops columns the model never saw, and restores the
# training order, so each value reaches the feature it was trained as.
test_features=test_features.reindex(columns=X_train.columns, fill_value=0)

#Make predictions
predictions=model.predict(test_features)

#Create submission file
# The id header is spelled 'PassengerId' to match the dataset's column name.
submission=pd.DataFrame({
    'PassengerId': test_features.index,
    'Survived': predictions
})

submission.to_csv('submission.csv', index=False)

# Accuracy on the rows the model was fitted on; this is an optimistic figure
# and not an estimate of performance on unseen passengers.
train_preds=model.predict(X_train)
print('Training accuracy:', accuracy_score(y_train, train_preds))

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002845 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288
Training accuracy: 0.9809203142536476
