# Titanic dataset: Logistic Regression

First step: Imports and data

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from ISLP.models import (ModelSpec as MS, summarize, poly)
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import \
     (cross_validate,
      KFold)
from ISLP.models import sklearn_sm
import seaborn as sns


# Load the training data from the CSV one directory above the working dir.
df = pd.read_csv("../train.csv")

# Structural overview: dimensions, column labels, then per-column dtypes.
for overview in (df.shape, df.columns, df.dtypes):
    print(overview)

# Number of missing (NaN) entries in each column.
for col in df.columns:
    n_missing = df[col].isna().sum()
    print("Missing rows in {0}:".format(col), n_missing)

# Summary statistics of the numeric columns.
print(df.describe())


# Response variable passed to the models fitted further down.
y = df['Survived']

- Converting Sex and Embarked to a categorical value
- Converting Sex to a new variable "SexNr", where 0 is female and 1 is male (pandas assigns category codes in alphabetical order)
- Converting Cabin to a new variable "CabinLetter" with only the first letter used, then converting that letter into its category code (A=0, B=1, ..., G=6, T=7); passengers without a cabin entry get NaN

In [None]:
# Store Sex and Embarked as pandas categoricals.
df['Sex'] = df['Sex'].astype('category')
df['Embarked'] = df['Embarked'].astype('category')

# Integer category codes. String categories are ordered lexically, so with
# the values 'female'/'male' this yields female -> 0 and male -> 1.
df['SexNr'] = df['Sex'].cat.codes

# First character of the cabin string (stays NaN where Cabin is missing).
df['CabinLetter'] = df.Cabin.str[:1]
print(df.CabinLetter.unique())

# Replace the letter by its integer category code. Missing values receive
# the code -1, which is mapped back to NaN so the models can drop those rows.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df['CabinLetter'] = df['CabinLetter'].astype('category')
df['CabinLetter'] = df['CabinLetter'].cat.codes
df['CabinLetter'] = df['CabinLetter'].replace(-1, np.nan)
print(df.CabinLetter.unique())

print(df.dtypes)

## Simple Logistic Regression

Logistic regression on every possible variable with plots for each of them.

Plotting the logistic regression line has not been as easy as with the linear regression line.
1. The original plot works, but I had to rewrite the formula in order for it to make sense. However, it has been designed to only give a straight line, so I will comment out its implementation.
2. I implemented another plot with matplotlib (second plot). It doesn't quite work, and other people on the internet with the same problem have had the issue that their X were unordered. So I fixed that and ordered it, and now it works for all except 'Embarked'.
3. Lastly, the third plot is an implementation using seaborn. It is the only implementation that fits the model itself, while the other two rely on the results from statsmodels.

In summary, the second and third plot work. The first one works, but only for straight lines.

Also, 'Embarked' is not visualized, since it is categorical with more than two values, so it doesn't work with these implementations. The logistic regression model still works though.

In [110]:
def abline(ax, b, m, *args, num_points=200, **kwargs):
    """Draw the logistic curve p(x) = 1 / (1 + exp(-(m * x + b))) on ax.

    The curve is evaluated on an evenly spaced grid spanning the current
    x-limits of ``ax``, so it shows the actual S-shape instead of a straight
    segment between the two end points.

    Parameters
    ----------
    ax : object providing ``get_xlim()`` and ``plot()`` (e.g. a matplotlib Axes)
    b : intercept of the linear predictor
    m : slope of the linear predictor
    *args, **kwargs : forwarded unchanged to ``ax.plot``
    num_points : keyword-only, number of grid points used for the curve

    Calls ``plt.show()`` after drawing.
    """
    x_low, x_high = ax.get_xlim()
    grid = np.linspace(x_low, x_high, num_points)
    linear_predictor = m * grid + b
    # exp(-log(1 + exp(-z))) equals 1 / (1 + exp(-z)) but cannot overflow,
    # unlike e**z / (1 + e**z) for large |z|.
    probabilities = np.exp(-np.logaddexp(0.0, -linear_predictor))
    ax.plot(grid, probabilities, *args, **kwargs)
    plt.show()

# Fit one single-predictor logistic regression per column and print its
# coefficient table.
for column in ('PassengerId', 'Pclass', 'Age', 'Sex', 'SibSp', 'Parch',
               'Fare', 'CabinLetter', 'Embarked'):
    X = MS([column]).fit_transform(df)
    # missing='drop': observations containing NaN are left out of the fit.
    model = sm.Logit(y, X, missing='drop')
    results = model.fit()
    print("Modell für {0}:\n".format(column), summarize(results))

    # No plot for Embarked; skip straight to the next column.
    if column == 'Embarked':
        continue

    # The three plotting variants are kept for reference but are disabled.
    #
    # 1. first plot
    # ax = df.plot.scatter(column, "Survived")
    # abline(ax, results.params[0], results.params[1], 'r', linewidth=3)
    #
    # 2. second plot
    # sorted_X = X.sort_values(by=column)
    # plt.scatter(X[column], y)
    # plt.plot(sorted_X[column], results.predict(sorted_X), c="red", linewidth=3)
    # plt.show()
    #
    # 3. third plot
    # sns.lmplot(x=column, y='Survived', data = df, logistic=True, ci=None)
    # plt.show()
    

Optimization terminated successfully.
         Current function value: 0.665899
         Iterations 4
Modell für PassengerId:
                 coef  std err      z  P>|z|
intercept   -0.45540    0.138 -3.306  0.001
PassengerId -0.00004    0.000 -0.149  0.881
Optimization terminated successfully.
         Current function value: 0.608531
         Iterations 5
Modell für Pclass:
              coef  std err      z  P>|z|
intercept  1.4468    0.207  6.975    0.0
Pclass    -0.8501    0.087 -9.755    0.0
Optimization terminated successfully.
         Current function value: 0.672429
         Iterations 4
Modell für Age:
              coef  std err      z  P>|z|
intercept -0.0567    0.174 -0.327  0.744
Age       -0.0110    0.005 -2.057  0.040
Optimization terminated successfully.
         Current function value: 0.515041
         Iterations 5
Modell für Sex:
              coef  std err       z  P>|z|
intercept  1.0566    0.129   8.191    0.0
Sex[male] -2.5137    0.167 -15.036    0.0
Optimizat

It gives very similar results to the simple linear regression. As there, 'Pclass', 'Age', 'Sex', 'Parch', 'Fare' and 'Embarked' are statistically significant.

The next step is multiple logistic regression.

## Multiple logistic regression

As in multiple linear regression, I fit the model on all possible columns at once. I also want to see whether the two statsmodels implementations, `sm.Logit` and `sm.GLM` with a Binomial family, differ.

In [114]:
# Design matrix containing every candidate predictor at once.
X_2 = MS([
    'PassengerId', 'Pclass', 'Age', 'Sex', 'SibSp',
    'Parch', 'Fare', 'CabinLetter', 'Embarked',
]).fit_transform(df)

# Same response and design, once as a dedicated Logit model and once as a
# GLM with a Binomial family; both leave out observations containing NaN.
model_2 = sm.Logit(y, X_2, missing='drop')
model_2_b = sm.GLM(y, X_2, missing='drop', family=sm.families.Binomial())

# Fit and print both coefficient tables in turn for a side-by-side check.
for candidate in (model_2, model_2_b):
    results = candidate.fit()
    print(summarize(results))

Optimization terminated successfully.
         Current function value: 0.423836
         Iterations 7
               coef  std err      z  P>|z|
intercept    4.4131    1.228  3.593  0.000
PassengerId  0.0016    0.001  1.991  0.046
Pclass      -0.5621    0.491 -1.144  0.253
Age         -0.0410    0.015 -2.775  0.006
Sex[male]   -2.9607    0.503 -5.891  0.000
SibSp        0.2647    0.360  0.736  0.462
Parch       -0.3838    0.334 -1.148  0.251
Fare         0.0010    0.003  0.330  0.742
CabinLetter -0.0064    0.166 -0.038  0.969
Embarked[Q] -1.8101    2.053 -0.882  0.378
Embarked[S] -0.4125    0.472 -0.873  0.383
               coef  std err      z  P>|z|
intercept    4.4131    1.228  3.593  0.000
PassengerId  0.0016    0.001  1.991  0.046
Pclass      -0.5621    0.491 -1.144  0.253
Age         -0.0410    0.015 -2.775  0.006
Sex[male]   -2.9607    0.503 -5.891  0.000
SibSp        0.2647    0.360  0.736  0.462
Parch       -0.3838    0.334 -1.148  0.251
Fare         0.0010    0.003  0.330  0

The two implementations give exactly the same results. The [documentation](https://www.statsmodels.org/stable/generated/statsmodels.genmod.families.family.Binomial.html#statsmodels.genmod.families.family.Binomial) corroborates this, saying that logit is the default link function of the Binomial family in the GLM models.

I continue with sm.Logit because it is shorter to write.

Still significant are 'PassengerId' (that is expected to become less important as I delete variables from the model), 'Age' and 'Sex'. The last two are the same for both linear and logistic regression, but the linear regression also found 'Pclass' and 'SibSp' significant and not 'PassengerId'.

Again, as the next step, I will drop the CabinLetter variable, since it has too many missing values.

In [117]:
# Same model as before, but without CabinLetter.
design = MS([
    'PassengerId', 'Pclass', 'Age', 'Sex',
    'SibSp', 'Parch', 'Fare', 'Embarked',
])
X_2 = design.fit_transform(df)

# Observations containing NaN are left out of the fit.
model_2 = sm.Logit(y, X_2, missing='drop')
results = model_2.fit()
print(summarize(results))

Optimization terminated successfully.
         Current function value: 0.442474
         Iterations 6
               coef  std err       z  P>|z|
intercept    5.4851    0.657   8.350  0.000
PassengerId  0.0004    0.000   0.937  0.349
Pclass      -1.1971    0.165  -7.254  0.000
Age         -0.0432    0.008  -5.257  0.000
Sex[male]   -2.6574    0.223 -11.916  0.000
SibSp       -0.3550    0.130  -2.735  0.006
Parch       -0.0700    0.125  -0.562  0.574
Fare         0.0015    0.003   0.578  0.563
Embarked[Q] -0.8248    0.598  -1.379  0.168
Embarked[S] -0.4067    0.271  -1.502  0.133


This is looking much better. PassengerId is not significant anymore, as we would expect. Pclass and SibSp have become significant. Now the same variables are significant as in the linear model.

I have done this quite late in the linear regression, but I will now replace all the missing values in 'Age' with their mean (a step that should normally happen at the start while cleaning the columns and handling missing values).

In [None]:
# Mean imputation: the average of the known ages (NaN entries are skipped)
# fills every missing Age value in a new column; Age itself is untouched.
mean = df['Age'].mean()
df['AgeNoMissing'] = df['Age'].fillna(mean)

Now, I'll do backwards selection on the variables.

In [127]:
# Reduced model: the four remaining predictors, with the imputed age
# column taking the place of Age.
design = MS(['Pclass', 'AgeNoMissing', 'Sex', 'SibSp'])
X_2 = design.fit_transform(df)

model_2 = sm.Logit(y, X_2, missing='drop')
results = model_2.fit()
print(summarize(results))

Optimization terminated successfully.
         Current function value: 0.443796
         Iterations 6
                coef  std err       z  P>|z|
intercept     5.1920    0.478  10.854  0.000
Pclass       -1.1724    0.120  -9.792  0.000
AgeNoMissing -0.0398    0.008  -5.131  0.000
Sex[male]    -2.7398    0.194 -14.112  0.000
SibSp        -0.3578    0.104  -3.439  0.001


Significant are:

- Pclass
- Age
- Sex
- SibSp

The same as with multiple linear regression.

## Multiple Logistic Regression with interaction

I will directly include all pairwise interaction terms (except those involving PassengerId, since it is obviously irrelevant, and Embarked, since it is categorical with more than two categories) and then do backwards selection. Of the non-interaction terms, I'll only include the four significant ones in the model.

In [130]:
# Start from the four main effects.
X_3 = MS(['Pclass', 'AgeNoMissing', 'Sex', 'SibSp']).fit_transform(df)

# Append one product column per unordered pair of these variables, named
# "<first>:<second>"; each variable is paired only with those after it.
items = ['Pclass', 'AgeNoMissing', 'SexNr', 'SibSp', 'Parch', 'Fare']
for position, left in enumerate(items):
    for right in items[position + 1:]:
        title = left + ':' + right
        X_3[title] = df[left] * df[right]

model_3 = sm.Logit(y, X_3)
results = model_3.fit()
print(summarize(results))

Optimization terminated successfully.
         Current function value: 0.408473
         Iterations 8
                       coef  std err      z  P>|z|
intercept            4.1440    1.601  2.589  0.010
Pclass              -1.1619    0.521 -2.229  0.026
AgeNoMissing         0.0251    0.041  0.605  0.545
Sex[male]           -3.7514    1.502 -2.497  0.013
SibSp                1.9061    1.085  1.756  0.079
Pclass:AgeNoMissing -0.0140    0.014 -1.007  0.314
Pclass:SexNr         0.8609    0.445  1.934  0.053
Pclass:SibSp        -0.8109    0.309 -2.627  0.009
Pclass:Parch         0.1700    0.121  1.408  0.159
Pclass:Fare          0.0075    0.007  1.144  0.252
AgeNoMissing:SexNr  -0.0444    0.021 -2.070  0.038
AgeNoMissing:SibSp   0.0031    0.013  0.234  0.815
AgeNoMissing:Parch  -0.0252    0.010 -2.400  0.016
AgeNoMissing:Fare    0.0003    0.000  1.064  0.287
SexNr:SibSp         -0.1358    0.311 -0.437  0.662
SexNr:Parch          0.9007    0.303  2.971  0.003
SexNr:Fare          -0.0122    

Again, dropping all non-significant predictors one by one.

In [157]:
# Predictors removed so far during backward elimination.
dropping = ['SibSp:Parch', 'AgeNoMissing:SibSp', 'AgeNoMissing:Fare', 'SexNr:Fare',
            'SexNr:SibSp', 'Parch:Fare', 'AgeNoMissing', 'Pclass:AgeNoMissing']

# errors='ignore' skips labels that are no longer present, so the cell can be
# re-run after some columns were already dropped, without a bare `except`
# that would also hide unrelated errors.
X_3 = X_3.drop(columns=dropping, errors='ignore')

# Refit on the reduced design matrix and show the coefficient table.
model_3 = sm.Logit(y, X_3)
results = model_3.fit()
print(summarize(results))

Optimization terminated successfully.
         Current function value: 0.412587
         Iterations 8
                      coef  std err      z  P>|z|
intercept           5.6628    0.931  6.085  0.000
Pclass             -1.7635    0.330 -5.351  0.000
Sex[male]          -5.0395    1.069 -4.713  0.000
SibSp               2.1085    0.690  3.056  0.002
Pclass:SexNr        1.1316    0.348  3.250  0.001
Pclass:SibSp       -0.8464    0.230 -3.685  0.000
Pclass:Parch        0.1982    0.097  2.034  0.042
Pclass:Fare         0.0043    0.002  1.753  0.080
AgeNoMissing:SexNr -0.0302    0.011 -2.732  0.006
AgeNoMissing:Parch -0.0271    0.008 -3.489  0.000
SexNr:Parch         0.6612    0.244  2.707  0.007
SibSp:Fare         -0.0085    0.003 -2.997  0.003


Results of Multiple Logistic Regression without interaction terms:

- Significant were Pclass, Sex, Age, SibSp

Results of Multiple Logistic Regression with interaction terms:

- Significant are Pclass, Sex, SibSp, Pclass:SexNr, Pclass:SibSp, Pclass:Parch, AgeNoMissing:SexNr, AgeNoMissing:Parch, SexNr:Parch, SibSp:Fare; Pclass:Fare is kept as well, although it is only significant at the 10% level (p = 0.080)
- AgeNoMissing has become insignificant

## Multiple Logistic Regression with Polynomial Functions