In [1]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

### Options

In [2]:
pd.options.display.max_rows = 999

### Reading the data

In [3]:
data = pd.read_csv('train.csv')

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Analyzing the data

In [5]:
data.describe(include="all")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


### Mapping columns

<h4>Converting sex column to 0-female and 1-male</h4>

In [6]:
data['Sex'] = data['Sex'].map({
    'male': 1,
    'female': 0
})

<h4>Converting Embarked column S-0, C-1, Q-2</h4>

In [7]:
data['Embarked'] = data['Embarked'].map({
    'S': 0,
    'C': 1,
    'Q': 2
})

In [8]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,0.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1.0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,0.0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,0.0


### Checking for VIF or variance inflation factor

<h4>Removing unnecessary columns</h4>

In [9]:
columns_for_vif = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [10]:
data_to_vif = data[columns_for_vif] 

In [11]:
data_to_vif.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,0.0
1,1,1,0,38.0,1,0,71.2833,1.0
2,1,3,0,26.0,0,0,7.925,0.0
3,1,1,0,35.0,1,0,53.1,0.0
4,0,3,1,35.0,0,0,8.05,0.0


<h4>Removing NaN rows from data to vif</h4>

In [12]:
data_to_vif.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [13]:
data_to_vif = data_to_vif.dropna()

In [14]:
data_to_vif.shape

(712, 8)

<h4>Variance Inflation Factor Algorithm</h4>

In [15]:
vif_list = []
number_of_columns_vif = data_to_vif.shape[1]

In [16]:
for i in range(number_of_columns_vif):
    vif_list.append(variance_inflation_factor(data_to_vif.values, i))

<h4>Creating a DataFrame table to see clearly</h4>

In [17]:
df_vif = pd.DataFrame({
    'Features': data_to_vif.columns.tolist(),
    'Vif': vif_list
})

In [18]:
df_vif.sort_values('Vif', ascending=False)

Unnamed: 0,Features,Vif
1,Pclass,4.910167
3,Age,3.99661
2,Sex,3.514574
0,Survived,1.956778
6,Fare,1.945074
4,SibSp,1.623379
5,Parch,1.617232
7,Embarked,1.298661


### Building Linear Regression

<h4>Defining inputs and output</h4>

In [19]:
inputs = ['Pclass', 'Age', 'Sex', 'Fare', 'SibSp', 'Parch', 'Embarked']
output = 'Survived'

<h4>Defining x, y</h4>

In [20]:
data_regression = data_to_vif

In [21]:
x = data_regression[inputs]
y = data_regression[output]

<h4>Building a linear regression</h4>

In [22]:
reg = LinearRegression()
reg.fit(x, y)

<h4>Gaining the datas, coefficients, r-square, intercept</h4>

In [23]:
coefs = reg.coef_
r = reg.score(x, y)
intercept = reg.intercept_

<h4>Calculating adjusted R-squared</h4>

In [24]:
n = data_regression.shape[0]
p = data_regression.shape[1]

In [25]:
adjR = 1-(1-r)*(n-1)/(n-p-1)

<h4>Creating a DataFrame for the information that we gained</h4>

In [26]:
df_summary = pd.DataFrame({
    'Features': ['Pclass', 'Age', 'Sex', 'Fare', 'SibSp', 'Parch', 'Embarked', 'Intercept', 'R-square', 'Adjusted-R'],
    'Constants': [coefs[0], coefs[1], coefs[2], coefs[3], coefs[4], coefs[5], coefs[6], intercept, r, adjR]
})

In [27]:
df_summary

Unnamed: 0,Features,Constants
0,Pclass,-0.193754
1,Age,-0.006563
2,Sex,-0.485292
3,Fare,0.000265
4,SibSp,-0.053269
5,Parch,-0.010863
6,Embarked,0.022538
7,Intercept,1.358857
8,R-square,0.398956
9,Adjusted-R,0.392116


<h4>Our mathematical model</h4>

$y = 1.358857 -0.193754(Pclass) - 0.006563(Age) - 0.485292(Sex) + 0.000265(Fare) - 0.053269(SibSp) - 0.010863(Parch) + 0.022538(Embarked)$

### Predicting the data

<h4>Loading testing data</h4>

In [28]:
raw_test_data = pd.read_csv('test.csv')

<h4>Preparing the data</h4>

In [29]:
test_data = raw_test_data[inputs]

In [30]:
test_data.head()

Unnamed: 0,Pclass,Age,Sex,Fare,SibSp,Parch,Embarked
0,3,34.5,male,7.8292,0,0,Q
1,3,47.0,female,7.0,1,0,S
2,2,62.0,male,9.6875,0,0,Q
3,3,27.0,male,8.6625,0,0,S
4,3,22.0,female,12.2875,1,1,S


In [31]:
test_data['Embarked'] = test_data['Embarked'].map({
    'S': 0,
    'C': 1,
    'Q': 2
})

test_data['Sex'] = test_data['Sex'].map({
    'male': 1,
    'female': 0
})

test_data = test_data.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Embarked'] = test_data['Embarked'].map({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Sex'] = test_data['Sex'].map({


<h4>Predicting the data</h4>

In [32]:
predictions = reg.predict(test_data)
predictions = predictions.round()

In [33]:
test_data['Survived (Prediction)'] = predictions

In [34]:
test_data

Unnamed: 0,Pclass,Age,Sex,Fare,SibSp,Parch,Embarked,Survived (Prediction)
0,3,34.5,1,7.8292,0,0,2,0.0
1,3,47.0,0,7.0,1,0,0,0.0
2,2,62.0,1,9.6875,0,0,2,0.0
3,3,27.0,1,8.6625,0,0,0,0.0
4,3,22.0,0,12.2875,1,1,0,1.0
5,3,14.0,1,9.225,0,0,0,0.0
6,3,30.0,0,7.6292,0,0,2,1.0
7,2,26.0,1,29.0,1,1,0,0.0
8,3,18.0,0,7.2292,0,0,1,1.0
9,3,21.0,1,24.15,2,0,0,0.0
