# Pandas morning warmup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Exercise 1
- load `../data/titanic-train.csv` into a variable called `df`
- drop the 2 records with a missing value in `Embarked`
- create a new feature called `Fare2` that is an integer version of `Fare`
- plot the distribution of `Fare2` using a histogram
- create a new feature called `Fare3` that is the logarithm of `Fare` (you may have to add a small quantity to avoid log of zero)
- plot the distribution of `Fare3` using a histogram
- create a column called `Male` that is 1 if `Sex` is `male` and 0 otherwise
- create dummy columns for `Pclass` and join them back to the main df
- fill missing age data with one of these strategies
    - fixed value
    - mean value of age
    - random age data based on existing age's mean and std
- count how many passengers paid a fare between 10 and 50
- create a new categorical variable for `Fare2` with 3 buckets:
    - Fare<=10
    - Fare11to50
    - Fare51+
- convert it to dummy columns and join it to the dataframe
- create a new feature for the presence of Family combining the information present in SibSp and in Parch. If a person has a SibSp or a Parch then he/she has a Family
- save your final file to a local file in any format that is not csv (json, hdf5, excel, ...)

In [None]:
# Load the Titanic training set: the first CSV row supplies the column names
# and no column is used as the index.
df = pd.read_csv(
    '../data/titanic-train.csv',
    header=0,
    index_col=None,
)

In [None]:
# Drop the records whose `Embarked` value is missing.
# `.copy()` makes the filtered frame an independent object, so the column
# assignments in the following cells operate on a real DataFrame and do not
# trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['Embarked']).copy()

In [None]:
# Fare2: `Fare` cast to an integer dtype (the cast drops the fractional part)
df['Fare2'] = df.Fare.astype(int)

In [None]:
# Histogram of the integer fare (Fare2)
df[['Fare2']].plot.hist()

In [None]:
# Fare3: natural logarithm of Fare. 0.1 is added first so that a fare of 0
# does not produce log(0).
df['Fare3'] = np.log(df['Fare']+0.1)

In [None]:
# Plot the distribution of Fare3 (log fare) using a histogram with 15 bins
df['Fare3'].plot(kind='hist', bins=15)

In [None]:
# Male: 1 where `Sex` equals 'male', 0 for every other value
is_male = df['Sex'] == 'male'
df['Male'] = is_male.astype('int64')

In [None]:
# One indicator column per distinct `Pclass` value (named Pclass_<value>),
# attached to df by index alignment
pclass_dummies = pd.get_dummies(df['Pclass'], prefix='Pclass')
df = df.join(pclass_dummies)

In [None]:
# Age2: `Age` with missing values replaced by random draws from a normal
# distribution parameterised by the mean and std of the observed ages.
rng = np.random.RandomState(1)   # fixed seed -> reproducible imputed values

# Computed once up front instead of once per missing row.
age_mean = df['Age'].mean()
age_std = df['Age'].std()
age_missing = df['Age'].isnull()

df['Age2'] = df['Age']
df.loc[age_missing, 'Age2'] = rng.normal(age_mean, age_std,
                                         size=int(age_missing.sum()))

# A normal draw can be negative, which is not a valid age: clamp at 0 before
# truncating to whole years.
df['Age2'] = df['Age2'].clip(lower=0).astype(int)

In [None]:
# Number of passengers whose fare lies in the closed interval [10, 50]
df.loc[df['Fare'].between(10, 50), 'Fare2'].count()

In [None]:
# Bucket Fare2 (the integer fare) into three ordered categories:
#   Fare<=10    -> Fare2 <= 10
#   Fare11to50  -> 10 < Fare2 <= 50
#   Fare51+     -> Fare2 > 50
# then convert the buckets to dummy columns and join them to the dataframe.
fare_bins = [-np.inf, 10, 50, np.inf]   # right-closed: (-inf, 10], (10, 50], (50, inf)
fare_labels = ['Fare<=10', 'Fare11to50', 'Fare51+']
df['Fare4'] = pd.cut(df['Fare2'], bins=fare_bins, labels=fare_labels)

# The dummies are prefixed with the column they come from ('Fare4') rather
# than 'Fare3', which is the name of the unrelated log-fare column.
fare_dummies = pd.get_dummies(df['Fare4'], prefix='Fare4')
df = df.join(fare_dummies)


In [None]:
# Family: True when at least one of `SibSp` / `Parch` is greater than zero,
# i.e. the passenger has a SibSp or a Parch.
df['Family'] = df[['SibSp', 'Parch']].gt(0).any(axis=1)

In [None]:
# Preview the first rows of the dataframe with the engineered columns.
# NOTE(review): the last step of Exercise 1 (saving the result to a local
# non-CSV file) has no corresponding cell in this notebook.
df.head()

## Exercise 2

- reload the churn dataset (`../data/churn.csv`)
- assign the Churn column to a variable called `y`
- separate numerical columns like we did yesterday
- convert the remaining categorical columns to booleans using `pd.get_dummies`
- compare the score of a classification using only the numerical columns VS numerical + dummies (for this you'll have to do a train/test split)
- how much do the dummies contribute to the score? Lots or little?

In [None]:
# `df` is rebound here: from this cell on it holds the churn dataset.
df = pd.read_csv('../data/churn.csv')

# Boolean target: True where the `Churn` column equals 'Yes'
y = df['Churn'].eq('Yes')

# All remaining columns are candidate features
features = df.drop(columns=['Churn'])

In [None]:
# Split the feature columns by dtype: numeric columns are used as-is,
# object-typed columns are expanded into dummy columns.
numerical = features.select_dtypes(include=['number'])
categorical_data = features.select_dtypes(include=['object'])
dummies = pd.get_dummies(categorical_data)

# Numerical + dummy columns side by side (same rows, columns concatenated)
all_features = pd.concat((numerical, dummies), axis=1)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# A single call splits both feature sets and the target together, so the
# numerical-only and all-features frames share the same train/test rows.
# 30% of the rows go to the test set; the split is seeded.
splits = train_test_split(
    numerical, all_features, y,
    test_size=0.3,
    random_state=42,
)
Xn_train, Xn_test, Xa_train, Xa_test, y_train, y_test = splits

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Compare the test score of each classifier when trained on the numerical
# columns only versus numerical + dummy columns.
# The tree is given a fixed random_state so its score is reproducible from
# run to run, in line with the seeded train/test split.
for model in [LogisticRegression(), DecisionTreeClassifier(random_state=42)]:
    print(model)

    # Fit and score on the numerical columns only
    model.fit(Xn_train, y_train)
    sn = model.score(Xn_test, y_test)
    print("Score on numerical: %0.2f" % sn)

    # Refit the same estimator on numerical + dummy columns
    model.fit(Xa_train, y_train)
    sa = model.score(Xa_test, y_test)
    print("Score on all features: %0.2f" % sa)

    # Change of the all-features score relative to the numerical-only score
    print("Percentage improvement: %0.2f %%" % (100 * (sa-sn)/sn))
    print()

*Copyright &copy; 2017 CATALIT LLC.  All rights reserved.*