 # DATA ANALYSIS


In [676]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

We load the data, splitting it into features and target variable.

In [677]:
# Read the raw survey responses, then separate the target column ("Smoking")
# from the remaining feature columns.
data = pd.read_csv("data/responses.csv")
y = data["Smoking"]
x = data.drop(columns=["Smoking"])

Quick look at how the data is composed.

In [678]:
# Print a summary of the feature frame: index range, number of columns, dtypes and memory usage.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Columns: 149 entries, Music to House - block of flats
dtypes: float64(134), int64(5), object(10)
memory usage: 1.1+ MB


In [679]:
# Summarise only the non-numeric columns (count, unique, top, freq for each).
x.describe(exclude=np.number)

Unnamed: 0,Alcohol,Punctuality,Lying,Internet usage,Gender,Left - right handed,Education,Only child,Village - town,House - block of flats
count,1005,1008,1008,1010,1004,1007,1009,1008,1006,1006
unique,3,3,4,4,2,2,6,2,2,2
top,social drinker,i am always on time,sometimes,few hours a day,female,right handed,secondary school,no,city,block of flats
freq,659,399,549,744,593,906,621,754,707,595


# Encoding of categorical variables

We transform categorical variables into numerical ones.

In [680]:
# Ordered answer levels for each ordinal survey question. The position of a
# level inside its list is the integer code that replaces it.
ordinal_levels = {
    "Alcohol": ["never", "social drinker", "drink a lot"],
    "Punctuality": ["i am often early", "i am always on time", "i am often running late"],
    "Lying": ["never", "sometimes", "only to avoid hurting someone", "everytime it suits me"],
    "Internet usage": [
        "no time at all",
        "less than an hour a day",
        "few hours a day",
        "most of the day",
    ],
    "Education": [
        "currently a primary school pupil",
        "primary school",
        "secondary school",
        "college/bachelor degree",
        "masters degree",
        "doctorate degree",
    ],
}

# Series.map leaves NaN wherever an answer is missing or is not one of the listed levels.
for column_name, ordered_levels in ordinal_levels.items():
    level_codes = {level: code for code, level in enumerate(ordered_levels)}
    x[column_name] = x[column_name].map(level_codes)

In [681]:
# Record which columns are still text-valued (dtype object) before encoding them.
cat_var_mask = x.dtypes.eq(object)
cat_var = x.columns[cat_var_mask]

# One-hot encode the remaining text columns (one indicator column per category).
# NOTE(review): with the default dummy_na=False a missing answer presumably becomes
# all-zero indicators instead of NaN, so the later imputation never sees it — confirm.
x = pd.get_dummies(x)

# Missing values

Let's start from the target. How many values are missing?

In [682]:
# Number of respondents whose smoking answer is missing.
y.isna().sum()

8

In the rows with a missing value for target, what other values are missing?

In [683]:
# For every respondent without a smoking answer, list the feature columns that are empty too.
rows_without_target = x[y.isnull()]
for index, row in rows_without_target.iterrows():
    missing_columns = row.index[row.isnull()]
    # print() joins its two arguments with a single space and ends the line,
    # which yields the same text as printing the pieces one by one.
    print("Index: %d - Missing: " % index, "".join(name + " - " for name in missing_columns))

Index: 130 - Missing:  Documentary - Alcohol - 
Index: 149 - Missing:  
Index: 525 - Missing:  Fun with friends - Friends versus money - Hypochondria - 
Index: 567 - Missing:  Documentary - Biology - Giving - Charity - Spending on looks - 
Index: 629 - Missing:  Alcohol - Healthy eating - 
Index: 708 - Missing:  Alcohol - Healthy eating - 
Index: 763 - Missing:  Geography - 
Index: 980 - Missing:  Passive sport - 


We delete the rows with missing values in target.

In [684]:
# Keep only the respondents that have a smoking answer, in both features and target.
has_target = y.notnull()
x = x.loc[has_target]
y = y.loc[has_target]
print("Missing values in target: %d" % y.isnull().sum())

Missing values in target: 0


In [685]:
# Features and target must still hold the same number of rows.
len(x) == len(y)

True

Now we map each smoking category to a numeric code.

In [686]:
# Numeric code for each smoking category; any answer not listed here would become NaN.
# NOTE(review): the codes jump from 1 to 3 (nothing is mapped to 2) — confirm the gap is intended.
smoker_mapping = {
    "never smoked": 0,
    "tried smoking": 1,
    "former smoker": 3,
    "current smoker": 4,
}
y = y.map(smoker_mapping)
print(y.value_counts())

1    430
0    208
4    189
3    175
Name: Smoking, dtype: int64


Let's check how many missing values we have in each feature.

In [687]:
# Missing-value count for every feature column, largest first.
x.isna().sum().sort_values(ascending=False)

Height                                   20
Weight                                   20
Passive sport                            14
Chemistry                                10
Punk                                      8
Latino                                    8
Geography                                 8
Theatre                                   8
Compassion to animals                     7
Final judgement                           7
Countryside, outdoors                     7
Gardening                                 7
Criminal damage                           7
Classical music                           7
Age                                       7
Reggae, Ska                               7
Rock n roll                               7
Alternative                               7
Daily events                              7
Techno, Trance                            7
Art exhibitions                           6
Reading                                   6
Writing                         

Are there respondents who did not fill in the questionnaire thoroughly?

In [688]:
# Number of unanswered questions per respondent (one count per row of x).
# Vectorised replacement for the former row-wise apply; it also removes a lambda
# whose parameter shadowed the frame `x`. The resulting counts are the same.
miss = x.isnull().sum(axis=1)
miss.describe()

count    1002.000000
mean        0.563872
std         1.108685
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         8.000000
dtype: float64

Missing values may indicate questionnaires that were filled in quickly and carelessly. Since the number of questionnaires with more than 2 missing answers is relatively low, we remove them.

In [689]:
# Keep only the respondents with at most two unanswered questions.
few_missing = miss.le(2)
x = x.loc[few_missing]
y = y.loc[few_missing]
x.shape

(944, 154)

In [690]:
# Missing-value count per feature column after removing the incomplete rows, largest first.
x.isna().sum().sort_values(ascending=False)

Height                                   11
Weight                                   10
Passive sport                             8
Criminal damage                           6
Geography                                 6
Chemistry                                 6
Theatre                                   6
Classical music                           5
Punk                                      5
Compassion to animals                     5
Daily events                              5
Gardening                                 5
Workaholism                               4
Latino                                    4
Writing                                   4
Techno, Trance                            4
Spiders                                   4
Final judgement                           4
Art exhibitions                           4
Pets                                      4
Alternative                               4
Rock n roll                               4
Rock                            

There are still a lot of missing values, so we will try to impute them.
We replace each missing value with the most frequent value in its column.

In [691]:
# Imputer that replaces NaN entries with the most frequent value of their column.
imp = SimpleImputer(strategy="most_frequent", missing_values=np.nan)

In [692]:
# Fit the imputer and fill the gaps, then rebuild a DataFrame that keeps the
# original row and column labels.
filled_values = imp.fit_transform(x, y)
x = pd.DataFrame(filled_values, index=x.index, columns=x.columns)
x.isna().sum().sort_values(ascending=False)

House - block of flats_house/bungalow    0
Religion                                 0
Gardening                                0
Active sport                             0
Passive sport                            0
Writing                                  0
Musical instruments                      0
Dancing                                  0
Countryside, outdoors                    0
Art exhibitions                          0
Economy Management                       0
Cars                                     0
Law                                      0
Medicine                                 0
Foreign languages                        0
Geography                                0
Reading                                  0
Chemistry                                0
Celebrities                              0
Shopping                                 0
Science and technology                   0
Theatre                                  0
Healthy eating                           0
Alcohol    

Now we save the cleaned data into a file, ready to be used for modeling.

In [693]:
# Attach the target as a "Smoking" column and write the cleaned dataset to disk.
x = x.assign(Smoking=y)
x.to_csv("data/clean_data.csv")