 # DATA ANALYSIS


In [151]:
import pandas as pd
import numpy as np

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

### Acquiring data

In [152]:
# Load the raw survey responses; the path is relative to the notebook's working directory.
data = pd.read_csv("data/responses.csv")

Quick look at how the data is composed.

In [153]:
# Summarise the frame: number of rows, number of columns and dtype breakdown.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Columns: 150 entries, Music to House - block of flats
dtypes: float64(134), int64(5), object(11)
memory usage: 1.2+ MB


### Target variable

Our target variable is "Smoking". We check its properties.

In [154]:
# `y` refers to the target column as loaded, i.e. before any rows are dropped
# or any values are encoded by the cells further down.
y = data["Smoking"]
y.describe()

count              1002
unique                4
top       tried smoking
freq                430
Name: Smoking, dtype: object

How many missing values does it have?

In [155]:
# Count the respondents who left the target question unanswered.
print("Number of missing values in target: %d" % y.isna().sum())

Number of missing values in target: 8


In the rows with a missing value for the target variable "Smoking", what values are missing?

In [156]:
# For every row without a target, list the columns it leaves blank and keep a
# per-column tally of how often each one is blank among those rows.
values_missing = {}
rows_without_target = data[y.isnull()]
for row_label, answers in rows_without_target.iterrows():
    print("Index: %d - Missing: " % row_label, end=" ")
    blank_columns = answers[answers.isnull()].index
    for name in blank_columns:
        print(name, end=" - ")
        values_missing[name] = values_missing.get(name, 0) + 1
    print()
print("-" * 40)
# Most frequently blank columns first; ties keep first-seen order (stable sort).
ranked = sorted(values_missing.items(), key=lambda item: item[1], reverse=True)
for name, count in ranked:
    print("%s: %d" % (name, count))

Index: 130 - Missing:  Documentary - Smoking - Alcohol - 
Index: 149 - Missing:  Smoking - 
Index: 525 - Missing:  Fun with friends - Smoking - Friends versus money - Hypochondria - 
Index: 567 - Missing:  Documentary - Biology - Smoking - Giving - Charity - Spending on looks - 
Index: 629 - Missing:  Smoking - Alcohol - Healthy eating - 
Index: 708 - Missing:  Smoking - Alcohol - Healthy eating - 
Index: 763 - Missing:  Geography - Smoking - 
Index: 980 - Missing:  Passive sport - Smoking - 
----------------------------------------
Smoking: 8
Alcohol: 3
Documentary: 2
Healthy eating: 2
Fun with friends: 1
Friends versus money: 1
Hypochondria: 1
Biology: 1
Giving: 1
Charity: 1
Spending on looks: 1
Geography: 1
Passive sport: 1


Interesting to notice:
- Three of the people who chose not to indicate their smoking habits also avoided talking about alcohol consumption.
- Two of them also avoided talking about their eating habits ('Healthy eating').

We remove the rows with missing values for the target.

In [157]:
# Drop every respondent whose target is missing. Filtering on the column itself
# (instead of on the `y` snapshot taken in an earlier cell) removes the hidden
# dependency on that cell and keeps this one safe to re-run.
data = data.dropna(subset=["Smoking"])
print("Number of missing values in target: %d" % data["Smoking"].isnull().sum())

Number of missing values in target: 0


### Encoding of categorical variables

We transform categorical variables into numerical ones.

In [158]:
# Summarise only the non-numeric columns, i.e. the ones that still need encoding.
data.describe(exclude=np.number)

Unnamed: 0,Smoking,Alcohol,Punctuality,Lying,Internet usage,Gender,Left - right handed,Education,Only child,Village - town,House - block of flats
count,1002,1000,1000,1000,1002,996,999,1001,1000,998,998
unique,4,3,3,4,4,2,2,6,2,2,2
top,tried smoking,social drinker,i am always on time,sometimes,few hours a day,female,right handed,secondary school,no,city,block of flats
freq,430,658,396,544,737,589,899,617,747,702,590


In [159]:
# Integer code for each answer of every remaining categorical column,
# keyed by column name.
category_codes = {
    "Alcohol": {"never": 0, "social drinker": 1, "drink a lot": 2},
    "Punctuality": {
        "i am often early": 0,
        "i am always on time": 1,
        "i am often running late": 2,
    },
    "Lying": {
        "never": 0,
        "sometimes": 1,
        "only to avoid hurting someone": 2,
        "everytime it suits me": 3,
    },
    "Internet usage": {
        "no time at all": 0,
        "less than an hour a day": 1,
        "few hours a day": 2,
        "most of the day": 3,
    },
    "Gender": {"male": 0, "female": 1},
    "Left - right handed": {"left handed": 0, "right handed": 1},
    "Education": {
        "currently a primary school pupil": 0,
        "primary school": 1,
        "secondary school": 2,
        "college/bachelor degree": 3,
        "masters degree": 4,
        "doctorate degree": 5,
    },
    "Only child": {"no": 0, "yes": 1},
    "Village - town": {"village": 0, "city": 1},
    "House - block of flats": {"house/bungalow": 0, "block of flats": 1},
}

# Replace each answer by its code; answers that are missing or not listed
# above come out as NaN.
for column, codes in category_codes.items():
    data[column] = data[column].map(codes)

We try to determine whether a person is, or has been, a smoker at some point in their life (having only "tried smoking" does not count).

In [160]:
# Target variable
# Binary target: 1 = former or current smoker, 0 = never smoked or only tried.
smoking_codes = {
    "never smoked": 0,
    "tried smoking": 0,
    "former smoker": 1,
    "current smoker": 1,
}
data["Smoking"] = data["Smoking"].map(smoking_codes)

### Correlation

What are the most heavily correlated features?

In [161]:
# Pairwise correlation between all (now numeric) columns.
corr = data.corr()

# Report every strongly correlated pair (|r| > 0.5) exactly once by scanning
# only the upper triangle of the matrix. `iat` is purely positional, so it does
# not rely on the deprecated integer-key fallback of `corr.iloc[row][col]`.
n_features = corr.shape[0]
for row in range(n_features):
    for col in range(row + 1, n_features):
        r = corr.iat[row, col]
        if abs(r) > 0.5:
            print("%s & %s: %.4f" % (corr.index[row], corr.index[col], r))

Classical music & Opera: 0.5948
Rock & Metal or Hardrock: 0.5290
Rock & Punk: 0.5081
Metal or Hardrock & Punk: 0.5446
Horror & Thriller: 0.5055
Fantasy/Fairy tales & Animated: 0.6757
Mathematics & Physics: 0.5935
Biology & Chemistry: 0.6901
Biology & Medicine: 0.7164
Chemistry & Medicine: 0.6260
Art exhibitions & Theatre: 0.5345
Religion & God: 0.5112
Shopping & Shopping centres: 0.6490
Shopping & Spending on looks: 0.5075
Storm & Darkness: 0.5066
Snakes & Rats: 0.5723
Fear of public speaking & Public speaking: 0.5068
Life struggles & Gender: 0.5678
Age & Education: 0.6179
Height & Weight: 0.6957
Height & Gender: -0.6850
Weight & Gender: -0.6443
Village - town & House - block of flats: 0.6031


Which features are more correlated with our target variable, Smoking?

In [162]:
# Rank every column by its signed correlation with the target, largest first.
corr_target = corr["Smoking"].sort_values(ascending=False)
# Keep only the columns whose absolute correlation exceeds 0.1.
relevant_features = corr_target[corr_target.abs() > 0.1]
relevant_features

Smoking                   1.000000
Alcohol                   0.272063
Entertainment spending    0.185594
Cheating in school        0.174262
Hiphop, Rap               0.150029
Horror                    0.138637
Criminal damage           0.125700
Number of friends         0.119906
Spending on looks         0.117810
Age                       0.106776
Comedy                   -0.105114
God                      -0.105399
Classical music          -0.107466
Chemistry                -0.107610
Thinking ahead           -0.113683
Workaholism              -0.118797
Mathematics              -0.121718
Parents' advice          -0.129236
Finances                 -0.173211
Name: Smoking, dtype: float64

A lot of interesting things emerge from this table.

We can observe that 'Alcohol' has the highest correlation value (0.27). 'Cheating in school', rap culture ('Hiphop, Rap'), appearances ('Spending on looks') and crime ('Criminal damage') also have a positive correlation.

On the other hand, listening to parental advice ('Parents' advice'), caring about finances ('Finances') and 'God' have a negative correlation.

#### An odd fact

While looking at these numbers, I noticed something odd. It is common knowledge that men are usually more inclined to smoke than women, but this does not show in our dataset.

In [163]:
# Correlation between gender (0 = male, 1 = female) and the smoking target.
corr_target.loc["Gender"]

0.02263756948527801

Actually, 'Gender' is only 60th in the ranking (position 59 counting from zero, with 'Smoking' itself in first place)!

In [164]:
# Zero-based position of 'Gender' in the descending (signed) ranking;
# 'Smoking' itself sits at position 0.
corr_target.index.get_loc("Gender")

59

After some research, I found out that in Slovakia, the country where the survey that generated the dataset took place, [39.7%](https://tradingeconomics.com/slovakia/smoking-prevalence-males-percent-of-adults-wb-data.html) of the men were smokers during 2015, while only [17.6%](https://tradingeconomics.com/slovakia/smoking-prevalence-females-percent-of-adults-wb-data.html) of females were smokers.

Are those percentages respected in our dataset?

In [165]:
# Share of smokers (Smoking == 1) per gender code: 0 = male, 1 = female.
# The column is selected before aggregating so that only 'Smoking' is averaged
# instead of every column in the frame.
group = data.groupby("Gender")["Smoking"].mean()
print(group)

Gender
0.0    0.351351
1.0    0.373514
Name: Smoking, dtype: float64


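In our dataset 35% of the men are (current or former) smokers, which is roughly on par with the national average for men. But 37% of the women are smokers too, more than twice the national figure of 17.6%, so the gender gap seen nationally does not appear in this sample. Note that our target also counts former smokers, so the figures are not strictly comparable.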

### Missing values

How many missing values do we still have?

In [166]:
# Total number of missing cells in the whole frame (per-column counts, then summed).
data.isnull().sum().sum()

584

There are still a lot of missing values, so we will try to impute them.
We replace each missing value with the most frequent value of its column.

In [167]:
# Imputer that fills every NaN with the most frequent value of its column.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [168]:
# Fit the imputer and fill the gaps; the transformer hands back a bare array,
# so the original row index and column names are re-attached afterwards.
imputed_values = imp.fit_transform(data)
data = pd.DataFrame(imputed_values, index=data.index, columns=data.columns)
# Sanity check: no missing cell should be left.
data.isnull().sum().sum()

0

### Binning of continuous variables

In [169]:
# Continuous variable bins
data['AgeBin'] = pd.cut(data['Age'], 5)
data['HeightBin'] = pd.qcut(data['Height'].astype(int), 5)
data['WeightBin'] = pd.qcut(data['Weight'].astype(int), 5)

# Convert objects to categories
data['Age'] = LabelEncoder().fit_transform(data['AgeBin'])
data['Height'] = LabelEncoder().fit_transform(data['HeightBin'])
data['Weight'] = LabelEncoder().fit_transform(data['WeightBin'])

# Drop unnecessary features
drop_features = ['AgeBin', 'HeightBin', 'WeightBin']
data = data.drop(drop_features, axis=1)

Now we save the cleaned data into a file, ready to be used for modeling.

In [170]:
# Optional feature selection, currently disabled: uncommenting the next line
# would keep only the columns listed in `relevant_features` (target included).
# data = data[relevant_features.index]
# Persist the cleaned frame. With pandas' default settings the row index is
# written as the first, unnamed column of the CSV.
data.to_csv("data/clean_data.csv")