In [73]:
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation/future warnings raised by the libraries imported above.
# Consider narrowing the filter to the specific categories that are just noise.
warnings.filterwarnings('ignore')

## Define the problem

**Given**: Data from the Irish National Household Survey for the year 2015. The data contains information on the respondents' labour status (whether they are unemployed, working, studying, etc.) and a range of variables about their educational and demographic background (such as education level, education field, age, marital status, region of residence, etc.).

**Goal**: Categorise respondents into unemployed/employed based on information about their educational/demographic background. 

    

## Prepare the data (Data Preprocessing)

### Load the data

In [74]:
# The survey extract is expected next to the notebook (relative path).
path_to_file = "QNHS_2015.csv"

# Read the whole CSV into the working frame used by every later cell.
df = pd.read_csv(filepath_or_buffer=path_to_file)


### Get insight into the data (Prepare and clean)

In [75]:
# Report (rows, columns), then preview the first five records.
print(df.shape)

df.head(n=5)


(95986, 11)


Unnamed: 0,mainstat,sex,marstat,national_summary_x1,national_summary_x3,national_summary,hatlevel,hatfield,ageclass,familytype_summary,region
0,7.0,2,2,,1.0,1.0,600.0,,8,2,1
1,1.0,1,2,,1.0,1.0,600.0,,11,2,1
2,9.0,2,1,,1.0,1.0,999.0,9999.0,3,2,1
3,9.0,2,1,,1.0,1.0,999.0,9999.0,2,2,1
4,9.0,2,1,,1.0,1.0,999.0,9999.0,2,2,1


In [76]:
# df.info()
# Count the non-null values of every column for each 'mainstat' code.
# Columns whose count is lower than the others in a row contain missing values.
df.groupby('mainstat').count()

Unnamed: 0_level_0,sex,marstat,national_summary_x1,national_summary_x3,national_summary,hatlevel,hatfield,ageclass,familytype_summary,region
mainstat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,38630,38630,0,38628,38630,37432,37056,38630,38630,38630
2.0,4695,4695,0,4694,4695,4557,4595,4695,4695,4695
3.0,8736,8736,0,8735,8735,8630,8630,8736,8736,8736
4.0,9289,9289,0,9289,9289,9115,9120,9289,9289,9289
5.0,2741,2741,0,2741,2741,2643,2691,2741,2741,2741
7.0,9852,9852,0,9851,9852,9612,9697,9852,9852,9852
8.0,515,515,0,515,515,483,502,515,515,515
9.0,21483,21483,0,21483,21483,21483,21483,21483,21483,21483


In [77]:
# Keep only the rows whose 'mainstat' code is 1 or 2; every other status is dropped.
# NOTE(review): presumably 1/2 are the employed/unemployed codes — confirm against the survey codebook.
mask = df["mainstat"].isin([1, 2]).to_numpy()
df = df.loc[mask]
df

Unnamed: 0,mainstat,sex,marstat,national_summary_x1,national_summary_x3,national_summary,hatlevel,hatfield,ageclass,familytype_summary,region
1,1.0,1,2,,1.0,1.0,600.0,,11,2,1
7,1.0,1,1,,1.0,1.0,100.0,9999.0,13,9,1
8,1.0,1,1,,1.0,1.0,100.0,9999.0,14,9,1
10,1.0,1,2,,1.0,1.0,304.0,0.0,13,1,2
11,1.0,2,1,,1.0,1.0,304.0,0.0,7,3,2
...,...,...,...,...,...,...,...,...,...,...,...
95974,1.0,1,2,,1.0,1.0,700.0,300.0,12,2,2
95979,1.0,1,1,,1.0,1.0,700.0,300.0,7,1,2
95980,1.0,1,1,,1.0,1.0,800.0,500.0,7,1,2
95981,1.0,1,2,,1.0,1.0,100.0,9999.0,15,1,1


In [78]:
# The filter above kept the original row labels, so the index now has gaps.
# Renumber the rows from 0; the old labels are preserved in a new 'index' column.
df = df.reset_index(drop=False)
df.head(n=5)

Unnamed: 0,index,mainstat,sex,marstat,national_summary_x1,national_summary_x3,national_summary,hatlevel,hatfield,ageclass,familytype_summary,region
0,1,1.0,1,2,,1.0,1.0,600.0,,11,2,1
1,7,1.0,1,1,,1.0,1.0,100.0,9999.0,13,9,1
2,8,1.0,1,1,,1.0,1.0,100.0,9999.0,14,9,1
3,10,1.0,1,2,,1.0,1.0,304.0,0.0,13,1,2
4,11,1.0,2,1,,1.0,1.0,304.0,0.0,7,3,2


In [79]:
# reset_index() turned the old row labels into an 'index' column; discard it.
df = df.drop(columns=["index"])

In [80]:
# Look for missing observations: in the summary below, a column whose 'count'
# is lower than the number of rows contains NaN values. Nothing is removed here.
df.describe()
# df.info()


Unnamed: 0,mainstat,sex,marstat,national_summary_x1,national_summary_x3,national_summary,hatlevel,hatfield,ageclass,familytype_summary,region
count,43325.0,43325.0,43325.0,0.0,43322.0,43325.0,41989.0,41651.0,43325.0,43325.0,43325.0
mean,1.108367,1.448609,1.735834,,1.35093,1.349036,420.498893,2353.545485,9.076746,2.980912,1.753122
std,0.310847,0.497358,0.711234,,1.049125,1.042069,177.402597,3963.359885,2.518053,2.602968,0.4312
min,1.0,1.0,1.0,,1.0,1.0,0.0,0.0,4.0,1.0,1.0
25%,1.0,1.0,1.0,,1.0,1.0,304.0,0.0,7.0,2.0,2.0
50%,1.0,1.0,2.0,,1.0,1.0,400.0,440.0,9.0,2.0,2.0
75%,1.0,2.0,2.0,,1.0,1.0,600.0,800.0,11.0,3.0,2.0
max,2.0,2.0,4.0,,6.0,6.0,800.0,9999.0,18.0,9.0,2.0


In [81]:
# From describe(): mainstat, sex and marstat have a full count (no nulls),
# while national_summary_x1 has a count of 0, i.e. it is entirely empty — drop it.
df = df.drop(columns="national_summary_x1")
df



Unnamed: 0,mainstat,sex,marstat,national_summary_x3,national_summary,hatlevel,hatfield,ageclass,familytype_summary,region
0,1.0,1,2,1.0,1.0,600.0,,11,2,1
1,1.0,1,1,1.0,1.0,100.0,9999.0,13,9,1
2,1.0,1,1,1.0,1.0,100.0,9999.0,14,9,1
3,1.0,1,2,1.0,1.0,304.0,0.0,13,1,2
4,1.0,2,1,1.0,1.0,304.0,0.0,7,3,2
...,...,...,...,...,...,...,...,...,...,...
43320,1.0,1,2,1.0,1.0,700.0,300.0,12,2,2
43321,1.0,1,1,1.0,1.0,700.0,300.0,7,1,2
43322,1.0,1,1,1.0,1.0,800.0,500.0,7,1,2
43323,1.0,1,2,1.0,1.0,100.0,9999.0,15,1,1


In [82]:
# Non-null count of every column for each 'national_summary_x3' category.
df.groupby(by='national_summary_x3').count()

Unnamed: 0_level_0,mainstat,sex,marstat,national_summary,hatlevel,hatfield,ageclass,familytype_summary,region
national_summary_x3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,38201,38201,38201,38201,37487,36780,38201,38201,38201
2.0,773,773,773,773,729,735,773,773,773
3.0,700,700,700,700,636,655,700,700,700
4.0,2567,2567,2567,2567,2155,2470,2567,2567,2567
5.0,76,76,76,76,74,70,76,76,76
6.0,1005,1005,1005,1005,905,938,1005,1005,1005


In [83]:
# Preview the first rows of the frame after dropping the empty column.
df.head()

Unnamed: 0,mainstat,sex,marstat,national_summary_x3,national_summary,hatlevel,hatfield,ageclass,familytype_summary,region
0,1.0,1,2,1.0,1.0,600.0,,11,2,1
1,1.0,1,1,1.0,1.0,100.0,9999.0,13,9,1
2,1.0,1,1,1.0,1.0,100.0,9999.0,14,9,1
3,1.0,1,2,1.0,1.0,304.0,0.0,13,1,2
4,1.0,2,1,1.0,1.0,304.0,0.0,7,3,2


In [113]:
# replace NaN values
def replace_NaN_values(column_name, frame=None):
    """Fill the missing values of one column with that column's mode.

    The column is replaced on the frame itself; nothing is returned.
    The mode(s) found and the number of NaN values left after filling
    are printed so the result can be checked in the cell output.

    Parameters
    ----------
    column_name : str
        Name of the column to impute.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used.
    """
    target = df if frame is None else frame

    # mode() can return several values on a tie; the first one is used.
    replace_values = target[column_name].mode()
    print(f'replace_values: {replace_values}')

    # Assign the filled column back explicitly. Calling
    # `frame[col].fillna(..., inplace=True)` works on an intermediate object
    # and does not update the frame when pandas Copy-on-Write is active.
    target[column_name] = target[column_name].fillna(value=replace_values.iloc[0])

    # Remaining NaN count for this column (expected to be 0).
    print(target[column_name].isna().sum())

# Columns whose missing values are imputed with their mode.
# NOTE(review): describe() above shows 'hatfield' also has fewer non-null values
# than rows, and it is not listed here — confirm whether that is intentional.
columns_to_replace = ['national_summary_x3', 'hatlevel']

for column in columns_to_replace:
    replace_NaN_values(column_name=column)




replace_values: 0    1.0
dtype: float64
0
replace_values: 0    304.0
dtype: float64
0


In [114]:
# Scratch example: capitalise every string of a list with map().
letters = ['a', 'b']

# map() is lazy; the list() call below consumes the iterator and shows the result.
l_new = map(str.capitalize, letters)
list(l_new)

['A', 'B']

In [115]:
# Display the frame after mode imputation (pandas truncates the middle rows).
df

Unnamed: 0,mainstat,sex,marstat,national_summary_x3,national_summary,hatlevel,hatfield,ageclass,familytype_summary,region
0,1.0,1,2,1.0,1.0,600.0,,11,2,1
1,1.0,1,1,1.0,1.0,100.0,9999.0,13,9,1
2,1.0,1,1,1.0,1.0,100.0,9999.0,14,9,1
3,1.0,1,2,1.0,1.0,304.0,0.0,13,1,2
4,1.0,2,1,1.0,1.0,304.0,0.0,7,3,2
...,...,...,...,...,...,...,...,...,...,...
43320,1.0,1,2,1.0,1.0,700.0,300.0,12,2,2
43321,1.0,1,1,1.0,1.0,700.0,300.0,7,1,2
43322,1.0,1,1,1.0,1.0,800.0,500.0,7,1,2
43323,1.0,1,2,1.0,1.0,100.0,9999.0,15,1,1


#### Clean

#### Visualize with Seaborn

In [None]:
# Regression pair plots of the labour status against a few background variables.
# The previous version referenced 'TV', 'radio', 'newspaper' and 'sales', which
# are not columns of this dataset, and used the `size` keyword, which seaborn
# replaced with `height`.
# NOTE(review): the chosen x-variables are coded categories, so the fitted lines
# are only a rough visual aid — adjust the selection as needed.
sns.set(style="ticks", color_codes=True)
sns.pairplot(df,
             x_vars=['hatlevel', 'ageclass', 'marstat'],
             y_vars=['mainstat'],
             # no `palette`: it only applies when a `hue` variable is given
             markers=["o"],
             plot_kws={'line_kws': {'color': '#FFAAAA'},
                       'scatter_kws': {'facecolors': '#AAFFAA'}},
             height=5, aspect=1, kind='reg')


In [None]:
# show correlations
# Pairwise correlation matrix of the columns of df.
df.corr()

In [None]:
# Draw the correlation matrix as an annotated heatmap.
correlations = df.corr()
sns.heatmap(data=correlations, annot=True, cmap="Reds")


## Select features

Based on the data inspection above, select the feature columns (`X`) and the target column (`y`) for the model.

In [None]:
# Features: educational/demographic columns of the survey frame.
# The previous selection ('TV', 'newspaper', 'radio' -> 'sales') referenced
# columns that do not exist in this dataset and raised a KeyError.
# 'hatfield' is left out because it still contains NaN values (it is not among
# the imputed columns).
# NOTE(review): 'national_summary_x3' is left out as it looks near-identical to
# 'national_summary' in describe() — confirm.
feature_columns = ['sex', 'marstat', 'national_summary', 'hatlevel',
                   'ageclass', 'familytype_summary', 'region']
X = df[feature_columns]

# Target: labour status, restricted earlier to the codes 1 and 2.
y = df['mainstat']
print(y.shape)

## Separate the training data from the test data

In [None]:
# Hold out a test partition; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Preview the first five entries of every partition.
train_preview = (f'X_train: {X_train[:5]}\n', f'y_train: {y_train[:5]}\n')
test_preview = (f'X_test: {X_test[:5]}\n', f'y_test: {y_test[:5]}\n')
print(*train_preview)
print(*test_preview)

## Choose the model

We saw some linear correlation between 'TV' ads and sales, that's why we are going to start with LinearRegression.

In [None]:
from sklearn.linear_model import LinearRegression

## Train the model (fit the model)

In [None]:
# instantiate the model
# NOTE(review): the goal stated at the top of the notebook is a two-class
# categorisation (employed/unemployed), while this is a regression estimator —
# confirm the model choice.
model = LinearRegression()

In [None]:
# Fit the model on the training partition; the result is kept as `fitted`
# and used by the following cells for inspection and prediction.
fitted = model.fit(X_train,y_train)

In [None]:
# Show what the model learned: first the intercept, then the coefficients.
for learned in (fitted.intercept_, fitted.coef_):
    print(learned)

### Interpret the coefficients

## Predict (classify unknown input sample)

In [None]:
# Predict the target for the held-out test features.
y_pred = fitted.predict(X_test)

## Evaluate the model

In [None]:
from sklearn import metrics

In [None]:
# get metrics:
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE computed above (no need to recompute it).
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_test, y_pred)

# print it
# The labels carry no colon of their own: the format string already adds one,
# so every line is rendered consistently as "<label padded to 10>: <value>".
for label, value in (('MAE', mae), ('MSE', mse), ('RMSE', rmse), ('r2_score', r2)):
    print('{:10s}: {}'.format(label, value))