In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
# Load the dataset
df = pd.read_csv("COVID-19BehaviorData_CAN2022.csv", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6430 entries, 0 to 6429
Columns: 512 entries, RecordNo to future_3
dtypes: float64(1), int64(3), object(508)
memory usage: 25.1+ MB


In [4]:
# remove empty columns
df1 = df
# strip all white space for string values
df1 = df1.map(lambda x: x.strip() if isinstance(x, str) else x)
# replace all empty strings to NaN
df1.replace('', np.nan, inplace=True)
# replace all __NA__ strings to NaN
df1.replace('__NA__', np.nan, inplace=True)
# drop columns with only NaN values
df_cleaned = df1.dropna(axis=1, thresh = len(df1)*0.55)
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6430 entries, 0 to 6429
Data columns (total 68 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RecordNo            6430 non-null   int64  
 1   endtime             6430 non-null   object 
 2   qweek               6430 non-null   object 
 3   i2_health           4423 non-null   object 
 4   i9_health           6430 non-null   object 
 5   i11_health          6430 non-null   object 
 6   i12_health_1        6430 non-null   object 
 7   i12_health_2        5429 non-null   object 
 8   i12_health_3        6430 non-null   object 
 9   i12_health_4        6430 non-null   object 
 10  i12_health_5        6430 non-null   object 
 11  i12_health_6        6430 non-null   object 
 12  i12_health_7        6430 non-null   object 
 13  i12_health_8        6430 non-null   object 
 14  i12_health_11       5429 non-null   object 
 15  i12_health_12       6430 non-null   object 
 16  i12_he

In [5]:
# load the instructions file
ins = pd.read_excel("ins.xlsx")
ins.replace(np.nan, '', inplace=True)

In [6]:
# find the rows with valid_values and record their row number
valid_values_index = ins[ins.iloc[:, 0] == 'Valid Values'].index
print(valid_values_index)

Index([  17,  102,  112,  123,  131,  139,  147,  155,  163,  171,
       ...
       4781, 4789, 4797, 4811, 4819, 4830, 4841, 4852, 4860, 4871],
      dtype='int64', length=481)


In [7]:
# get the column names of our NAcolumn-free data
column_names = df_cleaned.columns
print(column_names)

Index(['RecordNo', 'endtime', 'qweek', 'i2_health', 'i9_health', 'i11_health',
       'i12_health_1', 'i12_health_2', 'i12_health_3', 'i12_health_4',
       'i12_health_5', 'i12_health_6', 'i12_health_7', 'i12_health_8',
       'i12_health_11', 'i12_health_12', 'i12_health_13', 'i12_health_14',
       'i12_health_15', 'i12_health_16', 'weight', 'gender', 'age', 'region',
       'household_size', 'household_children', 'employment_status',
       'cantril_ladder', 'PHQ4_1', 'PHQ4_2', 'PHQ4_3', 'PHQ4_4', 'WCRex2',
       'WCRex1', 'i12_health_22', 'i12_health_23', 'i12_health_25', 'r1_1',
       'r1_2', 'i12_health_26', 'i12_health_27', 'i12_health_28',
       'i12_health_29', 'Soc1_1', 'Soc1_2', 'Soc1_3', 'vac2_1', 'vac2_2',
       'vac2_3', 'vac7', 'r1_8', 'r1_9', 'vac2_7', 'vac', 'Vent_3',
       'vac_boost_1', 'vac_man_1', 'vac_man_2', 'vac_man_3', 'vac_man_4',
       'vac_man_5', 'vac_man_6', 'vac_man_7', 'vac_man_96', 'vac_man_99',
       'future_1', 'future_2', 'had_covid_2'],
    

In [8]:
# get the row number for these columns（included future 3)
column_index = ins[ins.iloc[:, 0].isin(column_names)].index
print(column_index)

Index([   0,    6,   12,   85,  203,  224,  236,  247,  258,  269,  280,  291,
        302,  313,  346,  357,  368,  379,  390,  401,  692,  698,  704,  723,
        731,  747,  760,  773,  808,  825,  836,  847,  858, 1688, 1710, 1721,
       1743, 1754, 1767, 2380, 3055, 3066, 3077, 3088, 3096, 3104, 3188, 3199,
       3210, 3371, 3509, 3522, 3548, 3568, 4144, 4615, 4728, 4736, 4744, 4752,
       4760, 4768, 4776, 4784, 4792, 4825, 4836, 4847],
      dtype='int64')


In [9]:
# Change our previous findings to lists
column_index_list = column_index.tolist()
valid_index_list = valid_values_index.tolist()

In [10]:
# define a funtion that can find the columns that have inputs
def filter_valid_col(col,val):
    valid_col = []
    valid_id = []
    for index in col:
        if (index + 5) in val:
            valid_col.append(index)
    return valid_col

In [11]:
# collect the columns that have inputs into a list
valid_columns = filter_valid_col(column_index_list,valid_index_list)
print(valid_columns)
print(len(valid_columns))

[12, 203, 224, 236, 247, 258, 269, 280, 291, 302, 313, 346, 357, 368, 379, 390, 401, 704, 723, 731, 747, 760, 773, 808, 825, 836, 847, 858, 1688, 1710, 1721, 1743, 1754, 1767, 2380, 3055, 3066, 3077, 3088, 3096, 3104, 3188, 3199, 3210, 3371, 3509, 3522, 3548, 3568, 4144, 4615, 4728, 4736, 4744, 4752, 4760, 4768, 4776, 4784, 4792, 4825, 4836, 4847]
63


In [12]:
# define a function that can find values for replacing
def find_value(col_id, value, ins_data):
    vvi = col_id+5
    last = len(ins_data)
    while (vvi < last) and ins_data.iloc[vvi,2] != '':
        if value == ins_data.iloc[vvi,2]:
            return int(ins_data.iloc[vvi,1])
        else:
            vvi = vvi+1
    return value

In [13]:
# define a function that can replace values according to instructions
def filler(oridata, columnidlist, ins_data):
    size = len(oridata)
    for index in columnidlist:
        # get the column name
        column_name = ins_data.iloc[index,0]
        for i in range(0,size):
            if oridata.at[i,column_name] != np.nan:
                oridata.at[i,column_name] = find_value(index,oridata.at[i,column_name],ins_data)
    return

In [14]:
# create a copy of df_cleaned and replace values with valid values according to instructions
newdata = df_cleaned
filler(newdata, valid_columns, ins)

In [15]:
# view our current data to see if our replacements are successful
newdata.head(20)

Unnamed: 0,RecordNo,endtime,qweek,i2_health,i9_health,i11_health,i12_health_1,i12_health_2,i12_health_3,i12_health_4,...,vac_man_3,vac_man_4,vac_man_5,vac_man_6,vac_man_7,vac_man_96,vac_man_99,future_1,future_2,had_covid_2
0,41589,06/01/2022 18:02,44,0,1,1,1,1,1,1,...,1,1,1,1,1,0,0,,,
1,41731,03/01/2022 23:48,44,1,2,1,1,1,1,1,...,1,1,1,1,1,0,0,,,
2,41756,10/01/2022 04:53,44,0,2,1,1,1,1,1,...,1,1,1,1,1,1,0,,,
3,41846,01/01/2022 13:15,44,5,1,1,1,1,1,1,...,1,1,1,1,1,0,0,,,
4,41847,01/01/2022 19:12,44,150,1,1,1,1,1,1,...,1,1,1,0,0,0,0,,,
5,41867,01/01/2022 15:21,44,4,2,1,1,1,2,1,...,0,1,0,0,0,0,0,,,
6,41868,01/01/2022 20:02,44,14,1,2,1,1,2,1,...,1,1,1,1,1,0,0,,,
7,41920,02/01/2022 12:36,44,0,2,2,1,1,1,2,...,1,1,1,1,1,0,0,,,
8,41937,01/01/2022 00:11,44,0,1,1,1,1,1,1,...,1,1,1,1,1,0,0,,,
9,41951,01/01/2022 00:43,44,5,2,99,1,1,2,2,...,1,1,1,0,1,0,0,,,


In [16]:
# keep columns with less than 45% NA values
newdf_nona = newdata.dropna(axis=1, thresh = len(newdata)*0.55)

In [17]:
# download and review our current data in Excel using filter to check if there is anything wrong 
newdf_nona.to_csv('before_inputting.csv', index = False)
#After viewing newdf_nona in Excel, we find out it has one input that should be considered as NA, and not told how to handle
# the input is 'Don't know'
newdf_nona.loc[:, 'household_children'] = newdf_nona['household_children'].replace("Don't know", np.nan)

In [18]:
# find columns that have NA values that need processing and show NA percentages
na_counts = newdf_nona.isna().mean() * 100
columns_with_na_sorted = na_counts[na_counts > 0].sort_values(ascending=False)
list_of_columns_with_na = list(columns_with_na_sorted.items())
print(list_of_columns_with_na)

[('had_covid_2', 44.47900466562986), ('i2_health', 31.213063763608083), ('Soc1_1', 31.213063763608083), ('Soc1_2', 31.213063763608083), ('Soc1_3', 31.213063763608083), ('future_1', 25.36547433903577), ('future_2', 25.36547433903577), ('i12_health_2', 15.56765163297045), ('i12_health_11', 15.56765163297045), ('i12_health_23', 15.56765163297045), ('vac_boost_1', 10.015552099533437), ('household_children', 0.13996889580093314)]


In [19]:
# get the names for these columns
selected_column_names = columns_with_na_sorted.index.tolist()
print(selected_column_names)

['had_covid_2', 'i2_health', 'Soc1_1', 'Soc1_2', 'Soc1_3', 'future_1', 'future_2', 'i12_health_2', 'i12_health_11', 'i12_health_23', 'vac_boost_1', 'household_children']


In [20]:
# define a function that can replace NA values with random values in the dataset
def NA_filler(selected_column, data):
    for names in selected_column:
        possible_values = data[names].dropna().values
        data.loc[:, names] = data[names].apply(lambda x: np.random.choice(possible_values) if pd.isna(x) else x)
    return

In [21]:
# create a copy of newdf_nona to do our NA value inputting
final_data = newdf_nona
NA_filler(selected_column_names, final_data)

In [22]:
# save our final data so our data is consistent every time, and preventing us from cleaning again
final_data.to_csv('final_data.csv', index = False)

In [23]:
# This is our final cleaned data, now we can read this instead of doing cleaning again
final_cleaned_data = pd.read_csv("final_data.csv", low_memory=False)

In [24]:
# check the data type for all columns
final_cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6430 entries, 0 to 6429
Data columns (total 68 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RecordNo            6430 non-null   int64  
 1   endtime             6430 non-null   object 
 2   qweek               6430 non-null   int64  
 3   i2_health           6430 non-null   int64  
 4   i9_health           6430 non-null   int64  
 5   i11_health          6430 non-null   int64  
 6   i12_health_1        6430 non-null   int64  
 7   i12_health_2        6430 non-null   int64  
 8   i12_health_3        6430 non-null   int64  
 9   i12_health_4        6430 non-null   int64  
 10  i12_health_5        6430 non-null   int64  
 11  i12_health_6        6430 non-null   int64  
 12  i12_health_7        6430 non-null   int64  
 13  i12_health_8        6430 non-null   int64  
 14  i12_health_11       6430 non-null   int64  
 15  i12_health_12       6430 non-null   int64  
 16  i12_he

In [25]:
# make sure the column with strings is correctly recognized
print(final_cleaned_data['region'].unique())

['Quebec / Québec' 'British Columbia / Colombie Britanique' 'Ontario'
 'Manitoba' 'Newfoundland & Labrador / Terre-Neuve-et-Labrador' 'Alberta'
 'Nova Scotia / Nouvelle-Écosse' 'New Brunswick / Nouveau-Brunswick'
 'Saskatchewan' 'Prince Edward Island / Île-du-Prince-Édouard' 'Nunavut'
 'Yukon']


In [26]:
# we looked at the instructions and found out the regions are in UK instead of Canada, so we need to assign valid values
# to Canadian Provinces as well
provinces = final_cleaned_data['region'].unique().tolist()
province_num = list(range(1,len(provinces)+1))
print(provinces)
print(province_num)

['Quebec / Québec', 'British Columbia / Colombie Britanique', 'Ontario', 'Manitoba', 'Newfoundland & Labrador / Terre-Neuve-et-Labrador', 'Alberta', 'Nova Scotia / Nouvelle-Écosse', 'New Brunswick / Nouveau-Brunswick', 'Saskatchewan', 'Prince Edward Island / Île-du-Prince-Édouard', 'Nunavut', 'Yukon']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [27]:
for cell in final_cleaned_data['region']:
    if type(cell) == str: 
        final_cleaned_data.loc[:, 'region'] = final_cleaned_data['region'].replace(cell, provinces.index(cell)+1)

In [28]:
finished = final_cleaned_data
finished.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6430 entries, 0 to 6429
Data columns (total 68 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RecordNo            6430 non-null   int64  
 1   endtime             6430 non-null   object 
 2   qweek               6430 non-null   int64  
 3   i2_health           6430 non-null   int64  
 4   i9_health           6430 non-null   int64  
 5   i11_health          6430 non-null   int64  
 6   i12_health_1        6430 non-null   int64  
 7   i12_health_2        6430 non-null   int64  
 8   i12_health_3        6430 non-null   int64  
 9   i12_health_4        6430 non-null   int64  
 10  i12_health_5        6430 non-null   int64  
 11  i12_health_6        6430 non-null   int64  
 12  i12_health_7        6430 non-null   int64  
 13  i12_health_8        6430 non-null   int64  
 14  i12_health_11       6430 non-null   int64  
 15  i12_health_12       6430 non-null   int64  
 16  i12_he

In [29]:
final_cleaned_data.to_csv('Final Version.csv', index = False)

In [30]:
# Load the library packages we need
from numpy import arange

In [31]:
from pandas import read_table

In [32]:
from pandas import set_option

In [33]:
from pandas.plotting import scatter_matrix

In [34]:
from sklearn.preprocessing import StandardScaler

In [35]:
from sklearn.model_selection import KFold

In [36]:
from sklearn.model_selection import GridSearchCV

In [37]:
from sklearn.metrics import mean_squared_error

In [38]:
!pip install dmba

Collecting dmba
  Using cached dmba-0.2.4-py3-none-any.whl.metadata (1.9 kB)
Using cached dmba-0.2.4-py3-none-any.whl (11.8 MB)
Installing collected packages: dmba
Successfully installed dmba-0.2.4


In [39]:
from dmba import classificationSummary

no display found. Using non-interactive Agg backend


In [40]:
# Load the new data set
df = pd.read_csv("Final Version.csv")

In [41]:
# Check the shape of the dataset
df.shape

(6430, 68)

In [42]:
# Data Overview
df.head(10)
df.dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6430 entries, 0 to 6429
Data columns (total 68 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RecordNo            6430 non-null   int64  
 1   endtime             6430 non-null   object 
 2   qweek               6430 non-null   int64  
 3   i2_health           6430 non-null   int64  
 4   i9_health           6430 non-null   int64  
 5   i11_health          6430 non-null   int64  
 6   i12_health_1        6430 non-null   int64  
 7   i12_health_2        6430 non-null   int64  
 8   i12_health_3        6430 non-null   int64  
 9   i12_health_4        6430 non-null   int64  
 10  i12_health_5        6430 non-null   int64  
 11  i12_health_6        6430 non-null   int64  
 12  i12_health_7        6430 non-null   int64  
 13  i12_health_8        6430 non-null   int64  
 14  i12_health_11       6430 non-null   int64  
 15  i12_health_12       6430 non-null   int64  
 16  i12_he

In [43]:
# EDA Part (Exploratory Data Analysis Part) 

In [44]:
df["vac"]

0       3
1       3
2       3
3       3
4       3
       ..
6425    3
6426    2
6427    3
6428    1
6429    3
Name: vac, Length: 6430, dtype: int64

In [45]:
# Summarize the variable "vac"
# This is imbalance data
df["vac"].value_counts()

vac
3    5559
1     644
2     227
Name: count, dtype: int64

In [46]:
# We would like to do the research of "not fully vaccinated" and "fully vaccinated"
# Combine 1 and 2 as 1, and 3 become 2 
df["vac"].replace(2, 1, inplace=True)
df["vac"].replace(3, 2, inplace=True)

In [47]:
# Check whether we do it successfully
df["vac"].value_counts()

vac
2    5559
1     871
Name: count, dtype: int64

In [48]:
# Bar chart of the variable "vac"
df["vac"].value_counts().plot(kind='bar', title= "Vaccination Status")
plt.show()

In [49]:
# Correlation Matrix

In [50]:
# Selecting the numeric predictors to do the correlation matrix
numeric_predictors = df.select_dtypes(include=['number'])
correlation_matrix = numeric_predictors.corr()
print(correlation_matrix)

             RecordNo     qweek  i2_health  i9_health  i11_health  \
RecordNo     1.000000  0.988965   0.009432   0.045936   -0.006632   
qweek        0.988965  1.000000   0.004523   0.046167   -0.006878   
i2_health    0.009432  0.004523   1.000000   0.011435   -0.014942   
i9_health    0.045936  0.046167   0.011435   1.000000    0.302581   
i11_health  -0.006632 -0.006878  -0.014942   0.302581    1.000000   
...               ...       ...        ...        ...         ...   
vac_man_96   0.003205  0.001914  -0.014018  -0.013482   -0.022436   
vac_man_99   0.013016  0.010512   0.026152   0.126670    0.114633   
future_1    -0.018463 -0.014897  -0.023466  -0.039964   -0.017746   
future_2    -0.022845 -0.024682  -0.004177  -0.031215   -0.012886   
had_covid_2 -0.011285 -0.006253  -0.007777  -0.003615    0.008672   

             i12_health_1  i12_health_2  i12_health_3  i12_health_4  \
RecordNo         0.108588      0.022444      0.062368      0.066522   
qweek            0.106525    

In [51]:
# Only show the correlation between "vac" and other variables
response = 'vac'
response_correlation = correlation_matrix[response].sort_values(ascending=False)
pd.set_option('display.max_rows', None) # We want see all the results
print(response_correlation)

vac                   1.000000
vac7                  0.502455
r1_8                  0.494183
r1_9                  0.463613
vac_man_1             0.412204
vac_man_2             0.398646
vac_man_3             0.386429
vac_man_4             0.376152
vac_man_5             0.329552
vac_man_6             0.295085
vac_man_7             0.282001
vac2_2                0.262519
r1_1                  0.205165
age                   0.145334
r1_2                  0.135240
cantril_ladder        0.084822
had_covid_2           0.074300
vac_man_96            0.063557
Soc1_3                0.047878
future_1              0.045794
qweek                 0.015900
future_2              0.014183
RecordNo              0.006290
gender                0.003639
Soc1_2               -0.012778
i2_health            -0.020040
region               -0.026142
PHQ4_2               -0.031159
PHQ4_3               -0.042666
Soc1_1               -0.044117
PHQ4_4               -0.047080
PHQ4_1               -0.049606
weight  

In [52]:
# Selecting predictors
# "r1_8", "vac_man_1","vac_man_2","vac_man_3", "vac_man_4", "vac_man_5", "vac2_7", "vac2_3","vac_man_99"

In [53]:
# Checking correlation between selected predictors and delete some of the predictors
df_predictors = pd.DataFrame(df)
predictors1 = ['vac7','r1_8','r1_9','vac_man_1','vac_man_2', 'vac_man_3',
               'vac_man_4', 'vac_man_5','vac2_7','vac2_3','vac_man_99']
correlation_matrix = df_predictors[predictors1].corr()
print(correlation_matrix)

                vac7      r1_8      r1_9  vac_man_1  vac_man_2  vac_man_3  \
vac7        1.000000  0.689357  0.670946   0.531929   0.520046   0.531531   
r1_8        0.689357  1.000000  0.884899   0.532933   0.520354   0.521998   
r1_9        0.670946  0.884899  1.000000   0.537417   0.521514   0.527144   
vac_man_1   0.531929  0.532933  0.537417   1.000000   0.894126   0.882466   
vac_man_2   0.520046  0.520354  0.521514   0.894126   1.000000   0.895815   
vac_man_3   0.531531  0.521998  0.527144   0.882466   0.895815   1.000000   
vac_man_4   0.505047  0.502823  0.505524   0.828081   0.852646   0.850250   
vac_man_5   0.440845  0.449876  0.456209   0.693500   0.725052   0.734364   
vac2_7     -0.519878 -0.541451 -0.540658  -0.377177  -0.362998  -0.372330   
vac2_3     -0.587895 -0.558886 -0.547642  -0.421461  -0.416587  -0.418764   
vac_man_99 -0.535196 -0.520554 -0.531188  -0.895878  -0.852784  -0.843417   

            vac_man_4  vac_man_5    vac2_7    vac2_3  vac_man_99  
vac7    

In [54]:
# Remaining predictors 
# "r1_8", "vac_man_1", "vac_man_4", "vac_man_5", "vac2_7", "vac2_3"
# Crosstab-Analysis between response variables and predictors

In [55]:
predictors = ['vac7','r1_8','vac_man_1','vac_man_4', 'vac_man_5', 'vac2_7','vac2_3']
target = 'vac'

In [56]:
# Create subplots
n = len(predictors)  # Number of predictors
fig, axes = plt.subplots(n, 1, figsize=(8, 4 * n))  # Adjust the figure size as needed

for i, predictor in enumerate(predictors):
    # Generate the crosstab
    crosstab = pd.crosstab(df[predictor], df[target])
    crosstab.plot(kind='bar', ax=axes[i])  # Accessing axes directly
    axes[i].set_title(f'Crosstab: {predictor} vs {target}')
    axes[i].set_xlabel(predictor)
    axes[i].set_ylabel('Count')
    axes[i].legend(title=target)
plt.tight_layout()
plt.show()

In [57]:
# Create subplots
n = len(predictors)  # Number of predictors
fig, axes = plt.subplots(n, 1, figsize=(8, 4 * n))  # Adjust the figure size as needed

for i, predictor in enumerate(predictors):
    # Generate the crosstab
    crosstab = pd.crosstab(df[predictor], df[target])
    # Normalize
    crosstab_norm = crosstab.div(crosstab.sum(axis=1), axis=0)
    # Plot
    crosstab_norm.plot(kind='bar', stacked = True, ax=axes[i])  # Accessing axes directly
    axes[i].set_title(f'Crosstab: {predictor} vs {target}')
    axes[i].set_xlabel(predictor)
    axes[i].set_ylabel('Count')
    axes[i].legend(title=target)
plt.tight_layout()
plt.show()

In [58]:
# These plots shows the distributions of the predictors.
# The predictors "vac_man_1", "vac_man_4","vac_man_7" are binary dummy variables
# The predictors "vac7", "r1_8" and "vac2_3" are not normal distribution, need normalize
df[predictors].hist(sharex = False, sharey = False, xlabelsize = 1, ylabelsize = 1)

array([[<Axes: title={'center': 'vac7'}>,
        <Axes: title={'center': 'r1_8'}>,
        <Axes: title={'center': 'vac_man_1'}>],
       [<Axes: title={'center': 'vac_man_4'}>,
        <Axes: title={'center': 'vac_man_5'}>,
        <Axes: title={'center': 'vac2_7'}>],
       [<Axes: title={'center': 'vac2_3'}>, <Axes: >, <Axes: >]],
      dtype=object)

In [59]:
# The plots shows same thing as distribution.
# There are no outliers visible in these plots 
# Because we did not see any of the data points extend beyond the whiskers.
df[predictors].plot(kind = 'box', subplots = True, layout =(4,4), 
             sharex = False, sharey = False, fontsize = 8)

vac7            Axes(0.125,0.712609;0.168478x0.167391)
r1_8         Axes(0.327174,0.712609;0.168478x0.167391)
vac_man_1    Axes(0.529348,0.712609;0.168478x0.167391)
vac_man_4    Axes(0.731522,0.712609;0.168478x0.167391)
vac_man_5       Axes(0.125,0.511739;0.168478x0.167391)
vac2_7       Axes(0.327174,0.511739;0.168478x0.167391)
vac2_3       Axes(0.529348,0.511739;0.168478x0.167391)
dtype: object

In [60]:
sns.heatmap(df[predictors].corr(method='pearson'))

<Axes: >

In [61]:
# KNN

In [62]:
array = df.values
array

array([[41589, '06/01/2022 18:02', 44, ..., 2, 2, 2],
       [41731, '03/01/2022 23:48', 44, ..., 3, 3, 2],
       [41756, '10/01/2022 04:53', 44, ..., 3, 1, 2],
       ...,
       [48369, '27/03/2022 15:31', 50, ..., 3, 3, 1],
       [48370, '27/03/2022 16:23', 50, ..., 3, 2, 2],
       [48371, '27/03/2022 16:45', 50, ..., 3, 3, 2]], dtype=object)

In [63]:
# Creating X-array with remaining predictors: "vac2_3","r1_8","vac2_7", "vac_man_1", "vac_man_4", "vac_man_5"

In [64]:
predictors_columns = [48, 50, 52, 56, 59, 60]
X = df.iloc[:,predictors_columns]

In [65]:
y = df.iloc[:,53]

In [66]:
# As we can see, there is imbalance in the dataset.
# We want to make sure there is enough "1" values in y so we use larger test set
test_size = 0.3

In [67]:
seed = 42

In [68]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=test_size,
                                                   random_state = seed, stratify = y)

In [69]:
# Normalize the x_training dataset
scaler = StandardScaler().fit(X_train)
rescaledX_train = scaler.transform(X_train)

In [70]:
# According to the square root of N rule,
k_values = np.arange(1,80)

In [71]:
param_grid = dict(n_neighbors=k_values)

In [72]:
num_folds = 10

In [73]:
scoring = 'accuracy'

In [74]:
model = KNeighborsClassifier()

In [75]:
kfold = KFold(n_splits=num_folds, shuffle = True, random_state = seed)

In [76]:
kfold.split(X_train)

<generator object _BaseKFold.split at 0x7fd273820040>

In [77]:
cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring = scoring)

In [78]:
msg = "KNN %f (%f)" % (cv_results.mean(),cv_results.std())
print(msg)

KNN 0.884690 (0.014537)


In [79]:
cv_results

array([0.89578714, 0.87333333, 0.86444444, 0.88222222, 0.88888889,
       0.86666667, 0.9       , 0.88444444, 0.91333333, 0.87777778])

In [80]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring= scoring, cv=kfold)

In [81]:
grid_result = grid.fit(rescaledX_train, y_train)

In [82]:
print("Best:%f using %s" %(grid_result.best_score_,grid_result.best_params_))

Best:0.896911 using {'n_neighbors': 42}


In [83]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

In [84]:
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean,stdev,param))

0.849147 (0.020838) with: {'n_neighbors': 1}
0.819597 (0.021159) with: {'n_neighbors': 2}
0.882467 (0.014025) with: {'n_neighbors': 3}
0.871140 (0.014160) with: {'n_neighbors': 4}
0.887133 (0.013010) with: {'n_neighbors': 5}
0.884912 (0.013270) with: {'n_neighbors': 6}
0.887133 (0.013752) with: {'n_neighbors': 7}
0.886022 (0.016827) with: {'n_neighbors': 8}
0.891799 (0.014758) with: {'n_neighbors': 9}
0.889355 (0.015974) with: {'n_neighbors': 10}
0.892687 (0.013431) with: {'n_neighbors': 11}
0.893577 (0.016084) with: {'n_neighbors': 12}
0.893355 (0.016495) with: {'n_neighbors': 13}
0.893577 (0.016565) with: {'n_neighbors': 14}
0.893354 (0.017514) with: {'n_neighbors': 15}
0.893577 (0.015960) with: {'n_neighbors': 16}
0.893799 (0.016247) with: {'n_neighbors': 17}
0.893800 (0.016573) with: {'n_neighbors': 18}
0.894243 (0.016106) with: {'n_neighbors': 19}
0.894244 (0.016937) with: {'n_neighbors': 20}
0.893800 (0.015748) with: {'n_neighbors': 21}
0.894244 (0.015217) with: {'n_neighbors': 2

In [85]:
plt.plot(k_values, means)
plt.xlabel('Number of Neighbors k')
plt.ylabel('Accuracy')

Text(0, 0.5, 'Accuracy')

In [86]:
# Building the model by using n_neighbors=42
model = KNeighborsClassifier(n_neighbors=42)

In [87]:
# Fitting model
model.fit(rescaledX_train, y_train)

In [88]:
# Rescaled for the X_test data for better predictions
rescaledX_test = scaler.transform(X_test)

In [89]:
estimates = model.predict(rescaledX_test)

In [90]:
estimates

array([1, 2, 2, ..., 2, 2, 2])

In [91]:
(y_test,estimates)

(5124    1
 1327    2
 4560    2
 3595    1
 4121    2
 26      1
 3408    2
 3488    1
 3583    2
 2780    1
 739     2
 1520    2
 1274    2
 6237    2
 5148    2
 5244    2
 2896    1
 4294    2
 4611    2
 3188    2
 4855    2
 4922    2
 3761    2
 1612    1
 888     2
 612     2
 3428    2
 4162    2
 2824    2
 1604    2
 6394    2
 5814    2
 1403    2
 1474    2
 2740    1
 1734    2
 5371    2
 1662    2
 934     2
 1444    1
 2538    2
 1486    2
 3890    2
 4540    2
 2334    2
 657     2
 4126    2
 4311    2
 3457    2
 1563    2
 1319    2
 3620    2
 2453    2
 684     1
 454     2
 847     2
 5815    1
 4459    2
 5104    2
 5017    2
 6162    1
 2075    2
 6104    2
 6115    2
 6132    2
 6306    2
 5352    1
 3749    2
 3701    2
 302     2
 3052    2
 6044    2
 1162    2
 4205    2
 3560    1
 2626    2
 5308    2
 5449    2
 4586    2
 191     1
 2097    2
 4669    2
 4752    2
 2236    2
 3800    2
 3557    2
 4163    2
 4979    2
 1954    2
 1664    1
 2977    2

In [92]:
model.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 42,
 'p': 2,
 'weights': 'uniform'}

In [93]:
accuracy_score(y_test, estimates)

0.8963193364437533

In [94]:
classificationSummary(y_test, estimates)

Confusion Matrix (Accuracy 0.8963)

       Prediction
Actual    0    1
     0  123  138
     1   62 1606


In [97]:
print("Classification report:")
print(classification_report(y_test, estimates))

Classification report:
              precision    recall  f1-score   support

           1       0.66      0.47      0.55       261
           2       0.92      0.96      0.94      1668

    accuracy                           0.90      1929
   macro avg       0.79      0.72      0.75      1929
weighted avg       0.89      0.90      0.89      1929

