# Depression Detection using Naive Bayes Theorem

## Adding Dependency

In [32]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as py
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import seaborn as sns

## DataSet Collection & Processing

In [2]:
df_test=pd.read_csv('test.csv', encoding = "ISO-8859-1") 
df_train=pd.read_csv('train.csv', encoding = "ISO-8859-1") 

In [3]:
df_train.head()

Unnamed: 0,surveyid,village,survey_date,femaleres,age,married,children,hhsize,edu,hh_children,...,given_mpesa,amount_given_mpesa,received_mpesa,amount_received_mpesa,net_mpesa,saved_mpesa,amount_saved_mpesa,early_survey,depressed,day_of_week
0,926,91,23-Nov-61,1,28.0,1,4,6,10,0,...,0,0.0,0,0.0,0.0,1,0.0,0,0,5
1,747,57,24-Oct-61,1,23.0,1,3,5,8,0,...,0,0.0,1,4.804611,4.804611,0,0.0,0,1,3
2,1190,115,05-Oct-61,1,22.0,1,3,5,9,0,...,0,0.0,0,8.007685,8.007685,1,0.0,0,0,5
3,1065,97,23-Sep-61,1,27.0,1,2,4,10,2,...,0,0.0,0,0.0,0.0,1,1.249199,0,0,0
4,806,42,12-Sep-61,0,59.0,0,4,6,10,4,...,0,0.0,0,0.0,0.0,0,0.0,0,0,3


In [4]:
df_train.shape

(1143, 75)

In [5]:
df_train.keys()

Index(['surveyid', 'village', 'survey_date', 'femaleres', 'age', 'married',
       'children', 'hhsize', 'edu', 'hh_children', 'hh_totalmembers',
       'cons_nondurable', 'asset_livestock', 'asset_durable', 'asset_phone',
       'asset_savings', 'asset_land_owned_total', 'asset_niceroof',
       'cons_allfood', 'cons_ownfood', 'cons_alcohol', 'cons_tobacco',
       'cons_med_total', 'cons_med_children', 'cons_ed', 'cons_social',
       'cons_other', 'ent_wagelabor', 'ent_ownfarm', 'ent_business',
       'ent_nonagbusiness', 'ent_employees', 'ent_nonag_revenue',
       'ent_nonag_flowcost', 'ent_farmrevenue', 'ent_farmexpenses',
       'ent_animalstockrev', 'ent_total_cost', 'fs_adskipm_often',
       'fs_adwholed_often', 'fs_chskipm_often', 'fs_chwholed_often', 'fs_meat',
       'fs_enoughtom', 'fs_sleephun', 'med_expenses_hh_ep',
       'med_expenses_sp_ep', 'med_expenses_child_ep',
       'med_portion_sickinjured', 'med_port_sick_child', 'med_afford_port',
       'med_sickdays_hhave

In [6]:
df_train.isnull().sum()

surveyid              0
village               0
survey_date           0
femaleres             0
age                   0
                     ..
saved_mpesa           0
amount_saved_mpesa    0
early_survey          0
depressed             0
day_of_week           0
Length: 75, dtype: int64

In [7]:
df_train.dropna(inplace=True)

In [8]:
df_train['depressed'].value_counts()

0    10
1     2
Name: depressed, dtype: int64

In [9]:
df_train.groupby('depressed').mean()

  df_train.groupby('depressed').mean()


Unnamed: 0_level_0,surveyid,village,femaleres,age,married,children,hhsize,edu,hh_children,hh_totalmembers,...,nondurable_investment,given_mpesa,amount_given_mpesa,received_mpesa,amount_received_mpesa,net_mpesa,saved_mpesa,amount_saved_mpesa,early_survey,day_of_week
depressed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,727.2,100.4,1.0,31.2,1.0,3.6,6.1,8.5,3.6,6.1,...,18.865438,0.0,0.0,0.0,0.0,0.0,0.2,0.080077,0.1,2.6
1,556.0,112.5,0.5,34.5,1.0,5.5,7.5,7.5,5.5,7.5,...,41.809013,0.0,2.402306,0.5,8.007684,5.605379,0.0,0.0,0.0,2.0


In [10]:
df_test.isnull().sum()

surveyid                0
village                 0
survey_date             0
femaleres               0
age                     1
                     ... 
saved_mpesa             0
amount_saved_mpesa      0
early_survey            0
depressed             282
day_of_week             0
Length: 75, dtype: int64

In [11]:
df_test.dropna(inplace=True)

In [12]:
df_test.head(10)

Unnamed: 0,surveyid,village,survey_date,femaleres,age,married,children,hhsize,edu,hh_children,...,given_mpesa,amount_given_mpesa,received_mpesa,amount_received_mpesa,net_mpesa,saved_mpesa,amount_saved_mpesa,early_survey,depressed,day_of_week


In [13]:
df_meta=pd.read_csv('13100607_MetaData.csv')

In [14]:
df_meta.head()

Unnamed: 0,Cube Title,Product Id,CANSIM Id,URL,Cube Notes,Archive Status,Frequency,Start Reference Period,End Reference Period,Total number of dimensions
0,"Probability of depression, by age group and se...",13100607,105-0205,https://www150.statcan.gc.ca/t1/tbl1/en/tv.act...,1;2;3;4;8;9;10;15,ARCHIVED - a cube publicly available but no l...,Every 2 years,2003-01-01,2003-01-01,5.0
1,Dimension ID,Dimension name,Dimension Notes,Dimension Correction Notes,Dimension Definitions,,,,,
2,1,Geography,4;8;9;10,,,,,,,
3,2,Age group,,,,,,,,
4,3,Sex,,,,,,,,


In [15]:
df_meta.shape

(144, 10)

In [16]:
df_meta.keys()

Index(['Cube Title', 'Product Id', 'CANSIM Id', 'URL', 'Cube Notes',
       'Archive Status', 'Frequency', 'Start Reference Period',
       'End Reference Period', 'Total number of dimensions'],
      dtype='object')

In [17]:
df_meta.isnull().sum()

Cube Title                      0
Product Id                      1
CANSIM Id                      67
URL                            42
Cube Notes                     63
Archive Status                142
Frequency                     137
Start Reference Period        142
End Reference Period          142
Total number of dimensions    142
dtype: int64

In [18]:
df0=pd.read_csv('13100607.csv')

In [19]:
df0.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Age group,Sex,Probability of depression,Characteristics,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,2003,Newfoundland and Labrador,10,"Total, 12 years and over",Both sexes,Total population for the variable probability ...,Number of persons,Persons,249,units,0,v26656800,2.1.1.1.1,460130.0,,,,0
1,2003,Newfoundland and Labrador,10,"Total, 12 years and over",Both sexes,Total population for the variable probability ...,"Low 95% confidence interval, number of persons",Persons,249,units,0,v26656801,2.1.1.1.2,460130.0,,,,0
2,2003,Newfoundland and Labrador,10,"Total, 12 years and over",Both sexes,Total population for the variable probability ...,"High 95% confidence interval, number of persons",Persons,249,units,0,v26656802,2.1.1.1.3,460130.0,,,,0
3,2003,Newfoundland and Labrador,10,"Total, 12 years and over",Both sexes,Total population for the variable probability ...,Coefficient of variation for number of persons,Persons,249,units,0,v26656803,2.1.1.1.4,0.0,,,,1
4,2003,Newfoundland and Labrador,10,"Total, 12 years and over",Both sexes,Total population for the variable probability ...,Percent,Percent,239,units,0,v26656804,2.1.1.1.5,100.0,,,,1


In [20]:
df0.shape

(90048, 18)

In [21]:
df0.isnull().sum()

REF_DATE                         0
GEO                              0
DGUID                            0
Age group                        0
Sex                              0
Probability of depression        0
Characteristics                  0
UOM                              0
UOM_ID                           0
SCALAR_FACTOR                    0
SCALAR_ID                        0
VECTOR                           0
COORDINATE                       0
VALUE                        36044
STATUS                       42872
SYMBOL                       90048
TERMINATED                   90048
DECIMALS                         0
dtype: int64

In [22]:
df0.keys()

Index(['REF_DATE', 'GEO', 'DGUID', 'Age group', 'Sex',
       'Probability of depression', 'Characteristics', 'UOM', 'UOM_ID',
       'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS',
       'SYMBOL', 'TERMINATED', 'DECIMALS'],
      dtype='object')

## Separating the features and target

In [28]:
x=df_train.drop(columns='depressed',axis=1)
y=df_train['depressed']

In [29]:
print(x)

      surveyid  village survey_date  femaleres   age  married  children  \
22         137        9   21-Jul-60          1  34.0        1         3   
43         122       22   16-Nov-61          1  18.0        1         2   
74         277       57   14-Nov-61          1  30.0        1         6   
127       1372      228   23-Oct-61          1  27.0        1         4   
269        653       42   03-Oct-61          1  33.0        1         2   
446       1327      201   30-Oct-61          1  35.0        1         6   
625        227       48   14-Nov-61          1  22.0        1         4   
761       1137      106   18-Sep-61          1  47.0        1         4   
1025       799      133   22-Oct-61          1  49.0        1         5   
1039       654       42   16-Oct-61          1  27.0        1         3   
1070       844      173   12-Nov-61          1  20.0        1         3   
1081       835      168   12-Nov-61          0  39.0        1         5   

      hhsize  edu  hh_ch

In [42]:
print(y)

22      0
43      0
74      1
127     0
269     0
446     0
625     0
761     0
1025    0
1039    0
1070    0
1081    1
Name: depressed, dtype: int64


## Spilting the data into training data and testing data

In [44]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=0)

In [45]:
print(x.shape,X_test.shape,X_train.shape)

(12, 74) (6, 74) (6, 74)


In [46]:
print(y_train)

43      0
761     0
1039    0
127     0
22      0
446     0
Name: depressed, dtype: int64


## Model Training

## Naive Bayes Theorem

In [36]:
model = GaussianNB() 

In [41]:
 y_pred = model.fit(x_train, y_train).predict(X_test)

ValueError: could not convert string to float: '18-Sep-61'