# Importing dependencies

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# If running from collab and your dataset is on your drive

'''
from google.colab import drive
drive.mount('/content/drive')
'''

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n"

# Data Collection & Processing

In [20]:
# Extracting zip file into a folder named 'data'

import zipfile
import os

zip_file_path = './DPS_data.zip'
extract_to_dir = 'data'

os.makedirs(extract_to_dir, exist_ok=True) # Making req folders

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_dir)


Extracted './DPS_data.zip' to 'data'


In [21]:
#printing the first 5 rows of the dataframe
data = pd.read_csv('./data/your_dataset.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,...,redness in or around nose,wrinkles on skin,foot or toe weakness,hand or finger cramps or spasms,back stiffness or tightness,wrist lump or mass,skin pain,low urine output,sore in nose,ankle weakness
0,0,panic disorder,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,panic disorder,0,0,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2,panic disorder,1,1,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,panic disorder,1,0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,4,panic disorder,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246945 entries, 0 to 246944
Columns: 330 entries, Unnamed: 0 to ankle weakness
dtypes: int64(329), object(1)
memory usage: 621.7+ MB


Since each symptom only consists of 0 and 1 it will be much better to have the data in uint8 (1byte) instead of int64 (8bytes)

In [5]:
binary_columns = data.drop(columns='diseases').columns  # Exclude 'disease' column
data[binary_columns] = data[binary_columns].astype('uint8')

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246945 entries, 0 to 246944
Columns: 330 entries, Unnamed: 0 to ankle weakness
dtypes: object(1), uint8(329)
memory usage: 79.4+ MB


Note the drastic decrease in memory usage (Almost 8 times!)

In [6]:
#number of rows and columns
data.shape

(246945, 330)

In [7]:
#number of unique diseases
len(data['diseases'].unique())

773

In [14]:
#check no. of missing values in each column
data.isnull().sum()

Unnamed: 0                 0
diseases                   0
anxiety and nervousness    0
depression                 0
shortness of breath        0
                          ..
wrist lump or mass         0
skin pain                  0
low urine output           0
sore in nose               0
ankle weakness             0
Length: 330, dtype: int64

Our dataset comprises 773 diseases and  329 symptoms, with no missing values. As a result, there is no need for any missing value handling procedures.

# Spliting into training & test data

Seperating feature & target

In [15]:
X = data.drop(columns='diseases', axis=1)
y = data['diseases']

### Encoding the Categorical columns   
(not requried you can have the labels inputted directly)

In [16]:
# Y = LabelEncoder().fit_transform(y)

Spliting the data into training data & test data

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Shape of training data & test data

In [18]:
print(X.shape, X_train.shape, X_test.shape)

(246945, 329) (197556, 329) (49389, 329)


In [19]:
print(y.shape, y_train.shape, y_test.shape)

(246945,) (197556,) (49389,)
