# Schizophrenia

---

## Modules and Setup

In [3]:
import os as os
import numpy as np
import pandas.api.types as ptypes
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import statsmodels.api as sm
from IPython.display import Markdown, display
from statsmodels.stats.outliers_influence import variance_inflation_factor
from ucimlrepo import fetch_ucirepo 


In [2]:
# Render every float in DataFrame/Series output with exactly nine decimal places.
pd.set_option('display.float_format', '{:.9f}'.format)

In [None]:
# Walk upwards from the current working directory until a directory containing
# README.md is found, so that relative paths used later in the notebook resolve
# from that directory regardless of where the kernel was started.
start_dir = os.getcwd()  # Remembered so a failed search can be undone
previous_dir = None  # Failsafe if directory does not start in notebook directory
while not os.path.exists('README.md'):
    current_dir = os.getcwd()

    if current_dir == previous_dir:
        # Moving to the parent no longer changes the directory, i.e. the top of
        # the filesystem has been reached without finding README.md. Return to
        # the starting directory instead of leaving the kernel stranded there.
        os.chdir(start_dir)
        print('Warning: Project root not found. Project currently at', start_dir)
        break

    previous_dir = current_dir
    os.chdir(os.pardir)

---

# 1. Project Overview

---

# 2. Objectives

---

# 3. Dataset

## 3.1. Dataset Features



| Feature Name              | Type        | Description                                      | Values (if categorical)                                         |
|---------------------------|------------|--------------------------------------------------|-----------------------------------------------------------------|
| Patient_ID               | Numerical  | Unique identifier assigned to each patient      | Unique ID                                                      |
| Age                      | Numerical  | Patient's age                                   | 18-80                                                          |
| Gender                   | Categorical| Gender of the patient                           | 0: Female, 1: Male                                             |
| Education_Level          | Categorical| Highest level of education completed           | 1: Primary, 2: Middle School, 3: High School, 4: University, 5: Postgraduate |
| Maritial_Status         | Categorical| Marital status of the patient (column name as spelled in the raw data) | 0: Single, 1: Married, 2: Divorced, 3: Widowed                 |
| Occupation              | Categorical| Employment status of the patient               | 0: Unemployed, 1: Employed, 2: Retired, 3: Student             |
| Income_Level            | Categorical| Income level category                          | 0: Low, 1: Medium, 2: High                                     |
| Living_Area             | Categorical| Living area classification                     | 0: Rural, 1: Urban                                            |
| Diagnosis               | Categorical| Schizophrenia diagnosis status                 | 0: Not schizophrenic, 1: Schizophrenic                         |
| Disease_Duration        | Numerical  | Duration of illness (years)                    | 0-40 (0 when Diagnosis is 0)                                   |
| Hospitalizations        | Numerical  | Number of hospital admissions                  | 0-10                                                           |
| Family_History         | Categorical| Family history of schizophrenia                | 0: No, 1: Yes                                                 |
| Substance_use          | Categorical| Substance use history (tobacco, alcohol, drugs)| 0: No, 1: Yes                                                 |
| Suicide_Attempt        | Categorical| History of suicide attempts                    | 0: No, 1: Yes                                                 |
| Positive_Symptom_Score | Numerical  | Positive symptom severity                      | 0-100                                                          |
| Negative_Symptom_Score | Numerical  | Negative symptom severity                      | 0-100                                                          |
| GAF_Score              | Numerical  | Global assessment of functioning               | 0-100                                                          |
| Social_Support         | Categorical| Level of social support                        | 0: Low, 1: Medium, 2: High                                     |
| Stress_factors         | Categorical| Level of stress factors                        | 0: Low, 1: Medium, 2: High                                     |
| Medication_Adherence   | Categorical| Adherence to medication regimen                | 0: Poor, 1: Moderate, 2: Good                                  |


---

# 4. Workflow

## 4.1. Loading and Tidying the Data

In [37]:
# Absolute path of the raw CSV, resolved against the current working directory.
raw_data_file = os.path.abspath(os.path.join('data', 'raw', 'schizophrenia_dataset.csv'))

# Only read the file when it is present; otherwise report the path that was tried.
if not os.path.exists(raw_data_file):
    print('Data file is missing:', raw_data_file) # Debugging for missing file
else:
    df = pd.read_csv(raw_data_file)

In [38]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Patient_ID,Age,Gender,Education_Level,Maritial_Status,Occupation,Income_Level,Living_Area,Diagnosis,Disease_Duration,Hospitalizations,Family_History,Substance_use,Suicide_Attempt,Positive_Symptom_Score,Negative_Symptom_Score,GAF_Score,Social_Support,Stress_factors,Medication_Adherence
0,1,72,1,4,2,0,2,1,0,0,0,0,0,0,32,48,72,0,2,2
1,2,49,1,5,2,2,1,0,1,35,1,1,1,1,51,63,40,2,2,0
2,3,53,1,5,3,2,1,0,1,32,0,1,0,0,72,85,51,0,1,1
3,4,67,1,3,2,0,2,0,0,0,0,0,1,0,10,21,74,1,1,2
4,5,54,0,1,2,0,2,1,0,0,0,0,0,0,4,27,98,0,1,0


In [39]:
# Preview the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,Patient_ID,Age,Gender,Education_Level,Maritial_Status,Occupation,Income_Level,Living_Area,Diagnosis,Disease_Duration,Hospitalizations,Family_History,Substance_use,Suicide_Attempt,Positive_Symptom_Score,Negative_Symptom_Score,GAF_Score,Social_Support,Stress_factors,Medication_Adherence
9995,9996,72,0,2,3,3,0,1,1,18,1,1,0,1,84,74,54,1,1,1
9996,9997,27,0,3,3,3,0,0,1,6,6,0,0,0,99,50,53,2,2,0
9997,9998,31,1,3,2,0,0,0,1,29,9,1,1,0,65,65,60,1,1,1
9998,9999,56,1,4,1,1,2,1,0,0,0,0,0,0,38,10,74,0,1,2
9999,10000,53,0,1,3,1,0,1,0,0,0,0,0,0,24,21,87,1,2,0


In [43]:
# Count missing values per column (column-wise sum is the default axis).
df.isna().sum()

Patient_ID                0
Age                       0
Gender                    0
Education_Level           0
Maritial_Status           0
Occupation                0
Income_Level              0
Living_Area               0
Diagnosis                 0
Disease_Duration          0
Hospitalizations          0
Family_History            0
Substance_use             0
Suicide_Attempt           0
Positive_Symptom_Score    0
Negative_Symptom_Score    0
GAF_Score                 0
Social_Support            0
Stress_factors            0
Medication_Adherence      0
dtype: int64

In [44]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Patient_ID              10000 non-null  int64
 1   Age                     10000 non-null  int64
 2   Gender                  10000 non-null  int64
 3   Education_Level         10000 non-null  int64
 4   Maritial_Status         10000 non-null  int64
 5   Occupation              10000 non-null  int64
 6   Income_Level            10000 non-null  int64
 7   Living_Area             10000 non-null  int64
 8   Diagnosis               10000 non-null  int64
 9   Disease_Duration        10000 non-null  int64
 10  Hospitalizations        10000 non-null  int64
 11  Family_History          10000 non-null  int64
 12  Substance_use           10000 non-null  int64
 13  Suicide_Attempt         10000 non-null  int64
 14  Positive_Symptom_Score  10000 non-null  int64
 15  Negative_Symptom_Sco

### 4.1.1 Decoding the Categorical Variables

In [None]:
def _codes(*labels, start=0):
    """Map consecutive integer codes, beginning at `start`, to the given labels."""
    return dict(enumerate(labels, start=start))


# Lookup tables for decoding the integer-coded categorical columns.
# Keys use the column names exactly as they appear in the loaded frame
# (including 'Maritial_Status', 'Substance_use' and 'Stress_factors').
# Each column gets its own dict, so editing one mapping never affects another.
col_dict = {
    'Gender': _codes('Female', 'Male'),
    # Education codes are the only ones that start at 1 rather than 0.
    'Education_Level': _codes('Primary', 'Middle School', 'High School', 'University', 'Postgraduate', start=1),
    'Maritial_Status': _codes('Single', 'Married', 'Divorced', 'Widowed'),
    'Occupation': _codes('Unemployed', 'Employed', 'Retired', 'Student'),
    'Income_Level': _codes('Low', 'Medium', 'High'),
    'Living_Area': _codes('Rural', 'Urban'),
    'Diagnosis': _codes('Not Schizophrenic', 'Schizophrenic'),
    'Family_History': _codes('No', 'Yes'),
    'Substance_use': _codes('No', 'Yes'),
    'Suicide_Attempt': _codes('No', 'Yes'),
    'Social_Support': _codes('Low', 'Medium', 'High'),
    'Stress_factors': _codes('Low', 'Medium', 'High'),
    'Medication_Adherence': _codes('Poor', 'Moderate', 'Good'),
}

## 4.2. Exploratory Data Analysis (EDA)

### 4.2.1. Overview of the Dataset

### 4.2.2. Target Variable Analysis

### 4.2.3. Univariate Analysis

#### 4.2.3.1. Numerical Variables

#### 4.2.3.2. Categorical Variables

### 4.2.4. Bivariate Analysis

### 4.2.5. Feature Relationships

---

# 5. Data Preprocessing

## 5.1. Splitting the Data into Training and Testing Sets

## 5.2. Feature Scaling

---

# 6. Model Development

---

# 7. Model Evaluation

---

# 8. Results

---

# 9. Conclusions

---