# **Stroke Survival Prediction**
## Will the patient live or die within 6 months?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Data Exploration
 1. Using statistical methods.
 2. Checking for correlated features.


In [3]:
# Read the stroke trials CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute and anchored at the filesystem root —
# confirm the file is present there on a fresh runtime.
STROKE_TRIALS_CSV = "/V2_International_Stroke_Trials.xlsx - 1745-6215-12-101-S1 - Sheet1 (1).csv"
stroke_trials_df = pd.read_csv(STROKE_TRIALS_CSV)
stroke_trials_df.head()

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DEAD8,H14,ISC14,NK14,STRK14,HTI14,PE14,DVT14,TRAN14,NCB14
0,M,69,Y,C,Y,Y,C,140,D,N,...,0,0,0,0,0,0,0,0,0,0
1,M,76,Y,C,Y,N,C,150,F,Y,...,0,0,0,0,0,0,0,0,0,0
2,F,71,N,C,Y,N,C,170,F,Y,...,0,0,0,0,0,0,0,0,0,0
3,M,81,N,C,N,N,C,170,F,N,...,0,0,0,0,0,0,0,0,0,0
4,M,78,N,C,N,N,C,170,F,Y,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Show every column label of the loaded frame.
column_labels = stroke_trials_df.columns
print(column_labels)

Index(['SEX', 'AGE', 'RSLEEP', 'RATRIAL', 'RCT', 'RVISINF', 'RHEP24', 'RSBP',
       'RCONSC', 'RDEF1', 'RDEF2', 'RDEF3', 'RDEF4', 'RDEF5', 'RDEF6', 'RDEF7',
       'RDEF8', 'STYPE', 'RDATE', 'RXASP', 'RXHEP', 'DASP14', 'DASPLT',
       'DLH14', 'DMH14', 'DHH14', 'ONDRUG', 'DSCH', 'DIVH', 'DAP', 'DOAC',
       'DGORM', 'DSTER', 'DCAA', 'DCAREND', 'DTHROMB', 'DMAJNCH', 'DDIAGISC',
       'DDIAGHA', 'DDIAGUN', 'DNOSTRK', 'DRSISC', 'DRSH', 'DRSUNK', 'DPE',
       'DALIVE', 'DPLACE', 'DDEAD', 'DDEADC', 'FPLACE', 'FAP', 'FOAC',
       'COUNTRY', 'CNTRYNUM', 'EXPDD', 'EXPD6', 'EXPD14', 'SET14D', 'ID14',
       'OCCODE', 'DEAD1', 'DEAD2', 'DEAD3', 'DEAD4', 'DEAD5', 'DEAD6', 'DEAD7',
       'DEAD8', 'H14', 'ISC14', 'NK14', 'STRK14', 'HTI14', 'PE14', 'DVT14',
       'TRAN14', 'NCB14'],
      dtype='object')


In [5]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [6]:
# Statistical summary of the dataset; include='all' covers every column,
# not only the numeric ones.
summary_all = stroke_trials_df.describe(include='all')
summary_all

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DEAD8,H14,ISC14,NK14,STRK14,HTI14,PE14,DVT14,TRAN14,NCB14
count,19435,19435.0,19435,19435,19435,19435,19435,19435.0,19435,19435,...,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0
unique,2,,2,3,2,2,3,,3,3,...,,,,,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,,,,,,,,,,
freq,10407,,13750,15282,13024,13020,18655,,14921,14099,...,,,,,,,,,,
mean,,71.71541,,,,,,160.159197,,,...,0.011834,0.008284,0.020324,0.013378,0.041523,0.003036,0.006895,0.001132,0.008541,0.027322
std,,11.619714,,,,,,27.610382,,,...,0.108143,0.090641,0.14111,0.11489,0.199502,0.055015,0.08275,0.033627,0.092026,0.163024
min,,16.0,,,,,,70.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,65.0,,,,,,140.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,73.0,,,,,,160.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,80.0,,,,,,180.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Data cleaning
 1. Handle missing values.
 2. Handle duplicate rows.
 3. Drop unneeded columns.

In [7]:
# Print the number of NA values for each column that has at least one.
# The counts are computed once for the whole frame instead of twice per column;
# columns are reported in their original order.
na_counts = stroke_trials_df.isna().sum()
for col, na_count in na_counts[na_counts > 0].items():
    print(f'{col}: {na_count}')

In [8]:
# Fill NA values:
#   - DDEADC is filled with 0.
#   - every column in PLACEHOLDER_FILL_COLUMNS is filled with 'U'
#     (presumably "unknown" — TODO confirm against the data dictionary).
# NOTE(review): FDEADC is not touched by this cell and does not appear in the
# printed column list — confirm that is intended.
PLACEHOLDER_FILL_COLUMNS = [
    'DASP14', 'DASPLT', 'DRSUNK', 'DPLACE', 'FPLACE',
    'DMH14', 'DHH14', 'DLH14', 'DSCH', 'DIVH',
    'DCAREND', 'DTHROMB', 'DALIVE', 'FAP', 'FOAC',
]

stroke_trials_df['DDEADC'] = stroke_trials_df['DDEADC'].fillna(0)
for placeholder_col in PLACEHOLDER_FILL_COLUMNS:
    # A missing column still raises KeyError, exactly as the one-per-line version did.
    stroke_trials_df[placeholder_col] = stroke_trials_df[placeholder_col].fillna('U')

# Re-display the summary so the result of the fills can be inspected.
stroke_trials_df.describe(include='all')

Unnamed: 0,SEX,AGE,RSLEEP,RATRIAL,RCT,RVISINF,RHEP24,RSBP,RCONSC,RDEF1,...,DEAD8,H14,ISC14,NK14,STRK14,HTI14,PE14,DVT14,TRAN14,NCB14
count,19435,19435.0,19435,19435,19435,19435,19435,19435.0,19435,19435,...,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0,19435.0
unique,2,,2,3,2,2,3,,3,3,...,,,,,,,,,,
top,M,,N,N,Y,N,N,,F,Y,...,,,,,,,,,,
freq,10407,,13750,15282,13024,13020,18655,,14921,14099,...,,,,,,,,,,
mean,,71.71541,,,,,,160.159197,,,...,0.011834,0.008284,0.020324,0.013378,0.041523,0.003036,0.006895,0.001132,0.008541,0.027322
std,,11.619714,,,,,,27.610382,,,...,0.108143,0.090641,0.14111,0.11489,0.199502,0.055015,0.08275,0.033627,0.092026,0.163024
min,,16.0,,,,,,70.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,65.0,,,,,,140.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,73.0,,,,,,160.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,80.0,,,,,,180.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Data Preprocessing
1. Encode categorical variables.
2. Scale numerical features.

# Data splitting

# Model training and testing
  

1. Choosing the right machine learning algorithm (supervised or unsupervised).
2. Defining the model.
3. Training the model.
4. Making predictions.
5. Calculating performance metrics.
6. Plotting the results.





# Model validation
Using metrics (Confusion Matrix)