In [1]:
# ---------------------
# 2. SYMPTOMS DATASET
# ---------------------
# Select records where diagnosis equals "Stroke", and include:
# diagnosis, bp_levels, cholesterol_levels, symptoms, stroke_history, smoking_status, alcohol_intake
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

df = pd.read_csv('/content/stroke_prediction_dataset.csv')

symptoms_df = df[df['diagnosis'] == 'Stroke'][[
    'diagnosis', 'bp_levels', 'cholesterol_levels', 'symptoms',
    'stroke_history', 'smoking_status', 'alcohol_intake'
]].copy()

# --- Process BP Levels ---
# Example value: "140/108" is split into systolic and diastolic blood pressure.
bp_split = symptoms_df['bp_levels'].str.extract(r'(?P<systolic_bp>\d+)/(?P<diastolic_bp>\d+)')
symptoms_df['systolic_bp'] = pd.to_numeric(bp_split['systolic_bp'], errors='coerce')
symptoms_df['diastolic_bp'] = pd.to_numeric(bp_split['diastolic_bp'], errors='coerce')

# --- Process Cholesterol Levels ---
# Extract HDL and LDL values.
# Example string: "HDL: 68, LDL: 133"
chol_split = symptoms_df['cholesterol_levels'].str.extract(r'HDL:\s*(?P<hdl>\d+),\s*LDL:\s*(?P<ldl>\d+)')
symptoms_df['hdl'] = pd.to_numeric(chol_split['hdl'], errors='coerce')
symptoms_df['ldl'] = pd.to_numeric(chol_split['ldl'], errors='coerce')

# If there are missing values, imputation will be done later
# so we now define our numeric features for symptoms, including bp and cholesterol parts.
symptom_num_features = ['systolic_bp', 'diastolic_bp', 'hdl', 'ldl']

# --- Process 'symptoms' Column ---
# Split the symptoms text into list of symptoms
symptoms_df['symptoms'] = symptoms_df['symptoms'].fillna('').astype(str).str.split(',\s*')
mlb = MultiLabelBinarizer()
symptom_matrix = mlb.fit_transform(symptoms_df['symptoms'])
symptom_keywords_df = pd.DataFrame(symptom_matrix, columns=mlb.classes_, index=symptoms_df.index)

# --- Process Categorical Features in Symptoms Dataset ---
# Here we include smoking_status and alcohol_intake as categorical.
symptom_cat_features = ['smoking_status', 'alcohol_intake']

# Build pipelines for symptoms dataset numeric and categorical features.
symptom_num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
symptom_cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])
symptom_preprocessor = ColumnTransformer([
    ('num', symptom_num_pipeline, symptom_num_features),
    ('cat', symptom_cat_pipeline, symptom_cat_features)
])

# Fit and transform the symptoms dataset features
X_symptoms = symptoms_df[symptom_num_features + symptom_cat_features]
X_symptoms_processed = symptom_preprocessor.fit_transform(X_symptoms)

# Get encoded column names for symptoms categorical features
encoded_symptom_cat_cols = symptom_preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(symptom_cat_features)
final_columns = symptom_num_features + list(encoded_symptom_cat_cols)

processed_symptoms_df = pd.DataFrame(
    X_symptoms_processed,
    columns=final_columns,
    index=symptoms_df.index
)

# Add diagnosis and stroke_history to the processed DataFrame
processed_symptoms_df['diagnosis'] = symptoms_df['diagnosis'].values
processed_symptoms_df['stroke_history'] = symptoms_df['stroke_history'].values

# Concatenate with the binarized symptoms columns (extracted from the 'symptoms' text)
processed_symptoms_df = pd.concat([processed_symptoms_df, symptom_keywords_df], axis=1)

# Save the symptoms dataset to CSV
processed_symptoms_df.to_csv("symptoms_dataset.csv", index=False)

print(" Dataset have been processed and saved successfully!")

 Dataset have been processed and saved successfully!


In [2]:
import pandas as pd

# Load your normalized dataset
df = pd.read_csv("symptoms_dataset.csv")

# Fill missing values if any
df.fillna(0, inplace=True)

# Define a function to classify stroke type
def classify_stroke(row):
    hemorrhage_score = (
        (row['systolic_bp'] > 1.0) * 2 +
        (row['diastolic_bp'] > 1.0) * 2 +
        row['Confusion'] * 1 +
        row['Seizures'] * 2 +
        row['Severe Fatigue'] * 1 +
        row['Headache'] * 1
    )

    ischemic_score = (
        (row['hdl'] < 0.5) * 1 +
        (row['ldl'] > 0.5) * 2 +
        row['smoking_status_Formerly Smoked'] * 1 +
        row['smoking_status_Non-smoker'] * 1 +
        row['alcohol_intake_Social Drinker'] * 1 +
        row['Blurred Vision'] * 1 +
        row['Numbness'] * 1 +
        row['Dizziness'] * 1 +
        row['Loss of Balance'] * 1 +
        row['Difficulty Speaking'] * 2
    )

    # Classification threshold logic
    if hemorrhage_score >= 5 and hemorrhage_score > ischemic_score:
        return 0  # Hemorrhagic
    else:
        return 1  # Ischemic

#df.fillna(0, inplace=True)

# Apply classification
df["stroke_type"] = df.apply(classify_stroke, axis=1)

# Save the updated dataset
df.to_csv("classified_stroke_data.csv", index=False)

print("✅ Done! Stroke type (0 = Hemorrhagic, 1 = Ischemic) added to 'classified_stroke_data.csv'")


✅ Done! Stroke type (0 = Hemorrhagic, 1 = Ischemic) added to 'classified_stroke_data.csv'


In [3]:

df = pd.read_csv("classified_stroke_data.csv")
print(df['stroke_type'].value_counts())

stroke_type
1    7057
0     411
Name: count, dtype: int64


In [4]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df.stroke_type == 1]
df_minority = df[df.stroke_type == 0]

# Upsample minority
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

# Combine back to get a balanced dataset
df_balanced = pd.concat([df_majority, df_minority_upsampled])

In [5]:
# Count number of samples in each class
counts = df['stroke_type'].value_counts()

# Display the count and ratio
print("Class Distribution:")
print(counts)
print("\nClass Ratio:")
print(counts / counts.sum())

Class Distribution:
stroke_type
1    7057
0     411
Name: count, dtype: int64

Class Ratio:
stroke_type
1    0.944965
0    0.055035
Name: count, dtype: float64


In [6]:
import pandas as pd
from sklearn.utils import resample

# Load your dataset
df = pd.read_csv("classified_stroke_data.csv")

# Separate majority and minority classes
df_majority = df[df.stroke_type == 1]
df_minority = df[df.stroke_type == 0]

# Upsample the minority class
df_minority_upsampled = resample(
    df_minority,
    replace=True,                # Sample with replacement
    n_samples=len(df_majority), # Match number of majority class
    random_state=42             # Reproducibility
)

# Combine the classes
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset
df_balanced.to_csv("balanced_stroke_data.csv", index=False)


In [7]:
bdf=pd.read_csv("balanced_stroke_data.csv")
print(bdf['stroke_type'].value_counts())

stroke_type
0    7057
1    7057
Name: count, dtype: int64


In [8]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib


# Select only symptom-based features
symptoms = [
    'Blurred Vision', 'Confusion', 'Difficulty Speaking', 'Dizziness',
    'Headache', 'Loss of Balance', 'Numbness', 'Seizures',
    'Severe Fatigue', 'Weakness'
]

X = bdf[symptoms]
y = bdf['stroke_type']

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train J48-like Decision Tree
model = DecisionTreeClassifier(criterion="entropy", random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Display decision rules
print("\nDecision Tree Rules:\n")
print(export_text(model, feature_names=symptoms))

model.fit(X_train, y_train)


Accuracy: 0.8590152320226709
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86      1390
           1       0.88      0.83      0.86      1433

    accuracy                           0.86      2823
   macro avg       0.86      0.86      0.86      2823
weighted avg       0.86      0.86      0.86      2823


Decision Tree Rules:

|--- Seizures <= 0.50
|   |--- Headache <= 0.50
|   |   |--- Severe Fatigue <= 0.50
|   |   |   |--- Confusion <= 0.50
|   |   |   |   |--- class: 1
|   |   |   |--- Confusion >  0.50
|   |   |   |   |--- Difficulty Speaking <= 0.50
|   |   |   |   |   |--- Blurred Vision <= 0.50
|   |   |   |   |   |   |--- Numbness <= 0.50
|   |   |   |   |   |   |   |--- Dizziness <= 0.50
|   |   |   |   |   |   |   |   |--- Loss of Balance <= 0.50
|   |   |   |   |   |   |   |   |   |--- Weakness <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |   |--- Weakn

In [9]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8590152320226709
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86      1390
           1       0.88      0.83      0.86      1433

    accuracy                           0.86      2823
   macro avg       0.86      0.86      0.86      2823
weighted avg       0.86      0.86      0.86      2823



In [10]:

# Save the trained model
joblib.dump(model, 'stroke_type_model.pkl')

# Save the symptom feature list (optional but useful)
joblib.dump(symptoms, 'stroke_symptom_features.pkl')


['stroke_symptom_features.pkl']

In [14]:
import joblib

# Load the model and features
model = joblib.load('stroke_type_model.pkl')
symptoms = joblib.load('stroke_symptom_features.pkl')

# Example usage
user_input = [[0, 1, 0, 0, 1, 0, 0, 1, 1, 0]]  # Replace with real answers
result = model.predict(user_input)
print("Stroke Type:", "Ischemic" if result[0] == 1 else "Hemorrhagic")


Stroke Type: Hemorrhagic




In [15]:
user_input = [[1, 1, 1, 1, 0, 1, 1, 0, 0, 1]]  # Replace with real answers
result = model.predict(user_input)
print("Stroke Type:", "Ischemic" if result[0] == 1 else "Hemorrhagic")


Stroke Type: Ischemic


