In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the dataset
data = pd.read_csv('./src/data.csv')

# One-hot encode the categorical columns so each category becomes its own
# indicator column.
categorical_vars = ['gender', 'ever_married', 'work_type',
                    'Residence_type', 'smoking_status']
data = pd.get_dummies(data, columns=categorical_vars)

# Display basic information about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" after the summary.
data.info()

# Calculate skewness and kurtosis for numerical variables.
# scipy's skew/kurtosis propagate NaN by default, so a single missing value
# would turn that column's statistic into NaN. Drop missing entries per column
# before computing; columns without missing values give the same result.
numerical_vars = ['age', 'avg_glucose_level', 'bmi']
skewness = data[numerical_vars].apply(lambda col: skew(col.dropna()))
kurtosis_value = data[numerical_vars].apply(
    lambda col: kurtosis(col.dropna()))

print("\nSkewness:")
print(skewness)
print("\nKurtosis:")
print(kurtosis_value)

# Pairwise correlation between the columns of the dataset
correlation_matrix = data.corr()

# Draw the correlations as an annotated heatmap on an explicit figure/axes
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout(pad=5)  # Extra padding around the axes
plt.show()

# Perform regression analysis
# Define independent variables (features) and the target variable.
# 'id' is dropped together with the target so it is not used as a predictor.
X = data.drop(columns=['id', 'stroke'])
y = data['stroke']

# Split the dataset into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# LogisticRegression rejects NaN inputs, so fill any missing feature values.
# The medians are taken from the training split only, so no test-set
# information reaches the model. This is a no-op when nothing is missing.
missing_cols = X.columns[X.isna().any()]
if not missing_cols.empty:
    train_medians = X_train[missing_cols].median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

# Train the logistic regression model.
# The features are not scaled, so the solver may need more than the default
# 100 iterations; a higher cap does not change a fit that already converged.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict stroke occurrence on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nAccuracy:", accuracy)
print("\nConfusion Matrix:")
print(conf_matrix)

ModuleNotFoundError: No module named 'seaborn'