# Veri Yükleme ve Ön İşleme

In [659]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif


from sklearn.datasets import load_digits

#veriyi eğitim ve test seti olarak ayırmak için train_test_split
#modelin parametrelerinin kombinasyonlarından en iyi sonucu bulabilme işlemi için GridSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV 

#verileri standardize edip ölçeklendirme için StandardScaler. farklı özelliklerin değerleri arasındaki farkı azaltıp tüm veiriy aynı ölçeğe getirir
from sklearn.preprocessing import StandardScaler, LabelEncoder

#principal component analysis (PCA), veri setindeki boyutlatı en önemli özellikleri seçerek azaltmaya yarar
from sklearn.decomposition import PCA

# support vector machine
from sklearn.svm import SVC

#birçok model tahmininin oy birliği ile karara vardığı yöntem VotingClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

#verileri en yakın komşularına göre sınıflandırır
from sklearn.neighbors import KNeighborsClassifier

#modelin doğruluğunu konfüzyon matrisinde değerlendirir, display ise matrisi görselleştirme aracıdır
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [660]:
#load digits fonksiyonu el yazısı rakamlar içeren bir veri setini yükler, bu veri setini modelimizi eğitmede kullanacağız.
digits = load_digits()

# data, feature ları içerir. target ise labelları içerir. x girdileri y çıktıları temsil eder
X, y = digits.data, digits.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#veriyi standardize etmek için standart scaler nesnesi oluşturulur. bu nesne veriyi ortalama 0 standart sapması 1 olacak şekilde ölçeklendirir
scaler = StandardScaler()
#eğitim verisinin fit (uyum sağlama) işlemi yapılır ve aynı zamanda veriler dönüştürülür. verilerin ölçeklendirilmesi işlemini gerçekleştirir
X_train_scaled = scaler.fit_transform(X_train)
#test verisini sadece dönüştürür ancak eğitim verisindeki bilgiye dayalı olarak yapılır. 
# bu sayede eğitim ve test verisi aynı ölçekleme işlemin etabi tutulur
X_test_scaled = scaler.transform(X_test)

In [661]:
df = pd.read_csv("heart_attack_prediction_dataset.csv")
# DataFrame'in veri tiplerini yazdırmak
print(df.dtypes)


Patient ID                          object
Age                                  int64
Sex                                 object
Cholesterol                          int64
Blood Pressure                      object
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                object
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent  

In [662]:
df.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


verileri düzenleyelim

In [663]:
#sex
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])
df.dtypes


Patient ID                          object
Age                                  int64
Sex                                  int64
Cholesterol                          int64
Blood Pressure                      object
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                object
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent  

In [664]:
#blood pressure
df["Blood Pressure"].unique

<bound method Series.unique of 0        158/88
1        165/93
2        174/99
3       163/100
4         91/88
         ...   
8758      94/76
8759    157/102
8760     161/75
8761     119/67
8762     138/67
Name: Blood Pressure, Length: 8763, dtype: object>

In [665]:
print(df['Blood Pressure'].isnull().sum())


0


In [666]:
split_columns = df['Blood Pressure'].str.split('/', expand=True)

print(split_columns)

#errors coerce, geçersiz değerleri nan olarak işaretler
df[['Systolic Blood Pressure', 'Diastolic Blood Pressure']] = split_columns
df['Systolic Blood Pressure'] = pd.to_numeric(df['Systolic Blood Pressure'], errors='coerce')
df['Diastolic Blood Pressure'] = pd.to_numeric(df['Diastolic Blood Pressure'], errors='coerce')
df.drop(columns=['Blood Pressure'], inplace=True)
df.dtypes

        0    1
0     158   88
1     165   93
2     174   99
3     163  100
4      91   88
...   ...  ...
8758   94   76
8759  157  102
8760  161   75
8761  119   67
8762  138   67

[8763 rows x 2 columns]


Patient ID                          object
Age                                  int64
Sex                                  int64
Cholesterol                          int64
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                object
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent                           object
Hemisphere 

In [667]:
df["Diet"].unique()

array(['Average', 'Unhealthy', 'Healthy'], dtype=object)

In [668]:
#diet için ordinal encoding kullanıyoruz, sıralı kategorik veriler için uygundur
#unhealty:0, average:1, healthy:1
ordinal_map = {'Unhealthy': 0, 'Average': 1, 'Healthy': 2}
df['Diet'] = df['Diet'].map(ordinal_map)

df["Diet"] = df["Diet"].astype(int)

df.dtypes


Patient ID                          object
Age                                  int64
Sex                                  int64
Cholesterol                          int64
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                 int64
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent                           object
Hemisphere 

In [669]:
df.columns

Index(['Patient ID', 'Age', 'Sex', 'Cholesterol', 'Heart Rate', 'Diabetes',
       'Family History', 'Smoking', 'Obesity', 'Alcohol Consumption',
       'Exercise Hours Per Week', 'Diet', 'Previous Heart Problems',
       'Medication Use', 'Stress Level', 'Sedentary Hours Per Day', 'Income',
       'BMI', 'Triglycerides', 'Physical Activity Days Per Week',
       'Sleep Hours Per Day', 'Country', 'Continent', 'Hemisphere',
       'Heart Attack Risk', 'Systolic Blood Pressure',
       'Diastolic Blood Pressure'],
      dtype='object')

In [670]:
#country düzenleyelim
df["Country"].unique()

array(['Argentina', 'Canada', 'France', 'Thailand', 'Germany', 'Japan',
       'Brazil', 'South Africa', 'United States', 'Vietnam', 'China',
       'Italy', 'Spain', 'India', 'Nigeria', 'New Zealand', 'South Korea',
       'Australia', 'Colombia', 'United Kingdom'], dtype=object)

In [671]:
df_encoded = pd.get_dummies(df, columns=['Country'], drop_first=True)

# Sadece One-Hot Encoding yapılan sütunları int'e çevirme
one_hot_columns = [col for col in df_encoded.columns if col.startswith('Country_')]
df_encoded[one_hot_columns] = df_encoded[one_hot_columns].astype(int)

df_encoded.head(10)


Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Country_Japan,Country_New Zealand,Country_Nigeria,Country_South Africa,Country_South Korea,Country_Spain,Country_Thailand,Country_United Kingdom,Country_United States,Country_Vietnam
0,BMW7812,67,1,208,72,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CZE1114,21,1,389,98,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,BNI9906,21,0,324,72,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,JLN3497,84,1,383,73,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,GFO8847,66,1,318,93,1,1,1,1,0,...,0,0,0,0,0,0,1,0,0,0
5,ZOO7941,54,0,297,48,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
6,WYV0966,90,1,358,84,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
7,XXM0972,84,1,220,107,0,0,1,1,1,...,1,0,0,0,0,0,0,0,0,0
8,XCQ5937,20,1,145,68,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
9,FTJ5456,43,0,248,55,0,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0


In [672]:
df_encoded.dtypes

Patient ID                          object
Age                                  int64
Sex                                  int64
Cholesterol                          int64
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                 int64
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Continent                           object
Hemisphere                          object
Heart Attac

In [673]:
#continent
df["Continent"].unique()

array(['South America', 'North America', 'Europe', 'Asia', 'Africa',
       'Australia'], dtype=object)

In [674]:
df_encoded2 = pd.get_dummies(df_encoded, columns=['Continent'], drop_first=True)

# Sadece One-Hot Encoding yapılan sütunları int'e çevirme
one_hot_columns = [col for col in df_encoded2.columns if col.startswith('Continent_')]
df_encoded2[one_hot_columns] = df_encoded2[one_hot_columns].astype(int)

df_encoded2.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Country_Spain,Country_Thailand,Country_United Kingdom,Country_United States,Country_Vietnam,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America
0,BMW7812,67,1,208,72,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,CZE1114,21,1,389,98,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0
2,BNI9906,21,0,324,72,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,JLN3497,84,1,383,73,1,1,1,0,1,...,0,0,0,0,0,0,0,0,1,0
4,GFO8847,66,1,318,93,1,1,1,1,0,...,0,1,0,0,0,1,0,0,0,0


In [675]:
df_encoded2.dtypes

Patient ID                          object
Age                                  int64
Sex                                  int64
Cholesterol                          int64
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                 int64
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Hemisphere                          object
Heart Attack Risk                    int64
Systolic Bl

In [676]:
#hemisphere
df_encoded2["Hemisphere"].unique()

array(['Southern Hemisphere', 'Northern Hemisphere'], dtype=object)

In [677]:
df_encoded2["Hemisphere"] = df_encoded2["Hemisphere"].map({'Southern Hemisphere': 0, 'Northern Hemisphere': 1})
df_encoded2["Hemisphere"]= df_encoded2["Hemisphere"].astype(int)

In [678]:
df_encoded2.dtypes

Patient ID                          object
Age                                  int64
Sex                                  int64
Cholesterol                          int64
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                 int64
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Hemisphere                           int64
Heart Attack Risk                    int64
Systolic Bl

In [679]:
df = df_encoded2.drop('Patient ID', axis=1)
df.dtypes

Age                                  int64
Sex                                  int64
Cholesterol                          int64
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                 int64
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Hemisphere                           int64
Heart Attack Risk                    int64
Systolic Blood Pressure              int64
Diastolic B

In [680]:
df.to_csv('data-preprocessing.csv', index=False)