# Practical 2: Heart dataset
### 12 Snehal Rakas
### Practical 2


1. Importing the dataset and Analysis:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Read the heart-disease records from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='heart.csv')


In [None]:
# Plain-text preview of the first five records.
print(data.head(n=5))


   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  


In [None]:
# Per-column count of NaN entries in the raw frame.
print("\nMissing values in the dataset:", data.isna().sum(), sep="\n")


Missing values in the dataset:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


2. Handle missing values (fill with median):


In [None]:
# Median-impute every column of the heart data.
#
# SimpleImputer.fit_transform returns a float array, so rebuilding the frame
# from it alone turns every integer column (including the class label
# `target` and the categorical codes `cp` / `restecg`) into floats. That is
# why the dummy columns downstream end up named 'cp_1.0' and the target
# prints as 0.0. Casting back to the original dtypes is lossless: a column
# that pandas read as integer cannot have contained NaN, so the imputer left
# its values untouched; columns that did contain NaN are already float and
# stay float.
#
# The original index is passed through so rows stay aligned with `data`.
#
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split performed later in the notebook. If the data ever does
# contain missing values, fit it on the training rows only.
imputer = SimpleImputer(strategy='median')
data_imputed = pd.DataFrame(
    imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
).astype(data.dtypes.to_dict())

print(data_imputed.isnull().sum())


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [None]:
# Confirm that no NaN entries remain after imputation.
print("\nMissing values after imputation:", data_imputed.isna().sum(), sep="\n")


Missing values after imputation:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


3. Convert categorical columns to binary

In [None]:
# One-hot encode 'cp' and 'restecg'; the first level of each is dropped so it
# acts as the reference category.
data_encoded = pd.get_dummies(
    data=data_imputed,
    columns=['cp', 'restecg'],
    drop_first=True,
)


In [None]:
# List the column labels produced by the encoding step.
print("\nColumns after encoding 'cp' and 'restecg':", data_encoded.columns, sep="\n")


Columns after encoding 'cp' and 'restecg':
Index(['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak',
       'slope', 'ca', 'thal', 'target', 'cp_1.0', 'cp_2.0', 'cp_3.0',
       'restecg_1.0', 'restecg_2.0'],
      dtype='object')


In [None]:
# Preview the first five encoded records.
print("\nFirst few rows after encoding:", data_encoded.head(n=5), sep="\n")


First few rows after encoding:
    age  sex  trestbps   chol  fbs  thalach  exang  oldpeak  slope   ca  thal  \
0  52.0  1.0     125.0  212.0  0.0    168.0    0.0      1.0    2.0  2.0   3.0   
1  53.0  1.0     140.0  203.0  1.0    155.0    1.0      3.1    0.0  0.0   3.0   
2  70.0  1.0     145.0  174.0  0.0    125.0    1.0      2.6    0.0  0.0   3.0   
3  61.0  1.0     148.0  203.0  0.0    161.0    0.0      0.0    2.0  1.0   3.0   
4  62.0  0.0     138.0  294.0  1.0    106.0    0.0      1.9    1.0  3.0   2.0   

   target  cp_1.0  cp_2.0  cp_3.0  restecg_1.0  restecg_2.0  
0     0.0   False   False   False         True        False  
1     0.0   False   False   False        False        False  
2     0.0   False   False   False         True        False  
3     0.0   False   False   False         True        False  
4     0.0   False   False   False         True        False  


4. Splitting the dataset into features (X) and target (y):



In [None]:
# 'target' is the label; every other column is a predictor.
y = data_encoded['target']
X = data_encoded.drop(columns='target')

In [None]:
# Report the dimensions of the feature matrix and the target vector.
for heading, obj in (
    ("\nShape of feature matrix (X):", X),
    ("Shape of target vector (y):", y),
):
    print(heading, obj.shape)



Shape of feature matrix (X): (1025, 16)
Shape of target vector (y): (1025,)


In [None]:
# Preview the first five labels.
print("\nFirst few rows of the target (y):", y.head(n=5), sep="\n")


First few rows of the target (y):
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: target, dtype: float64


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Report the dimensions of the four pieces produced by the split.
print(
    f"\nTraining feature set shape: {X_train.shape}, Testing feature set shape: {X_test.shape}",
    f"Training target set shape: {y_train.shape}, Testing target set shape: {y_test.shape}",
    sep="\n",
)


Training feature set shape: (820, 16), Testing feature set shape: (205, 16)
Training target set shape: (820,), Testing target set shape: (205,)


In [None]:
# Learn the per-column mean and standard deviation from the training rows
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Report the dimensions of the scaled arrays.
print(
    f'Training data shape: {X_train_scaled.shape}, Testing data shape: {X_test_scaled.shape}'
)


Training data shape: (820, 16), Testing data shape: (205, 16)


In [None]:
# Show the first five standardized rows of each split.
print("\nFirst few rows of scaled training data:", X_train_scaled[:5], sep="\n")
print("\nFirst few rows of scaled testing data:", X_test_scaled[:5], sep="\n")


First few rows of scaled training data:
[[-5.85840222e-01  6.54653671e-01 -7.79453566e-01 -1.93503098e+00
  -4.14039336e-01 -1.01909426e+00 -7.25948939e-01 -2.10661213e-01
   1.00526437e+00  2.17169136e+00 -5.45193165e-01 -4.30082664e-01
   1.60648250e+00 -2.93415602e-01 -9.80674507e-01 -1.21866670e-01]
 [ 1.05147737e+00 -1.52752523e+00  2.74173173e+00  1.61063407e+00
  -4.14039336e-01  2.02882145e-01  1.37750735e+00 -9.12152360e-01
   1.00526437e+00 -7.25467395e-01 -5.45193165e-01 -4.30082664e-01
  -6.22477991e-01 -2.93415602e-01  1.01970633e+00 -1.21866670e-01]
 [-4.00676907e-02 -1.52752523e+00 -1.34738668e+00  4.42176271e-01
  -4.14039336e-01  7.70228333e-01 -7.25948939e-01 -9.12152360e-01
   1.00526437e+00 -7.25467395e-01 -5.45193165e-01 -4.30082664e-01
   1.60648250e+00 -2.93415602e-01 -9.80674507e-01 -1.21866670e-01]
 [ 5.05704840e-01  6.54653671e-01  1.86032724e-01 -2.22635925e-01
  -4.14039336e-01  5.08376246e-01 -7.25948939e-01 -4.73720393e-01
  -6.40078509e-01 -7.25467395e-0

# Practical 2: Titanic dataset

1. Importing the dataset and Analysis:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


In [None]:
# Read the Titanic passenger records from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='titanic.csv')


In [None]:
# Plain-text preview of the first five passenger records.
print("First few rows of the dataset:", data.head(n=5), sep="\n")


First few rows of the dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            3734

2. Drop irrelevant columns:


In [None]:
# Drop the identifier / free-text columns that are not used as model features.
# The notebook previously printed the post-drop columns without ever dropping
# anything, so on a fresh "Restart & Run All" PassengerId, Name, Ticket and
# Cabin stayed in the frame and the string columns later broke StandardScaler.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
data = data.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
    errors='ignore',
)

print("\nColumns after dropping irrelevant features:")
print(data.columns)



Columns after dropping irrelevant features:
Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')


In [None]:
# Remove the 'Embarked' column from the working frame.
data = data.drop(columns=['Embarked'])


In [None]:
# List the columns that remain once 'Embarked' is gone.
print("\nColumns after dropping 'Embarked':", data.columns, sep="\n")


Columns after dropping 'Embarked':
Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')


3. Handle missing values (Fill with median or most frequent):


In [None]:
# Per-column count of NaN entries in the current working frame.
print("\nMissing values in the dataset:", data.isna().sum(), sep="\n")



Missing values in the dataset:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [None]:
# Replace missing 'Age' and 'Fare' values with each column's median.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
imputer = SimpleImputer(strategy='median')
filled_age_fare = imputer.fit_transform(data[['Age', 'Fare']])
data[['Age', 'Fare']] = filled_age_fare


In [None]:
# One-hot encode 'Sex'; dropping the first level leaves a single indicator column.
data_encoded = pd.get_dummies(
    data=data,
    columns=['Sex'],
    drop_first=True,
)


In [None]:
# List the column labels produced by the encoding step.
print("\nColumns after encoding 'Sex':", data_encoded.columns, sep="\n")


Columns after encoding 'Sex':
Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male'], dtype='object')


In [None]:
# Confirm that no NaN entries remain in the (pre-encoding) frame.
print("\nMissing values after imputation:", data.isna().sum(), sep="\n")



Missing values after imputation:
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64


3. Splitting the dataset into features and target:


In [None]:
# 'Survived' is the label; every other column is a predictor.
y = data_encoded['Survived']
X = data_encoded.drop(columns='Survived')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(
    f"\nTraining feature set shape: {X_train.shape}, Testing feature set shape: {X_test.shape}"
)



Training feature set shape: (712, 6), Testing feature set shape: (179, 6)


4. Standardizing the data:


In [None]:
# Learn the per-column mean and standard deviation from the training rows
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Show the first five standardized training rows.
print("\nFirst few rows of scaled training data:", X_train_scaled[:5], sep="\n")



First few rows of scaled training data:
[[-1.61413602  1.25364106 -0.47072241 -0.47934164 -0.07868358  0.7243102 ]
 [-0.40055118 -0.47728355 -0.47072241 -0.47934164 -0.37714494  0.7243102 ]
 [ 0.81303367  0.21508629 -0.47072241 -0.47934164 -0.47486697  0.7243102 ]
 [ 0.81303367 -0.24649361  0.37992316 -0.47934164 -0.47623026  0.7243102 ]
 [ 0.81303367 -1.78509326  2.93185988  2.04874166 -0.02524937 -1.38062393]]


In [None]:
# Show the first five standardized testing rows.
print("\nFirst few rows of scaled testing data:", X_test_scaled[:5], sep="\n")


First few rows of scaled testing data:
[[ 0.81303367 -0.09263364  0.37992316  0.78470001 -0.33390078  0.7243102 ]
 [-0.40055118  0.13815631 -0.47072241 -0.47934164 -0.42528387  0.7243102 ]
 [ 0.81303367 -0.7080735  -0.47072241 -0.47934164 -0.47486697  0.7243102 ]
 [-0.40055118 -1.78509326 -0.47072241  0.78470001  0.00796649 -1.38062393]
 [ 0.81303367 -1.1696534   0.37992316 -0.47934164 -0.41100201 -1.38062393]]
