In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('/content/heart.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,3,145.0,233.0,1,0,150.0,0,2.3,0,0,1,1
1,37.0,1.0,2,130.0,250.0,0,1,187.0,0,3.5,0,0,2,1
2,41.0,0.0,1,130.0,204.0,0,0,172.0,0,1.4,2,0,2,1
3,56.0,1.0,1,120.0,,0,1,,0,0.8,2,0,2,1
4,57.0,0.0,0,120.0,354.0,0,1,163.0,1,0.6,2,0,2,1


In [4]:
df.isnull().sum()

Unnamed: 0,0
age,28
sex,13
cp,0
trestbps,19
chol,16
fbs,0
restecg,0
thalach,19
exang,0
oldpeak,0


In [5]:
df = df.dropna(subset=['age', 'sex','trestbps','chol','thalach'])

In [6]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [7]:
print(df.shape)

(218, 14)


In [8]:

duplicate_rows = df[df.duplicated()]
print(f"Number of duplicate rows: {len(duplicate_rows)}")


df = df.drop_duplicates()
print(f"Shape after removing duplicates: {df.shape}")

print(df.info())
print(df.isnull().sum())

Number of duplicate rows: 1
Shape after removing duplicates: (217, 14)
<class 'pandas.core.frame.DataFrame'>
Index: 217 entries, 0 to 300
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       217 non-null    float64
 1   sex       217 non-null    float64
 2   cp        217 non-null    int64  
 3   trestbps  217 non-null    float64
 4   chol      217 non-null    float64
 5   fbs       217 non-null    int64  
 6   restecg   217 non-null    int64  
 7   thalach   217 non-null    float64
 8   exang     217 non-null    int64  
 9   oldpeak   217 non-null    float64
 10  slope     217 non-null    int64  
 11  ca        217 non-null    int64  
 12  thal      217 non-null    int64  
 13  target    217 non-null    int64  
dtypes: float64(6), int64(8)
memory usage: 25.4 KB
None
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope    

In [9]:
# prompt: Perform integrate

# Assuming you want to integrate the 'age' and 'trestbps' columns
# using numerical integration (e.g., trapezoidal rule).

from scipy.integrate import trapz

# Extract relevant columns and convert to numpy arrays
age = df['age'].to_numpy()
trestbps = df['trestbps'].to_numpy()

# Calculate the integral using the trapezoidal rule
integral_result = trapz(trestbps, age)

print(f"The integral of trestbps with respect to age is: {integral_result}")


The integral of trestbps with respect to age is: -617.5


  integral_result = trapz(trestbps, age)


In [10]:
# prompt: peroform transformation

# Assuming you want to transform the 'age' column by
# applying a logarithmic transformation.

df['age_log'] = np.log(df['age'])

# Print the transformed DataFrame
print(df.head())

# You can also perform other transformations, such as:
# - Standardization: (x - mean) / std
# - Min-Max scaling: (x - min) / (max - min)
# - Box-Cox transformation
# - etc.

    age  sex  cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0   3     145.0  233.0    1        0    150.0      0      2.3   
1  37.0  1.0   2     130.0  250.0    0        1    187.0      0      3.5   
2  41.0  0.0   1     130.0  204.0    0        0    172.0      0      1.4   
4  57.0  0.0   0     120.0  354.0    0        1    163.0      1      0.6   
6  56.0  0.0   1     140.0  294.0    0        0    153.0      0      1.3   

   slope  ca  thal  target   age_log  
0      0   0     1       1  4.143135  
1      0   0     2       1  3.610918  
2      2   0     2       1  3.713572  
4      2   0     2       1  4.043051  
6      1   0     2       1  4.025352  


In [11]:
# prompt: perform Standardization

from sklearn.preprocessing import StandardScaler

# Assuming you want to standardize the 'age' and 'trestbps' columns.

# Create a StandardScaler object.
scaler = StandardScaler()

# Fit the scaler to the data and transform it.
df[['age_std', 'trestbps_std']] = scaler.fit_transform(df[['age', 'trestbps']])

# Print the transformed DataFrame.
print(df.head())

    age  sex  cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  63.0  1.0   3     145.0  233.0    1        0    150.0      0      2.3   
1  37.0  1.0   2     130.0  250.0    0        1    187.0      0      3.5   
2  41.0  0.0   1     130.0  204.0    0        0    172.0      0      1.4   
4  57.0  0.0   0     120.0  354.0    0        1    163.0      1      0.6   
6  56.0  0.0   1     140.0  294.0    0        0    153.0      0      1.3   

   slope  ca  thal  target   age_log   age_std  trestbps_std  
0      0   0     1       1  4.143135  0.938740      0.751724  
1      0   0     2       1  3.610918 -1.848823     -0.110151  
2      2   0     2       1  3.713572 -1.419967     -0.110151  
4      2   0     2       1  4.043051  0.295456     -0.684734  
6      1   0     2       1  4.025352  0.188242      0.464433  
