In [None]:
Perform the following operations using Python on the Air quality and Heart Diseases data sets
a. Data cleaning
b. Data integration
c. Data transformation
d. Error correcting
e. Data model building

In [1]:
# NumPy: Numerical calculations ke liye use hoti hai (arrays, matrices, etc.)
import numpy as np

# Pandas: Data analysis aur data manipulation ke liye use hoti hai (DataFrames)
import pandas as pd

# Matplotlib: Data visualization ke liye line plots, bar charts banane ke liye use hoti hai
import matplotlib.pyplot as plt

# Seaborn: Advanced visualization ke liye use hoti hai, matplotlib ke upar built hai (e.g., heatmaps, pairplots)
import seaborn as sns

# Scikit-learn se train_test_split: Dataset ko training aur testing part mein divide karne ke liye
from sklearn.model_selection import train_test_split

# StandardScaler: Features ko scale/normalize karne ke liye (mean = 0, std = 1)
from sklearn.preprocessing import StandardScaler

# Accuracy Score aur Confusion Matrix: Model ki performance evaluate karne ke liye
from sklearn.metrics import accuracy_score, confusion_matrix

# Logistic Regression: Ek supervised machine learning model, jo classification problems solve karta hai
from sklearn.linear_model import LogisticRegression

In [2]:
# 2. Load the Heart Disease Dataset
df = pd.read_csv("heart_diese.csv")  # CSV file ko DataFrame mein read karna

In [3]:
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [4]:
# 3. Data Cleaning 🧹
# Null values check karo
print("Missing values in each column:\n", df.isnull().sum())

Missing values in each column:
 id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


In [5]:
# Duplicate rows ko remove karo
df.dropna(inplace=True)
df.drop_duplicates(inplace=True,ignore_index=True)

In [6]:
df


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,300,68,Male,Cleveland,asymptomatic,144.0,193.0,True,normal,141.0,False,3.4,flat,2.0,reversable defect,2
295,301,57,Male,Cleveland,asymptomatic,130.0,131.0,False,normal,115.0,True,1.2,flat,1.0,reversable defect,3
296,302,57,Female,Cleveland,atypical angina,130.0,236.0,False,lv hypertrophy,174.0,False,0.0,flat,1.0,normal,1
297,509,47,Male,Hungary,asymptomatic,150.0,226.0,False,normal,98.0,True,1.5,flat,0.0,reversable defect,1


In [7]:
 # Data types check karo
print("\nData Types:\n", df.dtypes)


Data Types:
 id            int64
age           int64
sex          object
dataset      object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalch      float64
exang        object
oldpeak     float64
slope        object
ca          float64
thal         object
num           int64
dtype: object


In [8]:
#DATA INTEGRATION=
df1 = df[['age', 'cp', 'chol', 'thalch']]
df2 = df[['exang', 'slope', 'num']]

# Integrating data using column-wise concatenation
merged_df = pd.concat([df1, df2], axis=1)

# Showing result
print("Shape of Merged DataFrame:", merged_df.shape)
print("Columns:", merged_df.columns.tolist())

Shape of Merged DataFrame: (299, 7)
Columns: ['age', 'cp', 'chol', 'thalch', 'exang', 'slope', 'num']


In [9]:
# Check for impossible values and print feedback ERROR CORRECTION
def check_and_correct(df):
    # Check 'trestbps' (blood pressure) within human physiological range
    invalid_trestbps = df[(df['trestbps'] <= 70) | (df['trestbps'] >= 250)]
    if not invalid_trestbps.empty:
        print(f"Invalid 'trestbps' values found: \n{invalid_trestbps}")
        # Optionally, replace invalid values with NaN or a valid range
        df.loc[(df['trestbps'] <= 70) | (df['trestbps'] >= 250), 'trestbps'] = None  # Or some default value

    # Check for negative or impossible 'age' and 'chol' values
    invalid_age = df[df['age'] <= 0]
    if not invalid_age.empty:
        print(f"Invalid 'age' values found: \n{invalid_age}")
        df.loc[df['age'] <= 0, 'age'] = None  # Replace invalid with NaN or a default

    invalid_chol = df[df['chol'] <= 0]
    if not invalid_chol.empty:
        print(f"Invalid 'chol' values found: \n{invalid_chol}")
        df.loc[df['chol'] <= 0, 'chol'] = None  # Replace invalid with NaN or a default
    
    return df

# Apply error correction
data = check_and_correct(df)

# Drop rows with NaN values (or fill them, depending on your preference)
data = df.dropna(subset=['trestbps', 'age', 'chol'])

In [10]:
data

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,300,68,Male,Cleveland,asymptomatic,144.0,193.0,True,normal,141.0,False,3.4,flat,2.0,reversable defect,2
295,301,57,Male,Cleveland,asymptomatic,130.0,131.0,False,normal,115.0,True,1.2,flat,1.0,reversable defect,3
296,302,57,Female,Cleveland,atypical angina,130.0,236.0,False,lv hypertrophy,174.0,False,0.0,flat,1.0,normal,1
297,509,47,Male,Hungary,asymptomatic,150.0,226.0,False,normal,98.0,True,1.5,flat,0.0,reversable defect,1


In [11]:
#DATA TRANSFORMATION - to remove outliers and normalise data
# Encoding categorical columns
df['sex'] = df['sex'].map({'male': 1, 'female': 0})
df['fbs'] = df['fbs'].astype(bool).astype(int)
df['exang'] = df['exang'].astype(bool).astype(int)

# One-hot encoding for multi-category columns
df = pd.get_dummies(df, columns=['cp', 'restecg', 'slope', 'thal', 'dataset'], drop_first=True)

# Normalize numeric features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[['age', 'trestbps', 'chol', 'thalch', 'oldpeak']] = scaler.fit_transform(df[['age', 'trestbps', 'chol', 'thalch', 'oldpeak']])

In [12]:
df

Unnamed: 0,id,age,sex,trestbps,chol,fbs,thalch,exang,oldpeak,ca,...,cp_non-anginal,cp_typical angina,restecg_normal,restecg_st-t abnormality,slope_flat,slope_upsloping,thal_normal,thal_reversable defect,dataset_Hungary,dataset_VA Long Beach
0,1,0.708333,,0.481132,0.286638,1,0.603053,0,0.370968,0.0,...,False,True,False,False,False,False,False,False,False,False
1,2,0.791667,,0.622642,0.400862,0,0.282443,1,0.241935,3.0,...,False,False,False,False,True,False,True,False,False,False
2,3,0.791667,,0.245283,0.278017,0,0.442748,1,0.419355,2.0,...,False,False,False,False,True,False,False,True,False,False
3,4,0.166667,,0.339623,0.323276,0,0.885496,0,0.564516,0.0,...,True,False,True,False,False,False,True,False,False,False
4,5,0.250000,,0.339623,0.224138,0,0.770992,0,0.225806,0.0,...,False,False,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,300,0.812500,,0.471698,0.200431,1,0.534351,0,0.548387,2.0,...,False,False,True,False,True,False,False,True,False,False
295,301,0.583333,,0.339623,0.066810,0,0.335878,1,0.193548,1.0,...,False,False,True,False,True,False,False,True,False,False
296,302,0.583333,,0.339623,0.293103,0,0.786260,0,0.000000,1.0,...,False,False,False,False,True,False,True,False,False,False
297,509,0.375000,,0.528302,0.271552,0,0.206107,1,0.241935,0.0,...,False,False,True,False,True,False,False,True,True,False


In [13]:
data

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,300,68,Male,Cleveland,asymptomatic,144.0,193.0,True,normal,141.0,False,3.4,flat,2.0,reversable defect,2
295,301,57,Male,Cleveland,asymptomatic,130.0,131.0,False,normal,115.0,True,1.2,flat,1.0,reversable defect,3
296,302,57,Female,Cleveland,atypical angina,130.0,236.0,False,lv hypertrophy,174.0,False,0.0,flat,1.0,normal,1
297,509,47,Male,Hungary,asymptomatic,150.0,226.0,False,normal,98.0,True,1.5,flat,0.0,reversable defect,1


In [19]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Assuming df is already loaded
# Create binary target column
df['target'] = df['num'].apply(lambda x: 1 if x > 0 else 0)

# Define features and target
X = df.drop(columns=['id', 'num', 'target'], errors='ignore')
y = df['target']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Logistic Regression
print("Logistic Regression Performance:")
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_imputed, y_train)
y_pred_log = log_model.predict(X_test_imputed)
print(classification_report(y_test, y_pred_log))

# Decision Tree Classifier
print("Decision Tree Classifier Performance:")
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_imputed, y_train)
y_pred_dt = dt_model.predict(X_test_imputed)
print(classification_report(y_test, y_pred_dt))

# Random Forest Classifier
print("Random Forest Classifier Performance:")
rf_model = RandomForestClassifier()
rf_model.fit(X_train_imputed, y_train)
y_pred_rf = rf_model.predict(X_test_imputed)
print(classification_report(y_test, y_pred_rf))

# Optional: Evaluate with regression metrics if needed (not recommended for classification)
print("Random Forest Regression-style Metrics (for analysis only):")
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred_rf)}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_pred_rf)}")
print(f"R-squared: {r2_score(y_test, y_pred_rf)}")


Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.94      0.89      0.91        35
           1       0.85      0.92      0.88        25

    accuracy                           0.90        60
   macro avg       0.90      0.90      0.90        60
weighted avg       0.90      0.90      0.90        60

Decision Tree Classifier Performance:
              precision    recall  f1-score   support

           0       0.81      0.83      0.82        35
           1       0.75      0.72      0.73        25

    accuracy                           0.78        60
   macro avg       0.78      0.77      0.78        60
weighted avg       0.78      0.78      0.78        60

Random Forest Classifier Performance:
              precision    recall  f1-score   support

           0       0.86      0.91      0.89        35
           1       0.87      0.80      0.83        25

    accuracy                           0.87        60
   macro avg       

