In [None]:
from google.colab import files

# Prompt for kaggle.json. The return value of files.upload() holds the raw
# contents of every uploaded file; binding it to a name keeps it from being
# echoed as the cell output, which would otherwise store the Kaggle API token
# in the saved notebook.
kaggle_upload = files.upload()

In [None]:
# Place the uploaded API token where the Kaggle CLI is pointed at below.
# Create the config directory (no error if it already exists).
!mkdir -p ~/.kaggle
# Copy the token from the current working directory into that directory.
!cp kaggle.json ~/.kaggle/
# Restrict the token to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!pip install kaggle

In [None]:
!kaggle datasets download -d redwankarimsony/heart-disease-data -p/content/heart-disease --unzip

In [None]:
import pandas as pd

# Read the unzipped CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer='/content/heart-disease/heart_disease_uci.csv')

In [None]:
# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

In [None]:
# List every column name in the loaded frame.
print(df.columns)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Fill gaps in every numeric column with that column's mean.
# NOTE(review): the means are taken over the whole frame, i.e. before the
# train/test split performed later in the notebook.
numeric_cols = df.select_dtypes(include='number').columns
column_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(value=column_means)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# One histogram panel per numeric column.
df.hist(column=list(numeric_cols), figsize=(15, 10))
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, annotated cell by cell.
corr_matrix = df[numeric_cols].corr()
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)
plt.title(label='Numeric Feature Correlations')
plt.show()



# **Day 02 - Model Training**

In [None]:
# Names of the object-typed columns to one-hot encode; the target 'num'
# is left out if it happens to be among them.
cat_cols = [
    col
    for col in df.select_dtypes(include='object').columns
    if col != 'num'
]

In [None]:
# Features: every column except the target. A row identifier carries no
# clinical signal and lets the model memorise row order, so it is removed too.
# NOTE(review): assumes the identifier column is called 'id' - confirm against
# the df.columns output; errors='ignore' makes this a no-op if it is absent.
X = df.drop(columns='num').drop(columns='id', errors='ignore')

# Binary target: 1 when 'num' is greater than 0, otherwise 0.
y = (df['num'] > 0).astype(int)

In [None]:
# Expand each categorical column into indicator columns.
# NOTE: this reassigns X, so re-running the cell without rebuilding X first
# will fail because the original categorical columns are gone.
X = pd.get_dummies(data=X, columns=cat_cols)
print("Final feature columns:", X.columns)

**Day 3: Advanced models & feature engineering**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 20% of the rows for evaluation. stratify=y keeps the share of
# positive and negative cases the same in both parts, so the test metrics are
# not skewed by an unlucky split; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

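Standardization (z-score): $z = \dfrac{x - \mu}{\sigma}$, where $\mu$ is the feature's mean and $\sigma$ its standard deviation, both computed on the training set.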

In [None]:
# Learn the per-feature mean and standard deviation from the training rows
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression computes a class probability from a weighted sum of the features.

In [None]:
# Create a logistic-regression classifier with default settings and fit it
# on the scaled training data.
lr_model = LogisticRegression().fit(X_train_scaled, y_train)
lr_model


**Model Evaluation**

In [None]:
from sklearn.metrics import accuracy_score , classification_report

In [None]:
# Predict on the held-out rows, then report accuracy and per-class metrics.
y_pred_lr = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
print(classification_report(y_test, y_pred_lr))

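**Rule of thumb: an accuracy score between roughly 75% and 98% is generally considered good. Also check the precision and recall in the classification report above.**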

**Day 4: Random Forest and Feature Importance**

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Counts of true vs. predicted labels for the logistic-regression model,
# drawn as an annotated heatmap with integer cell labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
sns.heatmap(data=cm, cmap='Blues', fmt='d', annot=True)
plt.title(label='Confusion Matrix (Logistic Regression)')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Train a 100-tree random forest (fixed seed) on the scaled training data and
# predict the held-out rows. The model is fitted on scaled inputs, so anything
# fed to it later must go through the same scaler.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)
y_pred_rf = rf_model.predict(X_test_scaled)

In [None]:
# Share of held-out rows the random forest classified correctly.
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

Feature Importance

In [None]:
# Pair each importance score with its feature name and chart the ten largest.
feat_imp = pd.Series(data=rf_model.feature_importances_, index=X.columns)
feat_imp.nlargest(n=10).plot.barh()
plt.title('Random Forest Feature Importance')
plt.show()

Save the Model

In [None]:
import joblib

# Persist the fitted forest and the scaler it was trained with; both are
# needed to score new data later.
joblib.dump(value=rf_model, filename='heart_rf_model.pkl')
joblib.dump(value=scaler, filename='heart_scaler.pkl')

In [None]:
# Write the first encoded feature row to CSV (without the index) as a template
# showing the columns the model was trained on.
sample = X.head(1)
sample.to_csv('Heart_user_template.csv', index=False)
# The message previously ended with a stray, unbalanced apostrophe.
print("User Template saved as 'Heart_user_template.csv'")

**Day 05 - User Upload & Prediction**

In [None]:
from google.colab import files

# Prompt for the user's CSV. Binding the result to a name keeps the raw file
# contents from being echoed as the cell output; only the file names are shown.
# The prediction cell reads 'heart_dataset.csv', so the upload must use that name.
user_upload = files.upload()
print("Uploaded files:", list(user_upload))

In [None]:
import joblib
import pandas as pd

# Read the user's rows; the file must have been uploaded under this exact name.
user_df = pd.read_csv('heart_dataset.csv')

# Derive the column groups from the training frame `df`, keeping only the
# columns the user file actually contains. Separate names are used so the
# notebook-level `numeric_cols` / `cat_cols` from the training cells are not
# overwritten.
user_numeric_cols = [
    col for col in df.select_dtypes(include='number').columns if col in user_df.columns
]
user_cat_cols = [
    col for col in df.select_dtypes(include='object').columns if col in user_df.columns
]
user_bool_cols = [
    col for col in df.select_dtypes(include='bool').columns if col in user_df.columns
]

# Fill missing numeric values with the TRAINING means. The upload's own mean is
# NaN for a single-row file (or an entirely empty column), which would pass NaN
# on to the scaler and model, and it would also differ from how the training
# data was imputed.
if user_numeric_cols:
    train_means = df[user_numeric_cols].mean()
    user_df[user_numeric_cols] = user_df[user_numeric_cols].fillna(train_means)

# Missing categories get a placeholder label; its indicator column is not among
# the training features, so the alignment step below drops it and the row ends
# up with zeros for that category.
for col in user_cat_cols:
    user_df[col] = user_df[col].fillna('unknown')
for col in user_bool_cols:
    user_df[col] = user_df[col].astype(int)

# One-hot encode the categorical columns.
user_df_encoded = pd.get_dummies(user_df, columns=user_cat_cols)

# Align to the training feature layout: same columns, same order, 0 where the
# upload has no matching column; extra columns are discarded.
user_df_encoded = user_df_encoded.reindex(columns=X.columns, fill_value=0)

# Scale with the scaler fitted during training.
# NOTE: joblib.load unpickles the file - only load artifacts you created yourself.
scaler = joblib.load('heart_scaler.pkl')
user_df_scaled = scaler.transform(user_df_encoded)

# Predict and attach the result to the user's rows.
model = joblib.load('heart_rf_model.pkl')
prediction = model.predict(user_df_scaled)
user_df['Heart_Disease_Prediction'] = prediction

# Show the rows together with their predictions.
print(user_df)