# Import libraries and data

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [21]:
# Load CVD_cleaned.csv into `df`.
# The location can be overridden through the CVD_DATA_PATH environment variable
# so the notebook is runnable on other machines; when the variable is unset the
# original local path is used unchanged.
import os

DATA_PATH = os.environ.get(
    "CVD_DATA_PATH",
    r"C:\Users\Lenovo\Desktop\notboak projects\h project\archive (1)\CVD_cleaned.csv",
)
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308849,Very Good,Within the past year,Yes,No,No,No,No,No,No,Male,25-29,168.0,81.65,29.05,No,4.0,30.0,8.0,0.0
308850,Fair,Within the past 5 years,Yes,No,No,No,No,Yes,No,Male,65-69,180.0,69.85,21.48,No,8.0,15.0,60.0,4.0
308851,Very Good,5 or more years ago,Yes,No,No,No,Yes,"Yes, but female told only during pregnancy",No,Female,30-34,157.0,61.23,24.69,Yes,4.0,40.0,8.0,4.0
308852,Very Good,Within the past year,Yes,No,No,No,No,No,No,Male,65-69,183.0,79.38,23.73,No,3.0,30.0,12.0,0.0


# Exploratory Data Analysis

In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(308854, 19)

In [4]:
# Per-column dtype and non-null count, printed by pandas.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   General_Health                308854 non-null  object 
 1   Checkup                       308854 non-null  object 
 2   Exercise                      308854 non-null  object 
 3   Heart_Disease                 308854 non-null  object 
 4   Skin_Cancer                   308854 non-null  object 
 5   Other_Cancer                  308854 non-null  object 
 6   Depression                    308854 non-null  object 
 7   Diabetes                      308854 non-null  object 
 8   Arthritis                     308854 non-null  object 
 9   Sex                           308854 non-null  object 
 10  Age_Category                  308854 non-null  object 
 11  Height_(cm)                   308854 non-null  float64
 12  Weight_(kg)                   308854 non-nul

In [5]:
# Basic statistics of numeric columns
# (count, mean, std, min, quartiles and max; the object-typed columns are not
# included in this summary).
numeric_stats = df.describe()
numeric_stats

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
count,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0
mean,170.615249,83.588655,28.626211,5.096366,29.8352,15.110441,6.296616
std,10.658026,21.34321,6.522323,8.199763,24.875735,14.926238,8.582954
min,91.0,24.95,12.02,0.0,0.0,0.0,0.0
25%,163.0,68.04,24.21,0.0,12.0,4.0,2.0
50%,170.0,81.65,27.44,1.0,30.0,12.0,4.0
75%,178.0,95.25,31.85,6.0,30.0,20.0,8.0
max,241.0,293.02,99.33,30.0,120.0,128.0,128.0


In [6]:
# Count the missing entries in every column (0 means the column is complete).
missing_values = df.isna().sum()
print(missing_values)


General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64


In [7]:
# Select every row that exactly repeats an earlier row; with keep='first' the
# first occurrence of each repeated row is not flagged.
duplicate_rows = df.loc[df.duplicated(keep='first')]

# How many repeated rows were found
num_duplicates = len(duplicate_rows)

# Report the count, then show the repeated rows themselves
print(f"Number of duplicate rows: {num_duplicates}")
duplicate_rows

Number of duplicate rows: 80


Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
46402,Good,Within the past year,Yes,No,No,No,Yes,No,No,Female,18-24,163.0,81.65,30.90,No,0.0,60.0,4.0,4.0
49287,Very Good,Within the past year,Yes,No,No,No,No,No,No,Female,35-39,160.0,72.57,28.34,Yes,0.0,60.0,30.0,4.0
75448,Excellent,Within the past year,Yes,No,No,No,No,No,No,Female,65-69,163.0,61.23,23.17,Yes,0.0,30.0,16.0,0.0
76857,Excellent,Within the past year,Yes,No,No,No,No,No,No,Male,40-44,173.0,81.65,27.37,No,0.0,30.0,8.0,1.0
78871,Good,Within the past year,Yes,No,No,No,No,No,No,Female,75-79,163.0,58.97,22.31,No,0.0,60.0,30.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301474,Good,Within the past year,Yes,No,No,No,No,No,Yes,Female,70-74,173.0,77.11,25.85,No,0.0,30.0,30.0,0.0
303040,Very Good,Within the past year,Yes,No,No,No,No,No,No,Female,55-59,168.0,74.84,26.63,No,0.0,30.0,30.0,4.0
303600,Good,Within the past year,Yes,No,No,No,No,No,No,Female,35-39,157.0,72.57,29.26,No,0.0,4.0,12.0,4.0
303609,Very Good,Within the past year,Yes,No,No,No,No,No,No,Female,70-74,160.0,58.97,23.03,No,0.0,30.0,4.0,4.0


In [8]:
# Remove duplicates and update the DataFrame.
# inplace=True mutates the object bound to `df` (nothing is returned), so any
# other name bound to that same object sees the change as well; keep='first'
# retains the first occurrence of each repeated row.
# NOTE(review): re-running the load cell afterwards rebinds `df` to a fresh,
# un-deduplicated frame, so this cell must be run again after any reload.
df.drop_duplicates(keep='first', inplace=True)


In [9]:
# Get the number of unique values in each column
# (useful for telling binary / low-cardinality columns from continuous ones).
unique_values_count = df.nunique()

unique_values_count

General_Health                     5
Checkup                            5
Exercise                           2
Heart_Disease                      2
Skin_Cancer                        2
Other_Cancer                       2
Depression                         2
Diabetes                           4
Arthritis                          2
Sex                                2
Age_Category                      13
Height_(cm)                       99
Weight_(kg)                      525
BMI                             3654
Smoking_History                    2
Alcohol_Consumption               31
Fruit_Consumption                 77
Green_Vegetables_Consumption      75
FriedPotato_Consumption           69
dtype: int64

# Visualization

In [38]:
# `df1` is a second name for the very same DataFrame object as `df` (plain
# assignment, no copy is made), so in-place changes through either name are
# visible through both.
df1=df
df1

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308849,Very Good,Within the past year,Yes,No,No,No,No,No,No,Male,25-29,168.0,81.65,29.05,No,4.0,30.0,8.0,0.0
308850,Fair,Within the past 5 years,Yes,No,No,No,No,Yes,No,Male,65-69,180.0,69.85,21.48,No,8.0,15.0,60.0,4.0
308851,Very Good,5 or more years ago,Yes,No,No,No,Yes,"Yes, but female told only during pregnancy",No,Female,30-34,157.0,61.23,24.69,Yes,4.0,40.0,8.0,4.0
308852,Very Good,Within the past year,Yes,No,No,No,No,No,No,Male,65-69,183.0,79.38,23.73,No,3.0,30.0,12.0,0.0


In [29]:
# List the column labels available for modelling.
print(df1.columns)

Index(['General_Health', 'Checkup', 'Exercise', 'Heart_Disease', 'Skin_Cancer',
       'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex',
       'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI', 'Smoking_History',
       'Alcohol_Consumption', 'Fruit_Consumption',
       'Green_Vegetables_Consumption', 'FriedPotato_Consumption'],
      dtype='object')


In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib  # Use joblib for model persistence

def train_and_save_model(df, target_column, model_path, feature_columns=None, class_weight=None):
    """Fit a logistic-regression classifier for one Yes/No column and save it.

    Rows with a missing value in any used column are dropped, and only rows
    whose target is exactly ``'No'`` or ``'Yes'`` are kept (any other answer
    is discarded). The target is encoded as 0/1, the data is split 80/10/10
    into stratified train/validation/test sets, and the model is fitted on
    the training part and written to ``model_path`` with joblib. Validation
    accuracy, test accuracy and a test-set classification report are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    target_column : str
        Name of the column to predict.
    model_path : str or path-like
        Destination file for the fitted model.
    feature_columns : sequence of str, optional
        Numeric predictor columns. Defaults to the seven body-measurement and
        consumption columns this notebook has always used.
    class_weight : dict or 'balanced', optional
        Forwarded to ``LogisticRegression``. The default ``None`` keeps the
        previous unweighted behaviour; ``'balanced'`` is worth trying when the
        positive class is rare and the report shows near-zero recall for it.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a feature column or ``target_column`` is absent from ``df``.
    ValueError
        If no 'No'/'Yes' rows remain after filtering, or if only one of the
        two classes is present (a classifier cannot be fitted in either case).
    """
    if feature_columns is None:
        feature_columns = [
            'Height_(cm)', 'Weight_(kg)', 'BMI', 'Alcohol_Consumption',
            'Fruit_Consumption', 'Green_Vegetables_Consumption',
            'FriedPotato_Consumption',
        ]
    else:
        feature_columns = list(feature_columns)

    # Keep only the columns that are used and drop rows with missing values.
    data = df[feature_columns + [target_column]].dropna()

    # Keep rows whose target is 'No' or 'Yes'. The explicit copy makes `data`
    # an independent frame, so the assignment below neither touches the
    # caller's frame nor triggers pandas' SettingWithCopyWarning when the
    # filter actually drops rows.
    data = data[data[target_column].isin(['No', 'Yes'])].copy()
    if data.empty:
        raise ValueError(
            f"No rows with a 'No'/'Yes' value left for target column {target_column!r}"
        )

    # Convert target values to numerical labels
    data[target_column] = data[target_column].map({'No': 0, 'Yes': 1})
    if data[target_column].nunique() < 2:
        raise ValueError(
            f"Target column {target_column!r} contains a single class after "
            "filtering; both 'No' and 'Yes' rows are required"
        )

    features_for_prediction = data[feature_columns]
    target = data[target_column]

    # 80% train, then the remaining 20% is halved into validation and test.
    # Both splits are stratified so each part keeps the class proportions.
    X_train, X_temp, y_train, y_temp = train_test_split(
        features_for_prediction, target,
        test_size=0.2, random_state=42, stratify=target,
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp,
    )

    # Create and train the logistic regression model
    model = LogisticRegression(random_state=42, class_weight=class_weight)
    model.fit(X_train, y_train)

    # Save the trained model to a file
    joblib.dump(model, model_path)

    # Model evaluation on the validation set
    y_val_pred = model.predict(X_val)
    accuracy_val = accuracy_score(y_val, y_val_pred)
    print(f"Validation Accuracy for {target_column}: {accuracy_val:.4f}")

    # Model evaluation on the test set
    y_test_pred = model.predict(X_test)
    accuracy_test = accuracy_score(y_test, y_test_pred)
    print(f"Test Accuracy for {target_column}: {accuracy_test:.4f}")

    # zero_division=0 reports 0.00 for a class that is never predicted (the
    # same numbers as before) without emitting an undefined-metric warning.
    print(f"Classification Report for {target_column} on Test Set:")
    print(classification_report(y_test, y_test_pred, zero_division=0))

# Fit and persist one classifier per condition column. Each model is written
# to "<column>.pkl", a path relative to the current working directory.
# NOTE(review): the recorded test supports (30886 rows) equal 10% of the
# 308854 rows present before de-duplication - presumably the load cell was
# re-run after the duplicate drop; confirm with Restart & Run All.
df1 =df

for target_name in (
    'Skin_Cancer',
    'Other_Cancer',
    'Depression',
    'Arthritis',
    'Diabetes',
    'Heart_Disease',
):
    train_and_save_model(df1, target_name, f'{target_name}.pkl')


Validation Accuracy for Skin_Cancer: 0.9029
Test Accuracy for Skin_Cancer: 0.9029
Classification Report for Skin_Cancer on Test Set:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     27886
           1       0.00      0.00      0.00      3000

    accuracy                           0.90     30886
   macro avg       0.45      0.50      0.47     30886
weighted avg       0.82      0.90      0.86     30886



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy for Other_Cancer: 0.9033
Test Accuracy for Other_Cancer: 0.9033
Classification Report for Other_Cancer on Test Set:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95     27898
           1       0.00      0.00      0.00      2988

    accuracy                           0.90     30886
   macro avg       0.45      0.50      0.47     30886
weighted avg       0.82      0.90      0.86     30886



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy for Depression: 0.7995
Test Accuracy for Depression: 0.7989
Classification Report for Depression on Test Set:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89     24696
           1       0.36      0.00      0.01      6190

    accuracy                           0.80     30886
   macro avg       0.58      0.50      0.45     30886
weighted avg       0.71      0.80      0.71     30886

Validation Accuracy for Arthritis: 0.6748
Test Accuracy for Arthritis: 0.6757
Classification Report for Arthritis on Test Set:
              precision    recall  f1-score   support

           0       0.68      0.98      0.80     20779
           1       0.54      0.06      0.11     10107

    accuracy                           0.68     30886
   macro avg       0.61      0.52      0.45     30886
weighted avg       0.64      0.68      0.57     30886

Validation Accuracy for Diabetes: 0.8644
Test Accuracy for Diabetes: 0.8650
Classificatio

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
