# In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from importlib import reload

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

#get the column name of data described in the dataset, ignore 0th row.
#dataset = pd.read_csv('C:/Users/ksmin/data_Science/dataset/bmi_data_lab2.csv', names= ['gender', 'age', 'height','weight','BMI'], skiprows=1)
#dataset.head(20)

# File path
file_path = r'C:/Users/ksmin/data_Science/dataset/bmi_data_lab2.csv'

# Read CSV file.
# Every statement after this one needs `dataset`, so a failed read is reported
# and then re-raised; otherwise the script would carry on and stop later with
# an unrelated NameError. Only the read itself sits inside the try body.
try:
    dataset = pd.read_csv(file_path)
except FileNotFoundError:
    print("File not found.")
    raise
except Exception as e:
    print("An error occurred while reading the file:", e)
    raise
else:
    print("File read successfully.")
    # Print the dataframe
    print(dataset.head())

# Print the dataset's column names and data types.
# Author's note: 150 rows, 5 columns; null values were confirmed in height and BMI.
dataset.info()

# Print dataset statistical data
print("\nDataset statistical data:")
print(dataset.describe())

# Look at the first 10 rows of the dataset.
# (This and the following bare expressions are only rendered by a notebook;
# run as a plain script they produce no output.)
dataset.head(10)

# Look at the last 10 rows of the dataset
dataset.tail(10)

# Check the dataset's column labels
dataset.columns

# Check the dataset's row index
dataset.index

# Count how many NaN values each column holds
dataset.isna().sum()

# Tally how many rows carry each BMI code 1-4 and show the tallies as a bar chart.
# Rows whose BMI is any other value (NaN included) are not counted.
status = ['weak', 'normal', 'overweight', 'obesity']
status_indices = [int((dataset['BMI'] == code).sum()) for code in range(1, 5)]

print(status_indices)

plt.figure(figsize=(6, 8))
plt.bar(status, status_indices)
plt.title("BMI Distribution")
plt.xlabel("Status")
plt.ylabel("Distribution")
plt.show()

# Draw histograms with seaborn.FacetGrid, one panel per BMI value (wrapping after 3 panels).
# Both map() calls target the same grid, so the height histogram (red) and the
# weight histogram (blue) are drawn into the same panels.
grid = sns.FacetGrid(dataset, col="BMI", col_wrap=3)
grid.map(plt.hist, 'Height (Inches)', bins=10, color="red")
grid.map(plt.hist, "Weight (Pounds)", bins=10, color="blue")
plt.show()

# Histogram for Height (Inches) for each BMI value
g = sns.FacetGrid(dataset, col="BMI", margin_titles=True)
g.map(plt.hist, "Height (Inches)", bins=10, color="steelblue")
plt.show()

# Histogram of BMI values, one panel per value of 'Sex'.
# NOTE(review): the original comment said "0 -> Female, 1 -> Male", but 'Sex' is only
# label-encoded further down in this file, so the panels here presumably still use
# the raw CSV values — confirm.
# No plt.show() directly follows this grid; the next one comes after the weight grid below.
bins = 10
facet = sns.FacetGrid(dataset, col='Sex')
facet = facet.map(plt.hist, 'BMI', bins=bins)

# Switch seaborn to the "whitegrid" style before drawing the weight histograms
sns.set(style="whitegrid")

# Histogram for Weight (Pounds) for each BMI value
g = sns.FacetGrid(dataset, col="BMI", margin_titles=True)
g.map(plt.hist, "Weight (Pounds)", bins=10, color="orange")
plt.show()


# Compare the three scalers on the raw height/weight columns: the output of each
# scaler becomes one labelled scatter series on a shared figure.
scalers = [
    ('Standard Scaler', StandardScaler()),
    ('Min-Max Scaler', MinMaxScaler()),
    ('Robust Scaler', RobustScaler()),
]
raw_features = dataset[['Height (Inches)', 'Weight (Pounds)']]

for scaler_name, scaler in scalers:
    scaled_data = scaler.fit_transform(raw_features)
    # Column 0 is the scaled height, column 1 the scaled weight.
    plt.scatter(scaled_data[:, 0], scaled_data[:, 1], label=scaler_name)

plt.title('Scaling Results')
plt.xlabel('Scaled Height')
plt.ylabel('Scaled Weight')
plt.legend()
plt.show()

# Identify missing values: per-column NaN counts plus the number of rows with at least one NaN
missing_values = dataset.isnull().sum()
print("Number of rows with missing values:", dataset.isnull().any(axis=1).sum())
print("Number of missing values for each column:")
print(missing_values)

# Convert the 'Sex' values to integer labels (this overwrites dataset['Sex'] in place)
label_encoder = LabelEncoder()
dataset['Sex'] = label_encoder.fit_transform(dataset['Sex'])

# One-hot encode the integer 'Sex' labels (reshaped into a single column first)
onehot_encoder = OneHotEncoder()
gender_encoded = onehot_encoder.fit_transform(dataset['Sex'].values.reshape(-1,1))

# Build df: keep only the first one-hot column, name it 'Sex', and place it in front of
# every dataset column except the first one.
# Author's note: the resulting 'Sex' column has Female -> 1.0, Male -> 0.0.
# NOTE(review): this assumes 'Sex' is the first column of dataset and that dataset has a
# default 0..n-1 index so the concat lines up row by row — confirm.
gender_encoded_df = pd.DataFrame(gender_encoded.toarray())
gender_encoded_df.drop(columns=[1], inplace=True)
gender_encoded_df.rename(columns={0:'Sex'}, inplace=True)
df = pd.concat([gender_encoded_df, dataset.iloc[:,1:]],axis=1)
# 1.0 -> female, 0.0 -> male
print(df.head(20))

# From here on dataset['Sex'] holds the LabelEncoder integers assigned above,
# while df['Sex'] holds the 1.0 / 0.0 one-hot value.

# Heights: flag values that are <= 0, > 100 or missing, blank the flagged cells
# to NaN, and print the row labels that were flagged.
height_values = df['Height (Inches)']
height_flagged = (height_values <= 0) | (height_values > 100) | height_values.isna()
h_indices = df.index[height_flagged.to_numpy()].tolist()
if h_indices:
    df.loc[height_flagged, 'Height (Inches)'] = np.nan
print(h_indices)

# Weights: flag values that are <= 0 or > 200 (missing weights are not flagged here),
# blank the flagged cells to NaN, and print the row labels that were flagged.
weight_values = df['Weight (Pounds)']
weight_flagged = (weight_values <= 0) | (weight_values > 200)
w_indices = df.index[weight_flagged.to_numpy()].tolist()
if w_indices:
    df.loc[weight_flagged, 'Weight (Pounds)'] = np.nan
print(w_indices)

# Calculate BMI
def bmiFunc(wt_pound, ht_inch):
    """Return the BMI category code (0-4) for a weight in pounds and a height in inches.

    The BMI value is ``703 * wt_pound / ht_inch**2`` and is bucketed as:

    * below 15.0 -> 0
    * 15.0 up to (not including) 16.0 -> 1
    * 16.0 up to (not including) 18.5 -> 2
    * 18.5 up to (not including) 23 -> 3
    * 23 and above -> 4

    Args:
        wt_pound: Weight in pounds (scalar).
        ht_inch: Height in inches (scalar).

    Returns:
        The integer category code, or NaN when either input is missing or the
        height is not positive (no BMI can be computed in those cases).
    """
    # A NaN BMI fails every comparison below, which used to make the function
    # fall off the end and return None; a zero height raised ZeroDivisionError.
    if pd.isnull(wt_pound) or pd.isnull(ht_inch) or ht_inch <= 0:
        return float("nan")

    bmi = 703 * wt_pound / (ht_inch ** 2)
    # The checks run in ascending order, so each one only needs its upper bound.
    if bmi < 15.0:
        return 0
    if bmi < 16.0:
        return 1
    if bmi < 18.5:
        return 2
    if bmi < 23:
        return 3
    return 4

# Replace NAN of BMI
def replace_nan_bmi(row):
    """Return the row's BMI, computing it from weight and height when it is missing.

    Args:
        row: Mapping-like row with "BMI", "Weight (Pounds)" and "Height (Inches)" entries.

    Returns:
        The existing "BMI" value if it is not null, otherwise the category
        returned by ``bmiFunc`` for the row's weight and height.
    """
    current_bmi = row["BMI"]
    if not pd.isnull(current_bmi):
        return current_bmi
    return bmiFunc(row["Weight (Pounds)"], row["Height (Inches)"])
# Remove rows with missing values
df_cleaned = df.dropna()

# Snapshot of df taken while it still contains its NaNs. Every fill strategy
# below works on its own copy of this snapshot. A plain `other = df` only binds
# a second name to the same DataFrame, which made the median / ffill / bfill
# steps run on data the mean step had already filled (so they changed nothing)
# and left all four result names pointing at one object.
df_with_gaps = df.copy()


def _fill_height_weight(frame, fill):
    """Return a copy of ``frame`` with height/weight gaps filled and BMI gaps recomputed.

    ``fill`` receives one column (a Series) and returns that column with its
    missing values filled in.
    """
    filled = frame.copy()
    for column in ("Height (Inches)", "Weight (Pounds)"):
        filled[column] = fill(filled[column])
    filled["BMI"] = filled.apply(replace_nan_bmi, axis=1)
    return filled


# Alternatively, fill missing values with mean, median, or using ffill / bfill methods
# Example: fill missing values with median
df_filled_median = _fill_height_weight(df_with_gaps, lambda column: column.fillna(column.median()))

# Example: fill missing values with forward fill (ffill)
df_filled_ffill = _fill_height_weight(df_with_gaps, pd.Series.ffill)

# Example: fill missing values with backward fill (bfill)
df_filled_bfill = _fill_height_weight(df_with_gaps, pd.Series.bfill)

# Example: fill missing values with mean.
# NOTE(review): the mean fill is still written into `df` itself, because the plotting
# and regression statements further down in this file read `df` in that filled state.
# df_filled_mean is a separate copy, so later writes to `df` no longer change it.
# Revisit if `df` is supposed to keep its NaNs.
for column in ("Height (Inches)", "Weight (Pounds)"):
    df[column] = df[column].fillna(df[column].mean())
df["BMI"] = df.apply(replace_nan_bmi, axis=1)
df_filled_mean = df.copy()

# Check the cleaned datasets: print the first 10 rows of each variant
print("\nCleaned dataset (removed rows with missing values):")
print(df_cleaned.head(10))

print("\nDataset with missing values filled using mean:")
print(df_filled_mean.head(10))

print("\nDataset with missing values filled using median:")
print(df_filled_median.head(10))

print("\nDataset with missing values filled using forward fill:")
print(df_filled_ffill.head(10))

print("\nDataset with missing values filled using backward fill:")
print(df_filled_bfill.head(10))

# Column summary of the working frame
df.info()

# Case 1: the frame with NaN rows dropped.
# NOTE(review): the original note read "drop max num 12(height)" — presumably up to
# 12 rows are dropped because of missing heights; confirm against the info() output.
df_cleaned.info()  

# Author's note: the mean, median, ffill and bfill frames have no NaN left
df_filled_mean.info()

# Scaling comparison, repeated for several versions of the data. Each figure overlays
# the StandardScaler, MinMaxScaler and RobustScaler output of the height/weight columns.
def _plot_scaling_results(frame):
    """Draw one 'Scaling Results' figure for the height/weight columns of ``frame``."""
    features = frame[['Height (Inches)', 'Weight (Pounds)']]
    named_scalers = (
        ('Standard Scaler', StandardScaler()),
        ('Min-Max Scaler', MinMaxScaler()),
        ('Robust Scaler', RobustScaler()),
    )
    for scaler_name, scaler in named_scalers:
        scaled = scaler.fit_transform(features)
        # Column 0 is the scaled height, column 1 the scaled weight.
        plt.scatter(scaled[:, 0], scaled[:, 1], label=scaler_name)

    plt.xlabel('Scaled Height')
    plt.ylabel('Scaled Weight')
    plt.title('Scaling Results')
    plt.legend()
    plt.show()


# Figures are drawn in this order: the working frame df, the mean-filled frame, the
# frame with NaN rows dropped, the backward-filled frame, the forward-filled frame.
for frame_to_plot in (df, df_filled_mean, df_cleaned, df_filled_bfill, df_filled_ffill):
    _plot_scaling_results(frame_to_plot)

# StandardScaler output for the first four columns of the NaN-dropped dataset
X = df_cleaned.iloc[:,0:4].values
s_x = StandardScaler().fit_transform(X)
s_x

# MinMaxScaler output for the same columns (author's note: normalizes to 0~1)
m_x = MinMaxScaler().fit_transform(X)
m_x

# RobustScaler output for the same columns (stored under the name max_x)
max_x = RobustScaler().fit_transform(X)
max_x

# Scatter plot of height against weight.
# NOTE(review): the original comment said "Clean dataset", but the frame plotted
# here is df, not df_cleaned — confirm which one is intended.
plt.title("height, weight Distribution")
plt.xlabel("Height")
plt.ylabel("Weight")
plt.scatter(df['Height (Inches)'],df['Weight (Pounds)'])
plt.show()

#Linear regression model.
def compute_regression_equation(df, name):
    """Fit weight as a linear function of height and plot the data with the fitted line.

    Rows containing a NaN in any column are dropped before fitting.

    Args:
        df: DataFrame with 'Height (Inches)' and 'Weight (Pounds)' columns.
        name: Label appended to the plot title.

    Returns:
        ``(intercept, slope, model)``. The target is passed to the model as a
        single-column frame, so intercept and slope come back as
        ``model.intercept_`` and ``model.coef_[0]`` rather than plain floats.
        When no complete rows remain the result is ``(None, None, None)``.
    """
    clean_df = df.dropna()
    if clean_df.empty:
        # Callers unpack three values, so the "no data" result has to be a
        # 3-tuple as well; a 2-tuple raised ValueError at the call site.
        return None, None, None
    X = clean_df[['Height (Inches)']]
    y = clean_df[['Weight (Pounds)']]
    model = LinearRegression()
    model.fit(X, y)

    plt.scatter(X, y, color='blue', label='Data Points')
    plt.plot(X, model.predict(X), color='red', label='Regression line')
    plt.xlabel('height')
    plt.ylabel('weight')
    plt.title('Linear Regression : {}'.format(name))
    plt.legend()

    plt.show()
    return model.intercept_, model.coef_[0], model

# Fit the regression on the whole frame first; the per-group fits below only run
# when that fit produced an intercept and a slope.
intercept, slope, model = compute_regression_equation(df,"Total Dataset")
if intercept is None or slope is None:
    print("Entire dataset is empty or insufficient data.")
else:
    # Compute linear regression equations for groups divided by gender and BMI.
    # Rows missing height or weight are removed from each group before fitting.
    # The *_female / *_male / *_obesity names bound here are used again by later
    # statements in this file, which run regardless of which branch was taken.
    female_df = df[df['Sex'] == 1.0].dropna(subset=['Height (Inches)', 'Weight (Pounds)']) #Female
    male_df = df[df['Sex'] == 0.0].dropna(subset=['Height (Inches)', 'Weight (Pounds)']) #Male
    # NOTE(review): this group is named "obesity" but selects BMI == 3.0, whereas the
    # status labels used for the bar chart earlier put 'overweight' at 3 and 'obesity'
    # at 4 — confirm which code is intended.
    obesity_df = df[df['BMI'] == 3.0].dropna(subset=['Height (Inches)', 'Weight (Pounds)'])  # Adjust accordingly for actual BMI categories

    intercept_female, slope_female, model_female = compute_regression_equation(female_df, "Female Dataset")
    intercept_male, slope_male, model_male = compute_regression_equation(male_df, "Male Dataset")
    intercept_obesity, slope_obesity, model_obesity = compute_regression_equation(obesity_df, "BMI Dataset")

# Check for outliers and missing data in height and weight
def Find_h_w(df, max_height=100, max_weight=200):
    """Flag unusable heights and weights in ``df`` and blank them to NaN in place.

    A value is unusable when it is missing, not positive, or above the given
    limit. Missing weights are flagged in the same way as missing heights, so
    both columns follow one rule.

    Args:
        df: DataFrame with 'Height (Inches)' and 'Weight (Pounds)' columns;
            flagged cells are overwritten with NaN.
        max_height: Largest height accepted as valid (default 100).
        max_weight: Largest weight accepted as valid (default 200).

    Returns:
        ``(h_indices, w_indices)``: lists of the row labels whose height,
        respectively weight, was flagged.
    """
    heights = df['Height (Inches)']
    weights = df['Weight (Pounds)']

    # Comparisons against NaN are False, so missing values need their own test.
    bad_height = heights.isna() | (heights <= 0) | (heights > max_height)
    bad_weight = weights.isna() | (weights <= 0) | (weights > max_weight)

    h_indices = df.index[bad_height.to_numpy()].tolist()
    w_indices = df.index[bad_weight.to_numpy()].tolist()

    # Write only when something was flagged, so an untouched column is left as it is.
    if h_indices:
        df.loc[bad_height, 'Height (Inches)'] = np.nan
    if w_indices:
        df.loc[bad_weight, 'Weight (Pounds)'] = np.nan
    return h_indices, w_indices

#Store parts that require linear regression for each data set
def Save_linear_regression(df,intercept, slope,model,h_indices,w_indices):
    """Fill flagged weights and heights in ``df`` from a fitted height -> weight line.

    Weights are filled first, using ``model.predict`` on the row's height.
    Heights are then filled by inverting the line, ``x = (y - b) / a``, using the
    row's weight. The two steps are independent: an empty ``w_indices`` no
    longer stops the heights from being filled.

    Args:
        df: DataFrame with 'Height (Inches)' and 'Weight (Pounds)' columns; modified in place.
        intercept: Intercept ``b`` of the line (scalar or one-element array).
        slope: Slope ``a`` of the line (scalar or one-element array).
        model: Fitted regressor exposing ``predict``; only used when ``w_indices`` is non-empty.
        h_indices: Row labels whose height should be filled.
        w_indices: Row labels whose weight should be filled.
    """
    height_col = 'Height (Inches)'
    weight_col = 'Weight (Pounds)'

    if len(w_indices) > 0:
        # Predict only from rows that actually have a height; a NaN predictor
        # cannot be used, so those rows keep their missing weight.
        known_heights = df.loc[w_indices, height_col].dropna()
        if not known_heights.empty:
            predicted = model.predict(known_heights.to_numpy().reshape(-1, 1))
            # predict may return an (n, 1) array; flatten it so each cell gets a scalar.
            df.loc[known_heights.index, weight_col] = np.asarray(predicted, dtype=float).ravel()

    if len(h_indices) > 0:
        # Reduce one-element arrays to plain floats before inverting the line.
        b = float(np.ravel(intercept)[0])
        a = float(np.ravel(slope)[0])
        weights = df.loc[h_indices, weight_col].to_numpy(dtype=float)
        # y = ax + b  ->  x = (y - b) / a
        df.loc[h_indices, height_col] = (weights - b) / a

# Rebuild the female / male / BMI == 3 subsets and collect, for each one, the row
# labels whose height or weight has to be imputed.
# .copy() gives each subset its own data, so the in-place writes made by Find_h_w and
# Save_linear_regression do not go through a slice of df. Find_h_w is called
# unconditionally: an empty subset simply yields two empty lists, so the
# *_indices_* names used below are always bound.
female_df = df.loc[df['Sex'] == 1.0].copy()
h_indices_female, w_indices_female = Find_h_w(female_df)

male_df = df.loc[df['Sex'] == 0.0].copy()
h_indices_male, w_indices_male = Find_h_w(male_df)

obesity_df = df.loc[df['BMI'] == 3.0].copy()
h_indices_obesity, w_indices_obesity = Find_h_w(obesity_df)


# Impute the flagged cells of every dataset with that dataset's own regression line
Save_linear_regression(df,intercept, slope,model,h_indices,w_indices)
Save_linear_regression(female_df,intercept_female, slope_female, model_female, h_indices_female,w_indices_female)
Save_linear_regression(male_df,intercept_male, slope_male,model_male,h_indices_male,w_indices_male)
Save_linear_regression(obesity_df,intercept_obesity, slope_obesity,model_obesity,h_indices_obesity,w_indices_obesity)

def plot_expected(df,model,h_indices,w_indices, name):
    """Plot a dataset, its regression line and the rows listed as imputed, then print those rows.

    Args:
        df: DataFrame with 'Height (Inches)' and 'Weight (Pounds)' columns.
        model: Fitted regressor whose ``predict`` gives the line drawn in red.
        h_indices: Row labels listed for height.
        w_indices: Row labels listed for weight.
        name: Label appended to the plot title.
    """
    height_col = 'Height (Inches)'
    weight_col = 'Weight (Pounds)'

    # All rows, as column vectors
    all_heights = df[height_col].values.reshape(-1, 1)
    all_weights = df[weight_col].values.reshape(-1, 1)

    # Highlighted (orange) points: the h_indices rows followed by the w_indices rows
    P_x = np.concatenate(
        [df.loc[h_indices, height_col].values, df.loc[w_indices, height_col].values]
    ).reshape(-1, 1)
    P_y = np.concatenate(
        [df.loc[h_indices, weight_col].values, df.loc[w_indices, weight_col].values]
    ).reshape(-1, 1)

    plt.scatter(all_heights, all_weights, color='blue', label='Data Points')
    # Regression line
    plt.plot(all_heights, model.predict(all_heights), color='red', label='Regression line')
    plt.scatter(P_x, P_y, color='orange', label='Predicted data')

    plt.title("Predicted distribution : {}".format(name))
    plt.xlabel('height')
    plt.ylabel('weight')
    plt.legend()

    # Show the figure, then print the highlighted heights and weights
    plt.show()
    print(P_x)
    print()
    print(P_y)

# Plot every dataset together with its regression line and its highlighted rows
plot_expected(df,model,h_indices,w_indices, "Total Dataset")
plot_expected(female_df,model_female, h_indices_female,w_indices_female, "Female Dataset")
plot_expected(male_df,model_male,h_indices_male,w_indices_male, "Male Dataset")
plot_expected(obesity_df,model_obesity,h_indices_obesity,w_indices_obesity, "BMI Dataset")

# Inspect the first 30 rows to check that former NaN cells now hold predicted values
df.head(30)

# Count the NaN values left in each column.
# Author's note: outliers were first turned into NaN and then replaced by predicted values.
df.isna().sum()