In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Read the two source tables from CSV files.
# Both paths are relative, so the files must sit in the kernel's working directory.
df_calories = pd.read_csv(filepath_or_buffer="calories.csv")
df_exercise = pd.read_csv(filepath_or_buffer="exercise_dataset.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'calories.csv'

Imported pandas and loaded the required training datasets.

In [None]:
# Sanity check: show the first rows of each table to confirm both loaded.
for frame in (df_calories, df_exercise):
    print(frame.head())

In [None]:
# Summarise column names, dtypes and non-null counts for both tables.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df_calories.info()
df_exercise.info()

### Data cleaning and handling missing values
Rows that have missing values are removed.
Alternatively, missing values can be filled with the mean, median, or mode, depending on the data type: `df.fillna(df.mean(), inplace=True)  # For numerical columns`
In this scenario, I am dropping the rows with missing values.
Also, the Z-score can be used to remove outliers (values which might make the model slow or give inaccurate answers).


In [None]:
# Clean both tables in place: first drop rows containing any missing value,
# then drop rows that are exact duplicates of an earlier row.
for frame in (df_exercise, df_calories):
    frame.dropna(inplace=True)
    frame.drop_duplicates(inplace=True)

### Starting the Merge procedure for both the datasets 

In [None]:
# Strip the space from the per-weight column names of df_exercise
# ('130 lb' -> '130lb', ...) so they match the labels produced by the
# weight-category mapping defined later in the notebook.
weight_columns = ['130 lb', '155 lb', '180 lb', '205 lb']
compact_names = dict(zip(weight_columns, (label.replace(" ", "") for label in weight_columns)))
df_exercise.rename(columns=compact_names, inplace=True)

In [None]:
# Map a body weight to the label of the weight bracket it belongs to
def map_weight_to_column(weight):
    """Return the weight-bracket label ('130lb', '155lb', '180lb' or '205lb') for `weight`.

    The first bracket whose upper bound is >= `weight` wins. Anything above 180,
    and any value that compares false against every bound (e.g. NaN), falls
    into the last bracket, '205lb'.

    NOTE(review): the bounds mirror the pound-based bracket names, while the 3D
    scatter cell in this notebook labels Weight as kg -- confirm the unit of the
    Weight column.
    """
    bounded_brackets = ((130, '130lb'), (155, '155lb'), (180, '180lb'))
    for upper_bound, label in bounded_brackets:
        if weight <= upper_bound:
            return label
    # Everything heavier than the last bounded bracket shares the top label.
    return '205lb'

In [None]:
# Label every row of df_calories with the weight bracket of its 'Weight' value
df_calories['Weight_Category'] = df_calories['Weight'].map(map_weight_to_column)

In [None]:
# Randomly assign an Activity to each user, since the first dataset does not
# record which activity each user performed.
# A dedicated, seeded generator is used so that the assignment -- and every
# table, plot and score derived from it -- is identical on each
# "Restart Kernel -> Run All". An unseeded draw changes on every run.
activity_rng = np.random.default_rng(42)
df_calories['Activity'] = activity_rng.choice(
    df_exercise['Activity'].to_numpy(),
    size=len(df_calories),
    replace=True,  # sample with replacement: the same activity may be given to many users
)


In [None]:
# Make both sides of the later column lookup plain strings:
# the column labels of df_exercise and the bracket labels stored per user.
df_exercise.columns = df_exercise.columns.astype(dtype=str)
df_calories['Weight_Category'] = df_calories['Weight_Category'].astype(dtype=str)


In [None]:
# Column labels must be strings so they can be looked up by bracket label below
df_exercise.columns = df_exercise.columns.astype(str)

# Left join on 'Activity': every user row is kept and gains the activity's columns
df_merged = pd.merge(df_calories, df_exercise, on='Activity', how='left')


def _calories_for_row(row):
    """Return the value stored in the column named by the row's Weight_Category (None if absent)."""
    return row.get(str(row['Weight_Category']), None)


# Pick, row by row, the per-weight column that matches the user's bracket
df_merged['Calories_Burned'] = df_merged.apply(_calories_for_row, axis=1)


In [None]:
# Sanity check: print the merged table to confirm that the merge produced the expected rows and columns
print(df_merged)

### Starting with feature engineering 

In [None]:
# Feature engineering: total calories = looked-up Calories_Burned value x Duration.
# Note: this column is computed from Calories_Burned, which is later used as the
# regression target, so it carries the target's information.
df_merged['Total_Calories_Burned'] = df_merged['Calories_Burned'].mul(df_merged['Duration'])


In [None]:
# Show the rows of the merged table for which no Calories_Burned value was found
missing_mask = df_merged['Calories_Burned'].isna()
missing_data = df_merged.loc[missing_mask]
print(missing_data)

Missing values can either be dropped or replaced by other values. I am replacing them with the mean (average).


In [None]:
# Sum Calories_Burned per activity and draw the totals as a bar chart
calories_per_activity = df_merged.groupby('Activity')['Calories_Burned'].sum()
activity_ax = calories_per_activity.plot(kind='bar', figsize=(10, 5))
activity_ax.set_title('Calories Burned per Activity')
activity_ax.set_ylabel('Calories Burned')
plt.show()


### Visualizing the data 

status = success

In [None]:
# Summary statistics (as computed by DataFrame.describe) for the merged table
print(df_merged.describe())


### Visualizing some more data to gain insights


In [None]:
# 3D scatter of Duration (x), Calories_Burned (y) and Weight (z);
# points are coloured by their Calories_Burned value.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(projection='3d')

ax.scatter(
    df_merged['Duration'],
    df_merged['Calories_Burned'],
    df_merged['Weight'],
    c=df_merged['Calories_Burned'],
    cmap='coolwarm',
    s=50,
)

# Axis labels and title in a single call
ax.set(
    xlabel='Duration (mins)',
    ylabel='Calories Burned',
    zlabel='Weight (kg)',
    title='3D Scatter Plot: Duration vs Calories vs Weight',
)

plt.show()

In [None]:
# Total Calories_Burned for every (Activity, Weight_Category) pair
grouped = df_merged.groupby(['Activity', 'Weight_Category'])['Calories_Burned'].sum().reset_index()

# Pivot once and take BOTH the bar heights and the tick labels from the pivot.
# pivot() orders its index/columns by sorted label, whereas unique() returns
# labels in first-appearance order; deriving the labels separately could
# therefore attach a weight-category label to the wrong row of bars.
heights = grouped.pivot(index='Weight_Category', columns='Activity', values='Calories_Burned').fillna(0)
activities = heights.columns.to_numpy()
weights = heights.index.to_numpy()

# Integer grid positions: x runs over activities, y over weight categories.
# meshgrid returns arrays of shape (len(weights), len(activities)), matching `heights`.
x_pos = np.arange(len(activities))
y_pos = np.arange(len(weights))
x, y = np.meshgrid(x_pos, y_pos)

# Bars start at z = 0, have a 0.5 x 0.5 footprint, and their height is the summed calories
z = np.zeros_like(x)
dx = dy = 0.5
dz = heights.to_numpy().flatten()

# Plotting
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
ax.bar3d(x.flatten(), y.flatten(), z.flatten(), dx, dy, dz, shade=True, cmap='viridis')

# Labels: ticks sit at the centre of each bar footprint
ax.set_xticks(x_pos + dx / 2)
ax.set_xticklabels(activities, rotation=45)
ax.set_yticks(y_pos + dy / 2)
ax.set_yticklabels(weights)
ax.set_zlabel('Calories Burned')

plt.title('3D Bar Plot: Calories Burned by Activity & Weight Category')
plt.show()


In [None]:


# Interactive 3D scatter: one marker per row of df_merged,
# coloured by Activity and sized by Calories_Burned.
scatter_options = dict(
    x='Duration',
    y='Calories_Burned',
    z='Weight',
    color='Activity',
    size='Calories_Burned',
    title='Interactive 3D Scatter Plot',
)
fig = px.scatter_3d(df_merged, **scatter_options)
fig.show()


## Data Preprocessing
Involves:
1. Encoding categorical variables,
2. Feature scaling (e.g., for KNN or SVM),
3. Splitting the dataset (training set and testing set)
### Encoding Categorical Variables
Since we have categorical columns like 'Activity' and 'Weight_Category', we'll apply encoding techniques so that they can be readily used by our ML model later:
- Label Encoding (for ordinal categories, if any).
- One-Hot Encoding (for nominal categories like 'Activity').

In [None]:
# Check for categorical columns
categorical_cols = df_merged.select_dtypes(include=['object']).columns
print(categorical_cols)


In [None]:
# Applying One-hot encoding
df_encoded = pd.get_dummies(df_merged, columns=['Activity', 'Weight_Category'], drop_first=True) #

# Check the new dataframe
print(df_encoded.head())

- Above , drop_first=True helps avoid the dummy variable trap (multicollinearity).
- Multicollinearity occurs when two or more explanatory variables in a multiple regression model are highly linearly related. 

### Splitting the data into Training set and Testing set

In [None]:
# Columns that must NOT be used as features because they leak the target:
#   * 'Total_Calories_Burned' is computed as Calories_Burned * Duration;
#   * '130lb' ... '205lb' are the per-weight columns that Calories_Burned was
#     copied from during the merge step, so one of them equals the target.
# Leaving them in X lets the model read the answer directly and produces
# misleadingly perfect scores.
# NOTE(review): any other column of the source tables that directly records
# calories should be added to this list -- verify against the CSV headers.
leak_columns = ['Total_Calories_Burned', '130lb', '155lb', '180lb', '205lb']

# Features: everything except the target and the leaking columns
# (errors='ignore' so the cell still runs if a leak column is absent).
X = df_encoded.drop(columns='Calories_Burned').drop(columns=leak_columns, errors='ignore')
# Target: calories burned for the user's activity and weight bracket
y = df_encoded['Calories_Burned']

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model Selection : Using Linear Regression 

Applying linear regression 

**Fixing non-numeric columns so that Linear Regression can run properly**
**Since Linear Regression requires numeric data to run**

In [None]:
# Identify non-numeric columns
non_numeric_cols = X_train.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols)

# Apply one-hot encoding to non-numeric columns
X_train = pd.get_dummies(X_train, columns=non_numeric_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=non_numeric_cols, drop_first=True)

# Align train and test sets to have the same columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [None]:
# Ordinary least-squares linear regression fitted on the (unscaled) training split
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the test set
y_pred = lr_model.predict(X_test)

# Plot actual vs predicted values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.6, label='Actual vs Predicted')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--', label='Ideal Line')

# Add labels and title
plt.xlabel('Actual Calories Burned')
plt.ylabel('Predicted Calories Burned')
plt.title('Actual vs Predicted Calories Burned (Linear Regression)')
plt.legend()

# Show the plot
plt.show()

**Additional Visualizations**
  1. Residual Plot : helps diagnose model performance
  2. Feature Importance Plot :  shows the contribution of each feature to the model.

In [None]:
# Residual = actual value minus predicted value, one per test row
residuals = y_test - y_pred

# Residuals against predictions; the dashed line marks a residual of zero
resid_fig, resid_ax = plt.subplots(figsize=(8, 6))
resid_ax.scatter(y_pred, residuals, color='green', alpha=0.6)
resid_ax.axhline(y=0, color='red', linestyle='--')
resid_ax.set_xlabel('Predicted Calories Burned')
resid_ax.set_ylabel('Residuals')
resid_ax.set_title('Residual Plot')
plt.show()

In [None]:
# One fitted coefficient per feature column of the training matrix
coefficients = lr_model.coef_
feature_names = X_train.columns

# Horizontal bar per feature showing its coefficient
coef_fig, coef_ax = plt.subplots(figsize=(10, 6))
coef_ax.barh(feature_names, coefficients, color='purple')
coef_ax.set_xlabel('Coefficient Value')
coef_ax.set_ylabel('Feature')
coef_ax.set_title('Feature Importance (Linear Regression Coefficients)')
plt.show()

**As can be seen above, my feature importance plot isn't showing proper values, or the coefficients seem off.**
The reason might be scaling issues or multicollinearity. This case looks like a scaling problem.
Scaling issues: Linear regression coefficients are sensitive to the scale of the features. If some features are on a much larger scale than others, their coefficients might dominate the plot.
The solution is to scale my features before fitting the model (e.g., using StandardScaler or MinMaxScaler).

**Using StandardScaler** To Scale properly and visualize the feature plot correctly 
 

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Second linear model, trained on the standardized features
lr_model_scaled = LinearRegression().fit(X_train_scaled, y_train)

# Coefficients of the scaled model, paired with the original column names
coefficients = lr_model_scaled.coef_
feature_names = X_train.columns

# Horizontal bar per feature showing its coefficient on the scaled data
scaled_fig, scaled_ax = plt.subplots(figsize=(10, 6))
scaled_ax.barh(feature_names, coefficients, color='purple')
scaled_ax.set_xlabel('Coefficient Value (Scaled Features)')
scaled_ax.set_ylabel('Feature')
scaled_ax.set_title('Feature Importance (Linear Regression Coefficients)')
plt.show()

**SUCCESS**

Now We have to Evaluate our model to check how well it is performing 
Evaluation of model here can be done by 
1. MSE
2. RMSE
3. R²
 
**Evaluating the model using Mean Squared error on the Test Set**  
Calculating square root of MSE to get RMSE 

In [None]:
# Predicting on the test set
y_pred = lr_model.predict(X_test)

# Calculating evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Displaying the results
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


**Detailed Evaluation and Visualization using all three: plus residual analysis is also used**

In [None]:
# Predictions
y_pred = lr_model.predict(X_test)

# Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# R-squared (R²) Score
r2 = r2_score(y_test, y_pred)

# Residual Analysis
residuals = y_test - y_pred

# Plotting Residuals
plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Analysis')
plt.show()

# Printing the Metrics
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R² Score: {r2}')


**Optionally we can Check for outliers if any and re-evaluate the model**

Steps to Fix Outliers and Re-evaluate the Model :
1. **Detect Outliers**
Use statistical methods like Z-score, IQR (Interquartile Range), or visualization techniques like boxplots to detect outliers.

from scipy.stats import zscore
#### Calculate Z-scores for the features
z_scores = zscore(X_train)

#### Define a threshold (e.g., 3 or -3)
threshold = 3
outliers = (z_scores > threshold) | (z_scores < -threshold)

#### Print number of outliers
print("Number of outliers:", outliers.sum())

## Alternatively, use the IQR method:
Q1 = X_train.quantile(0.25)
Q3 = X_train.quantile(0.75)
IQR = Q3 - Q1

#### Define outliers
outliers = ((X_train < (Q1 - 1.5 * IQR)) | (X_train > (Q3 + 1.5 * IQR))).any(axis=1)
print("Number of outliers:", outliers.sum())

2. **Handle Outliers**
We can either remove outliers or transform them (e.g., using log transformation or capping).

Option 1: Remove Outliers
X_train_no_outliers = X_train[~outliers]

y_train_no_outliers = y_train[~outliers]

Option 2: Transform Outliers
Cap outliers to a specific value (e.g., 95th percentile).
upper_limit = X_train.quantile(0.95)
lower_limit = X_train.quantile(0.05)
X_train_capped = X_train.clip(lower_limit, upper_limit, axis=1)


3. **Re-train the Model**
Train your linear regression model on the cleaned dataset (without outliers or with transformed outliers).

from sklearn.linear_model import LinearRegression
lr_model_no_outliers = LinearRegression()
lr_model_no_outliers.fit(X_train_no_outliers, y_train_no_outliers)

4. **Re-evaluate the Model**
Evaluate the model on the test set (or use cross-validation) and compare the metrics with the previous results.
from sklearn.metrics import mean_squared_error, r2_score
#### Predict on the test set
y_pred_no_outliers = lr_model_no_outliers.predict(X_test)
#### Calculate metrics
mse = mean_squared_error(y_test, y_pred_no_outliers)
r2 = r2_score(y_test, y_pred_no_outliers)
print("Mean Squared Error (after fixing outliers):", mse)
print("R² Score (after fixing outliers):", r2)


**Expected Changes in Metrics**

R² Score: If outliers were negatively affecting the model, fixing them may improve the R² score (closer to 1).

Mean Squared Error (MSE): Outliers often increase MSE. Fixing them may reduce the MSE.

Mean Absolute Error (MAE): Similar to MSE, MAE may also decrease after handling outliers.

Validating the model on **new** data

- We can test the model on a new dataset (if available) to ensure it generalizes well.
- Since I can't find any new dataset for this, we can use **cross-validation** to further validate the model's performance.

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model on the training split, scored with R²
scores = cross_val_score(estimator=lr_model, X=X_train, y=y_train, scoring='r2', cv=5)
mean_r2 = scores.mean()
print("Cross-Validation R² Scores:", scores)
print("Mean R²:", mean_r2)

**As can be seen above, we now get very good scores.**
Since the scores seem too perfect, this may indicate one of the following:

**Potential Overfitting:**
If your model is achieving near-perfect scores, it might be overfitting the training data. Overfitting occurs when the model learns noise or specific patterns in the training data that do not generalize to new data.

**Data Leakage:**
Ensure there is no data leakage (e.g., the target variable or related information accidentally being included in the features).

**Simple Dataset:**
If your dataset is very simple or has a strong linear relationship between features and the target, such high R² scores might be realistic.

Optionally we can check this , but let it slide for now
