In [1]:
import pandas as pd
import numpy as np

# Show all rows
pd.set_option('display.max_rows', None)

# Load data
df = pd.read_csv("C:/Users/prave/Downloads/food_coded.csv")

# Show before cleaning (entire DataFrame subset)
print("Before Cleaning:")
print(df[['GPA', 'weight', 'calories_day', 'comfort_food_reasons']])

# Clean GPA and weight
df['GPA'] = pd.to_numeric(df['GPA'], errors='coerce')
df['weight'] = df['weight'].astype(str).str.extract(r'(\d+\.?\d*)')
df['weight'] = pd.to_numeric(df['weight'], errors='coerce')

# Drop columns with more than 20% missing values
df = df.dropna(thresh=len(df) * 0.8, axis=1)

# Drop duplicate column if exists
if 'comfort_food_reasons_coded.1' in df.columns:
    df = df.drop(columns=['comfort_food_reasons_coded.1'])

# Fill missing numeric with mean
for col in df.select_dtypes(include='number'):
    df[col].fillna(df[col].mean(), inplace=True)

# Fill missing categorical with mode
for col in df.select_dtypes(include='object'):
    df[col].fillna(df[col].mode()[0], inplace=True)

# Show after cleaning (entire DataFrame subset)
print("\nAfter Cleaning:")
print(df[['GPA', 'weight', 'calories_day', 'comfort_food_reasons']])

Before Cleaning:
            GPA                    weight  calories_day  \
0           2.4                       187           NaN   
1         3.654                       155           3.0   
2           3.3  I'm not answering this.            4.0   
3           3.2             Not sure, 240           3.0   
4           3.5                       190           2.0   
5          2.25                       190           3.0   
6           3.8                       180           3.0   
7           3.3                       137           3.0   
8           3.3                       180           NaN   
9           3.3                       125           3.0   
10          3.5                       116           3.0   
11        3.904                       110           4.0   
12          3.4                       264           3.0   
13          3.6                       123           3.0   
14          3.1                       185           3.0   
15          NaN                       1

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
