In [2]:
#  cleaning + before vs after + save cleaned CSV + txt report

from google.colab import files
import pandas as pd, numpy as np
from datetime import datetime
import os

# 1) Load dataset (tries common path)
path_candidates = ['/content/train.csv', '/mnt/data/train.csv']
for p in path_candidates:
    if os.path.exists(p):
        df = pd.read_csv(p); src = p; break
else:
    uploaded = files.upload()
    fn = list(uploaded.keys())[0]
    df = pd.read_csv(fn); src = fn

# 2) Before summary
before_shape = df.shape
before_missing = df.isnull().sum()
before_total_missing = int(before_missing.sum())
before_dups = int(df.duplicated().sum())

print("BEFORE:", before_shape, "| total missing:", before_total_missing, "| duplicates:", before_dups)
print(before_missing[before_missing>0])

# 3) Cleaning (compact)
df_clean = df.drop_duplicates().copy()
num_cols = df_clean.select_dtypes(include=[np.number]).columns
cat_cols = df_clean.select_dtypes(include=['object','category']).columns

# numeric -> mean, categorical -> mode, strip whitespace
for c in num_cols:
    if df_clean[c].isnull().any(): df_clean[c].fillna(df_clean[c].mean(), inplace=True)
for c in cat_cols:
    if df_clean[c].isnull().any():
        try: df_clean[c].fillna(df_clean[c].mode(dropna=True)[0], inplace=True)
        except: df_clean[c].fillna("Unknown", inplace=True)
    df_clean[c] = df_clean[c].apply(lambda x: x.strip() if isinstance(x, str) else x)

# 4) After summary
after_shape = df_clean.shape
after_total_missing = int(df_clean.isnull().sum().sum())
after_dups = int(df_clean.duplicated().sum())

print("\nAFTER:", after_shape, "| total missing:", after_total_missing, "| duplicates:", after_dups)

# 5) Save cleaned CSV and short text report
cleaned_fn = "cleaned_train.csv"
report_fn = "cleaning_report.txt"
df_clean.to_csv(cleaned_fn, index=False)

report_lines = [
    "Data Cleaning Report - Customer Churn",
    f"Source file: {src}",
    f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "",
    f"BEFORE: rows,cols = {before_shape}; total missing = {before_total_missing}; duplicates = {before_dups}",
]
report_lines += ["Missing per column (before):"] + [f"  {k}: {v}" for k,v in before_missing.items() if v>0]
report_lines += [
    "",
    f"AFTER: rows,cols = {after_shape}; total missing = {after_total_missing}; duplicates = {after_dups}",
    "",
    "Cleaning steps: dropped duplicates, numeric->mean imputation, categorical->mode imputation, stripped whitespace.",
    "",
    "Notes: consider advanced imputation (median/KNN) and encoding before modeling."
]
with open(report_fn,"w",encoding="utf-8") as f:
    f.write("\n".join(report_lines))

# 6) Download files
print(f"\nSaved: {cleaned_fn}, {report_fn}\nStarting downloads...")
files.download(cleaned_fn)
files.download(report_fn)


BEFORE: (891, 12) | total missing: 866 | duplicates: 0
Age         177
Cabin       687
Embarked      2
dtype: int64

AFTER: (891, 12) | total missing: 0 | duplicates: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  if df_clean[c].isnull().any(): df_clean[c].fillna(df_clean[c].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  try: df_clean[c].fillna(df_clean[c].mode(dropna=True)[0], inplace=True)
