In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Colab-only step: attach Google Drive to the runtime's filesystem so the
# cells below can read and write files under the mount point.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the raw customer export on the mounted Google Drive
file_path = '/content/drive/My Drive/customers-100000.csv'

# Read the whole CSV into a DataFrame using pandas' default parsing options
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check of the columns
df.head(n=5)

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,ffeCAb7AbcB0f07,Jared,Jarvis,Sanchez-Fletcher,Hatfieldshire,Eritrea,274.188.8773x41185,001-215-760-4642x969,gabriellehartman@benjamin.com,2021-11-11,https://www.mccarthy.info/
1,2,b687FfC4F1600eC,Marie,Malone,Mckay PLC,Robertsonburgh,Botswana,283-236-9529,(189)129-8356x63741,kstafford@sexton.com,2021-05-14,http://www.reynolds.com/
2,3,9FF9ACbc69dcF9c,Elijah,Barrera,Marks and Sons,Kimbury,Barbados,8252703789,459-916-7241x0909,jeanettecross@brown.com,2021-03-17,https://neal.com/
3,4,b49edDB1295FF6E,Sheryl,Montgomery,"Kirby, Vaughn and Sanders",Briannaview,Antarctica (the territory South of 60 deg S),425.475.3586,(392)819-9063,thomassierra@barrett.com,2020-09-23,https://www.powell-bryan.com/
4,5,3dcCbFEB17CCf2E,Jeremy,Houston,Lester-Manning,South Brianna,Micronesia,+1-223-666-5313x4530,252-488-3850x692,rubenwatkins@jacobs-wallace.info,2020-09-18,https://www.carrillo.com/


***DATA CLEANING***

In [None]:
# --- Data cleaning: de-duplicate rows, then impute missing values ---------
#
# Steps:
#   1. Drop exact duplicate rows (once; a second pass would be a no-op).
#   2. Report the missing-value count per column.
#   3. Fill numeric columns with their median and object columns with their
#      most frequent value.
#   4. Report the missing-value counts again to confirm the result.

# Frame-level in-place drop: `df` stays the same object for later cells.
df.drop_duplicates(inplace=True)

# Missing-value report before imputation
print("\nMissing values per column:")
print(df.isnull().sum())

numeric_cols = df.select_dtypes(include='number').columns
categorical_cols = df.select_dtypes(include='object').columns

# Collect one fill value per column and apply them in a single frame-level
# call. The previous `df[col].fillna(..., inplace=True)` form operated on an
# intermediate Series; pandas warns that this pattern will stop updating the
# frame in pandas 3.0 and recommends `df.fillna({col: value}, inplace=True)`.
fill_values = {}

for col in numeric_cols:
    median_value = df[col].median()
    # An all-missing column has a NaN median; there is nothing useful to fill with.
    if pd.notna(median_value):
        fill_values[col] = median_value

for col in categorical_cols:
    modes = df[col].mode()
    # mode() is empty when the column has no non-null values; indexing [0]
    # on it would raise, so such columns are skipped instead.
    if not modes.empty:
        fill_values[col] = modes.iloc[0]

if fill_values:
    df.fillna(value=fill_values, inplace=True)

# Confirm all missing values are handled
print("\nMissing values after cleaning:")
print(df.isnull().sum())


Missing values per column:
Index                0
Customer Id          0
First Name           0
Last Name            0
Company              0
City                 0
Country              0
Phone 1              0
Phone 2              0
Email                0
Subscription Date    0
Website              0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)



Missing values after cleaning:
Index                0
Customer Id          0
First Name           0
Last Name            0
Company              0
City                 0
Country              0
Phone 1              0
Phone 2              0
Email                0
Subscription Date    0
Website              0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


***DATA TRANSFORMATION***

In [None]:
# Bucket customers into named age bands, but only when the frame actually
# has an 'age' column; otherwise this cell does nothing.
if 'age' in df.columns:
    # Six edges define five bands. pd.cut uses right-closed intervals by
    # default, so e.g. 18 lands in 'Teen' and values outside (0, 120] get NaN.
    age_edges = [0, 18, 30, 45, 60, 120]
    age_labels = ['Teen', 'Young Adult', 'Adult', 'Middle Aged', 'Senior']
    df['age_group'] = pd.cut(x=df['age'], bins=age_edges, labels=age_labels)



In [None]:
# Cap high-income outliers at the upper IQR fence (Q3 + 1.5 * IQR).
# Only the upper tail is capped; low values are left unchanged.
# Skipped entirely when the frame has no 'annual_income' column.
if 'annual_income' in df.columns:
    q1 = df['annual_income'].quantile(0.25)
    q3 = df['annual_income'].quantile(0.75)
    iqr = q3 - q1
    upper_limit = q3 + 1.5 * iqr
    # Vectorised clip replaces a per-row Python `min` via apply: same capping
    # rule, missing values stay missing, no Python-level loop over the rows.
    df['annual_income_capped'] = df['annual_income'].clip(upper=upper_limit)

In [None]:
# Encode gender as a 0/1 indicator when a 'gender' column is present.
if 'gender' in df.columns:
    # Any value other than these two exact strings maps to NaN.
    gender_codes = {'Male': 1, 'Female': 0}
    df['gender_binary'] = df['gender'].map(gender_codes)


In [None]:
# Build a combined 'full_name' column from the first- and last-name columns.
# The customers file previewed earlier in this notebook uses the headers
# 'First Name' / 'Last Name', so a snake_case-only check never matched and the
# column was silently not created. Both spellings are accepted; the first
# pair found in the frame wins, and the cell is a no-op if neither exists.
for first_col, last_col in (('first_name', 'last_name'), ('First Name', 'Last Name')):
    if {first_col, last_col}.issubset(df.columns):
        df['full_name'] = df[first_col] + ' ' + df[last_col]
        break


In [None]:
# Ratio of spending score to annual income, computed only when both
# source columns exist in the frame.
if all(name in df.columns for name in ('spending_score', 'annual_income')):
    # Swap zero incomes for 1 so the division never has a zero denominator.
    safe_income = df['annual_income'].replace(0, 1)
    df['score_per_income'] = df['spending_score'] / safe_income

***DATA SAVING***

In [None]:
# Persist the cleaned frame back to Google Drive as CSV.
output_path = '/content/drive/My Drive/cleaned_data.csv'

# index=False leaves the DataFrame's row index out of the written file.
df.to_csv(path_or_buf=output_path, index=False)

print("Cleaned data saved to:", output_path)

Cleaned data saved to: /content/drive/My Drive/cleaned_data.csv
