In [1]:
# Advanced Data Cleaning Project in Pandas (Google Colab Compatible)

# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import chardet

# Step 2: Load Data from Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Provide the path to your dataset
file_path = "/content/drive/MyDrive/Data_Analyst/Portfolio_Projects/Projects/Python/Customer_Call_List.xlsx"

df = pd.read_excel(file_path)
df

# Detect file encoding
with open(file_path, 'rb') as f:
    raw_data = f.read(10000)
    result = chardet.detect(raw_data)
    detected_encoding = result['encoding']
    print(f"Detected Encoding: {detected_encoding}")

# Load dataset with detected encoding
df = pd.read_excel(file_path)
df

Mounted at /content/drive
Detected Encoding: None


Unnamed: 0,Index,Date Created,Customer ID,First Name,Last Name,Phone Number,Address,Paying Customer,Do Not Contact,Not Useful Column,Number of Calls,Call Duration Minutes,Total Spend,Customer Satisfaction Score
0,0,2022-01-01,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True,7.0,4.26,132.01,9.0
1,1,2022-01-31,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False,15.0,,0.0,1.0
2,2,2022-03-02,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True,,9.66,,9.0
3,3,2022-04-01,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True,8.0,6.35,143.57,
4,4,2022-05-01,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True,7.0,14.77,305.47,9.0
5,5,2022-05-31,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True,19.0,7.53,64.09,8.0
6,6,2022-06-30,1007,Jeff,Winger,,1209 South Street,No,No,False,11.0,13.04,0.0,1.0
7,7,2022-07-30,1008,Sherlock,Holmes,876|678|3469,98 Clue Drive,N,No,False,11.0,10.52,0.0,8.0
8,8,2022-08-29,1009,Gandalf,,N/a,123 Middle Earth,Yes,,False,4.0,7.31,,8.0
9,9,2022-09-28,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True,8.0,1.19,467.0,3.0


In [2]:
# Step 3: Understanding the Dataset
print("\nBasic Info:")
df.info()
print("\nSummary Statistics:")
print(df.describe())
print("\nChecking Missing Values:")
print(df.isnull().sum())



Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Index                        21 non-null     int64         
 1   Date Created                 21 non-null     datetime64[ns]
 2   Customer ID                  21 non-null     int64         
 3   First Name                   21 non-null     object        
 4   Last Name                    20 non-null     object        
 5   Phone Number                 19 non-null     object        
 6   Address                      21 non-null     object        
 7   Paying Customer              21 non-null     object        
 8   Do Not Contact               17 non-null     object        
 9   Not Useful Column            21 non-null     bool          
 10  Number of Calls              18 non-null     float64       
 11  Call Duration Minutes        18 no

In [3]:
# Step 4: Handling Missing Values
# Fill missing numerical values with median
df.fillna(df.median(numeric_only=True), inplace=True)
df=df.fillna('')
df


Unnamed: 0,Index,Date Created,Customer ID,First Name,Last Name,Phone Number,Address,Paying Customer,Do Not Contact,Not Useful Column,Number of Calls,Call Duration Minutes,Total Spend,Customer Satisfaction Score
0,0,2022-01-01,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True,7.0,4.26,132.01,9.0
1,1,2022-01-31,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False,15.0,6.89,0.0,1.0
2,2,2022-03-02,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True,9.0,9.66,170.255,9.0
3,3,2022-04-01,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True,8.0,6.35,143.57,7.5
4,4,2022-05-01,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True,7.0,14.77,305.47,9.0
5,5,2022-05-31,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True,19.0,7.53,64.09,8.0
6,6,2022-06-30,1007,Jeff,Winger,,1209 South Street,No,No,False,11.0,13.04,0.0,1.0
7,7,2022-07-30,1008,Sherlock,Holmes,876|678|3469,98 Clue Drive,N,No,False,11.0,10.52,0.0,8.0
8,8,2022-08-29,1009,Gandalf,,N/a,123 Middle Earth,Yes,,False,4.0,7.31,170.255,8.0
9,9,2022-09-28,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True,8.0,1.19,467.0,3.0


In [5]:
# Step 5: Standardizing Data Formats
df["Last Name"] = df["Last Name"].str.strip("123._/")
df["Phone Number"] = df["Phone Number"].str.replace('-','')
df["Phone Number"] = df["Phone Number"].str.replace('|','')
df["Phone Number"] = df["Phone Number"].str.replace('/','')
#df["Phone Number"] = df["Phone Number"].str.replace('\','')
df["Do Not Contact"] = df["Do Not Contact"].str.replace('Yes','Y')
df["Do Not Contact"] = df["Do Not Contact"].str.replace('No','N')
df["Paying Customer"] = df["Paying Customer"].str.replace('Yes','Y')
df["Paying Customer"] = df["Paying Customer"].str.replace('No','N')

df[["Street Address", "State", "Zip Code"]] = df["Address"].str.split(',', n=2, expand=True)
df.drop(columns=["Address"], inplace=True)
df


Unnamed: 0,Index,Date Created,Customer ID,First Name,Last Name,Phone Number,Paying Customer,Do Not Contact,Not Useful Column,Number of Calls,Call Duration Minutes,Total Spend,Customer Satisfaction Score,Street Address,State,Zip Code
0,0,2022-01-01,1001,Frodo,Baggins,1235455421,Y,N,True,7.0,4.26,132.01,9.0,123 Shire Lane,Shire,
1,1,2022-01-31,1002,Abed,Nadir,1236439775,N,Y,False,15.0,6.89,0.0,1.0,93 West Main Street,,
2,2,2022-03-02,1003,Walter,White,,N,,True,9.0,9.66,170.255,9.0,298 Drugs Driveway,,
3,3,2022-04-01,1004,Dwight,Schrute,1235432345,Y,Y,True,8.0,6.35,143.57,7.5,980 Paper Avenue,Pennsylvania,18503.0
4,4,2022-05-01,1005,Jon,Snow,8766783469,Y,N,True,7.0,14.77,305.47,9.0,123 Dragons Road,,
5,5,2022-05-31,1006,Ron,Swanson,3047622467,Y,Y,True,19.0,7.53,64.09,8.0,768 City Parkway,,
6,6,2022-06-30,1007,Jeff,Winger,,N,N,False,11.0,13.04,0.0,1.0,1209 South Street,,
7,7,2022-07-30,1008,Sherlock,Holmes,8766783469,N,N,False,11.0,10.52,0.0,8.0,98 Clue Drive,,
8,8,2022-08-29,1009,Gandalf,,Na,Y,,False,4.0,7.31,170.255,8.0,123 Middle Earth,,
9,9,2022-09-28,1010,Peter,Parker,1235455421,Y,N,True,8.0,1.19,467.0,3.0,25th Main Street,New York,


In [6]:
# Convert date columns to datetime
df["Date Created"] = pd.to_datetime(df["Date Created"])
# Convert numerical column to appropriate format
df["Number of Calls"] = pd.to_numeric(df["Number of Calls"])
df["Call Duration Minutes"] = pd.to_numeric(df["Call Duration Minutes"])
df["Total Spend"] = pd.to_numeric(df["Total Spend"])
df["Customer Satisfaction Score"] = pd.to_numeric(df["Customer Satisfaction Score"])
df

Unnamed: 0,Index,Date Created,Customer ID,First Name,Last Name,Phone Number,Paying Customer,Do Not Contact,Not Useful Column,Number of Calls,Call Duration Minutes,Total Spend,Customer Satisfaction Score,Street Address,State,Zip Code
0,0,2022-01-01,1001,Frodo,Baggins,1235455421,Y,N,True,7.0,4.26,132.01,9.0,123 Shire Lane,Shire,
1,1,2022-01-31,1002,Abed,Nadir,1236439775,N,Y,False,15.0,6.89,0.0,1.0,93 West Main Street,,
2,2,2022-03-02,1003,Walter,White,,N,,True,9.0,9.66,170.255,9.0,298 Drugs Driveway,,
3,3,2022-04-01,1004,Dwight,Schrute,1235432345,Y,Y,True,8.0,6.35,143.57,7.5,980 Paper Avenue,Pennsylvania,18503.0
4,4,2022-05-01,1005,Jon,Snow,8766783469,Y,N,True,7.0,14.77,305.47,9.0,123 Dragons Road,,
5,5,2022-05-31,1006,Ron,Swanson,3047622467,Y,Y,True,19.0,7.53,64.09,8.0,768 City Parkway,,
6,6,2022-06-30,1007,Jeff,Winger,,N,N,False,11.0,13.04,0.0,1.0,1209 South Street,,
7,7,2022-07-30,1008,Sherlock,Holmes,8766783469,N,N,False,11.0,10.52,0.0,8.0,98 Clue Drive,,
8,8,2022-08-29,1009,Gandalf,,Na,Y,,False,4.0,7.31,170.255,8.0,123 Middle Earth,,
9,9,2022-09-28,1010,Peter,Parker,1235455421,Y,N,True,8.0,1.19,467.0,3.0,25th Main Street,New York,


In [11]:
df=df.fillna('')

# Step 6: Detect & Remove Duplicates
df.drop_duplicates(inplace=True)
for x in df.index:
    if df.loc[x, "Do Not Contact"] == 'Y':
        df.drop(x, inplace=True)

for x in df.index:
    if df.loc[x, "Not Useful Column"] == True:
        df.drop(x, inplace=True)

for x in df.index:
    if df.loc[x, "Phone Number"] == '':
        df.drop(x, inplace=True)

for x in df.index:
    if df.loc[x, "Phone Number"] == 'N/a':
        df.drop(x, inplace=True)

for x in df.index:
    if df.loc[x, "Phone Number"] == 'na':
        df.drop(x, inplace=True)


df

Unnamed: 0,Index,Date Created,Customer ID,First Name,Last Name,Phone Number,Paying Customer,Do Not Contact,Not Useful Column,Number of Calls,Call Duration Minutes,Total Spend,Customer Satisfaction Score,Street Address,State,Zip Code
0,7,2022-07-30,1008,sherlock,holmes,8766783469,n,n,False,11.0,10.52,0.0,8.0,98 clue drive,,
2,12,2022-12-27,1013,don,draper,1235432345,y,n,False,12.0,6.4,306.7,3.0,2039 main street,,
3,13,2023-01-26,1014,leslie,knope,8766783469,y,n,False,6.0,1.22,284.38,3.0,343 city parkway,,
4,14,2023-02-25,1015,toby,flenderson,3047622467,n,n,False,2.0,4.23,0.0,1.0,214 hr avenue,,
5,15,2023-03-27,1016,ron,weasley,1235455421,n,n,False,1.0,4.37,0.0,5.0,2395 hogwarts avenue,,
6,16,2023-04-26,1017,michael,scott,1236439775,y,n,False,12.0,10.57,170.255,10.0,121 paper avenue,pennsylvania,


In [12]:
# Step 7: Cleaning String Data
str_cols = df.select_dtypes(include=['object']).columns
for col in str_cols:
    df[col] = df[col].str.strip().str.lower().str.replace("[^a-zA-Z0-9 ]", "", regex=True)

df


Unnamed: 0,Index,Date Created,Customer ID,First Name,Last Name,Phone Number,Paying Customer,Do Not Contact,Not Useful Column,Number of Calls,Call Duration Minutes,Total Spend,Customer Satisfaction Score,Street Address,State,Zip Code
0,7,2022-07-30,1008,sherlock,holmes,8766783469,n,n,False,11.0,10.52,0.0,8.0,98 clue drive,,
2,12,2022-12-27,1013,don,draper,1235432345,y,n,False,12.0,6.4,306.7,3.0,2039 main street,,
3,13,2023-01-26,1014,leslie,knope,8766783469,y,n,False,6.0,1.22,284.38,3.0,343 city parkway,,
4,14,2023-02-25,1015,toby,flenderson,3047622467,n,n,False,2.0,4.23,0.0,1.0,214 hr avenue,,
5,15,2023-03-27,1016,ron,weasley,1235455421,n,n,False,1.0,4.37,0.0,5.0,2395 hogwarts avenue,,
6,16,2023-04-26,1017,michael,scott,1236439775,y,n,False,12.0,10.57,170.255,10.0,121 paper avenue,pennsylvania,


In [13]:
# Step 8: Final Dataset
df = df.reset_index(drop=True)
df

print("\nFinal Cleaned Data:")
#print(df.head())

# Save cleaned data
df.to_csv('/content/drive/My Drive/cleaned_dataset.csv', index=False)

print("Data Cleaning Completed! 🚀")



Final Cleaned Data:
Data Cleaning Completed! 🚀
