In [None]:
import pandas as pd

## Task 1 — Load and Explore Dataset

# Load the raw export; the path is relative to the notebook's working directory.
df = pd.read_csv("customer_churn_raw.csv")

# Jupyter renders only the LAST bare expression of a cell. Every intermediate
# result is therefore shown explicitly; as bare expressions these values were
# computed and silently discarded. `display` is provided by the IPython kernel.
display(df.head())        # first rows
df.info()                 # prints row/column counts, dtypes and non-null counts itself
display(df.describe())    # summary of numeric columns: look for abnormal min/max
display(df.isna().sum())  # number of missing values per column
print("Duplicate rows:", df.duplicated().sum())

# Show the distinct labels of every text column to expose inconsistent
# spellings, stray whitespace and placeholder tokens.
for col in df.select_dtypes(include='object').columns:
    print(col, df[col].unique())

## Task 2 — Data Quality Issues & Cleaning

# Convert TotalCharges to numeric; entries that cannot be parsed become NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Rows with a missing TotalCharges and Tenure == 0 get a total of 0.
df.loc[df['TotalCharges'].isna() & (df['Tenure'] == 0), 'TotalCharges'] = 0

# Clean category text: trim surrounding whitespace, then turn placeholder
# tokens into real missing values. A padded ' nan ' is not recognised as
# missing when the CSV is parsed, and after stripping it would remain the
# literal string 'nan', which isna() does not count.
for col in df.select_dtypes(include='object').columns:
    trimmed = df[col].str.strip()
    is_placeholder = trimmed.str.lower().isin(["nan", ""])
    df[col] = trimmed.mask(is_placeholder)  # mask() fills matches with NaN

# Collapse the long "no service" labels into a plain "No".
df = df.replace(["No internet service", "No phone service"], "No")

# Remove duplicates AFTER the text has been normalised, so rows that differ
# only by padding or label spelling are recognised as the same record.
df = df.drop_duplicates()

# Re-check missing values (displayed explicitly: a bare expression in the
# middle of a cell produces no output).
display(df.isna().sum())

# Save the cleaned dataset next to the raw file. A leading "/" would target
# the filesystem root, which is usually not writable and is not where the
# rest of the notebook keeps its data.
df.to_csv("customer_churn_clean.csv", index=False)





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215 entries, 0 to 1214
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        1215 non-null   int64  
 1   Age               1168 non-null   float64
 2   Gender            975 non-null    object 
 3   Tenure            1215 non-null   int64  
 4   MonthlyCharges    1215 non-null   float64
 5   TotalCharges      1184 non-null   float64
 6   ContractType      1215 non-null   object 
 7   InternetService   1215 non-null   object 
 8   OnlineBackup      1190 non-null   object 
 9   TechSupport       1193 non-null   object 
 10  PaymentMethod     1215 non-null   object 
 11  PaperlessBilling  1215 non-null   object 
 12  Churn             1215 non-null   int64  
dtypes: float64(3), int64(3), object(7)
memory usage: 123.5+ KB
Gender ['Male' 'Female' nan 'female' 'FEMALE' ' Male ' '?' 'male' ' MALE '
 'Other' ' nan ' ' Female ' 'MALE' ' female ']
Co

In [None]:
# Attach the user's Google Drive to the Colab runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'  # directory under which Drive is exposed
drive.mount(DRIVE_MOUNT_POINT)

In [28]:
import pandas as pd

CLEAN_CSV_PATH = "/content/customer_churn_clean_final.csv"

# Known spellings (trimmed, lowercase) -> canonical label, one table per column.
GENDER_LABELS = {
    'male': 'Male',
    'female': 'Female',
    'other': 'Other',
    '?': 'Other',
}

CONTRACT_LABELS = {
    'month to month': 'Month-to-Month',
    'month-to-month': 'Month-to-Month',
    'monthly': 'Month-to-Month',
    'm2m': 'Month-to-Month',

    'one year': 'One Year',
    '1 year': 'One Year',
    'oneyear': 'One Year',

    'two year': 'Two Year',
    '2 year': 'Two Year',
    'twoyear': 'Two Year',
}

PAYMENT_LABELS = {
    'mailed check': 'Mailed Check',
    'mail check': 'Mailed Check',

    'bank transfer': 'Bank Transfer',
    'banktransfer': 'Bank Transfer',

    'electronic check': 'Electronic Check',
    'electroniccheck': 'Electronic Check',
    'e-check': 'Electronic Check',

    'credit card': 'Credit Card',
    'creditcard': 'Credit Card',
}


def _normalize_labels(column: pd.Series, canonical: dict) -> pd.Series:
    """Trim and lowercase text labels, then map known spellings to one label.

    Args:
        column: Series of text labels; missing values are allowed.
        canonical: mapping from a trimmed, lowercase spelling to the label
            that should replace it.

    Returns:
        A new Series. Values not present in ``canonical`` are left trimmed
        and lowercased; missing values stay missing.
    """
    return column.str.strip().str.lower().replace(canonical)


clean_df = pd.read_csv(CLEAN_CSV_PATH)

# The file was written with its index, which comes back as an "Unnamed: 0"
# column. Drop it: it carries no information and, being unique per row, it
# would hide duplicate records from duplicated().
clean_df = clean_df.loc[:, ~clean_df.columns.str.startswith("Unnamed:")]

# Sanity checks on the loaded file. Each result is shown explicitly because
# only the last bare expression of a cell is rendered.
display(clean_df.isna().sum())                                 # missing values per column
print("Duplicate rows:", clean_df.duplicated().sum())          # expected: 0
print("TotalCharges dtype:", clean_df['TotalCharges'].dtype)   # expected: float64

# Unify the spelling of the categorical columns.
clean_df['Gender'] = _normalize_labels(clean_df['Gender'], GENDER_LABELS)
clean_df['ContractType'] = _normalize_labels(clean_df['ContractType'], CONTRACT_LABELS)
clean_df['PaymentMethod'] = _normalize_labels(clean_df['PaymentMethod'], PAYMENT_LABELS)

# Impute missing Gender with the most frequent label. mode() is empty when the
# column holds no values at all, so guard the lookup instead of raising.
gender_mode = clean_df['Gender'].mode()
if not gender_mode.empty:
    clean_df['Gender'] = clean_df['Gender'].fillna(gender_mode.iloc[0])

# Missing add-on flags are filled with 'No'.
clean_df['OnlineBackup'] = clean_df['OnlineBackup'].fillna('No')
clean_df['TechSupport'] = clean_df['TechSupport'].fillna('No')

# Check that inconsistent categories are gone. You should NOT see
# "No internet service", "No phone service", extra spaces or weird characters.
for col in clean_df.select_dtypes(include='object').columns:
    print(col, clean_df[col].unique())

clean_df.isna().sum()


Gender ['Male' 'Female' 'Other']
ContractType ['Two Year' 'One Year' 'Month-to-Month']
InternetService ['Fiber Optic' 'DSL' 'No']
OnlineBackup ['Yes' 'No']
TechSupport ['No' 'Yes']
PaymentMethod ['Mailed Check' 'Bank Transfer' 'Electronic Check' 'Credit Card']
PaperlessBilling ['Yes' 'No']


Unnamed: 0,0
CustomerID,0
Age,0
Gender,0
Tenure,0
MonthlyCharges,0
TotalCharges,0
ContractType,0
InternetService,0
OnlineBackup,0
TechSupport,0
