In [2]:
import pandas as pd
import numpy as np
import io
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# --- In-memory stand-in for "Telecom_Customer_Churn.csv" ---
# The sample below is embedded so the notebook runs without any external file.
# Data-quality problems visible in the sample rows:
#   * one exact duplicate record (7892-POOKP appears twice)
#   * lowercase 'no' / 'yes' in Dependents and Churn (1212-GLHMD)
#   * 'E-check' written instead of 'Electronic check' (5129-JLPIS, 5555-JASAF)
#   * extreme tenure / MonthlyCharges / TotalCharges values (6565-YOLYY)
#   * a blank tenure field (2222-MYGNB)
csv_data = """customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No
9763-GRSKD,Male,0,Yes,Yes,13,Yes,No,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Mailed check,49.95,587.45,No
7469-LKBCI,Male,0,No,No,16,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),18.95,326.8,No
8091-TTVAX,Male,0,Yes,No,58,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,One year,No,Credit card (automatic),100.35,5934.9,No
1212-GLHMD,Male,0,No,no,1,Yes,No,DSL,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,45.7,45.7,yes
0280-XJGEX,Male,0,No,No,49,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes
5129-JLPIS,Male,0,No,No,25,Yes,No,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,E-check,105.5,2686.05,No
6565-YOLYY,Female,0,Yes,Yes,720,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),1000,73651.8,No
8665-UTGGF,Male,0,Yes,Yes,1,Yes,No,DSL,No,Yes,No,No,No,No,Month-to-month,No,Electronic check,45.25,45.25,Yes
1111-NEVSU,Female,1,Yes,No,25,Yes,Yes,Fiber optic,Yes,Yes,Yes,No,No,No,Month-to-month,Yes,Electronic check,94.0,2343.3,No
2222-MYGNB,Female,0,No,Yes,,Yes,No,DSL,No,Yes,Yes,Yes,No,No,One year,No,Mailed check,62.9,2390.1,No
3333-MKOXR,Male,0,Yes,No,72,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),118.6,8672.45,No
4444-MCHMG,Female,0,Yes,Yes,10,Yes,No,DSL,No,Yes,No,No,Yes,Yes,Month-to-month,Yes,Mailed check,61.45,633.3,No
5555-JASAF,Male,1,No,No,12,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,E-check,74.5,934.35,Yes
"""
# ----------------------------------------------------

# With the real dataset on disk, the sample above is unnecessary; load it with:
#   file_path = "Telecom_Customer_Churn.csv"
#   df = pd.read_csv(file_path)

# Task 1: Import the dataset (here: parse the embedded CSV text)
# Wrapping the text in a StringIO gives read_csv a file-like object to read from.
csv_file = io.StringIO(initial_value=csv_data)
df = pd.read_csv(filepath_or_buffer=csv_file)

# Task 2: Explore the dataset
# Quick look at size, first rows, dtypes, null counts and the raw labels of two
# columns whose casing is inconsistent in the sample.
print("--- 2. Initial Data Exploration ---")

frame_shape = df.shape
print("Shape:", frame_shape)

first_rows = df.head()
print("\nHead:\n", first_rows)

print("\nInfo:")
df.info()  # info() writes its report itself, so it is not wrapped in print()

missing_per_column = df.isna().sum()
print("\nInitial Missing Values:\n", missing_per_column)

churn_labels = df['Churn'].unique()
dependents_labels = df['Dependents'].unique()
print("\nUnique 'Churn' values:", churn_labels)
print("Unique 'Dependents' values:", dependents_labels)

# Task 3: Handle missing values
print("\n--- 3. Handling Missing Values ---")
# In the bundled sample only `tenure` contains a blank. The TotalCharges and
# Dependents handling below is defensive: it is a no-op on the sample but keeps
# the pipeline working if the input has gaps in those columns.

# Strategy 1: TotalCharges -> numeric; anything unparsable becomes NaN and is then
# filled with 0 (assumption carried over from the original analysis: a missing
# total means nothing has been billed yet -- confirm against the real data).
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

# Strategy 2: rows with no tenure are dropped rather than imputed.
df.dropna(subset=['tenure'], inplace=True)

# Strategy 3: Dependents -> fill with the most common label.
# The labels are not yet case-normalized at this point ('No' vs 'no' both occur),
# so the mode is taken over title-cased values; otherwise the counts for one
# logical label would be split across its spellings.
dependents_mode = df['Dependents'].dropna().astype(str).str.title().mode()
if not dependents_mode.empty:  # an all-missing column has no mode to fill with
    df['Dependents'] = df['Dependents'].fillna(dependents_mode.iloc[0])

print("\nMissing Values After Handling:\n", df.isnull().sum())

# Task 4: Remove duplicate records
# De-duplication runs while customerID is still present, so two rows only count
# as duplicates when the identifier matches as well.
print("\n--- 4. Removing Duplicates ---")
print(f"Rows before duplicate removal: {len(df)}")
df = df.drop_duplicates()
print(f"Rows after duplicate removal: {len(df)}")

# customerID is an identifier, not a predictive feature, so it is removed here.
df = df.drop(columns='customerID')

# Task 5: Standardize inconsistent data
print("\n--- 5. Standardizing Inconsistent Data ---")

# 'Churn' and 'Dependents' mix 'Yes'/'No' with lowercase spellings;
# title-casing collapses each pair into a single label.
for text_col in ('Churn', 'Dependents'):
    df[text_col] = df[text_col].str.title()
print("Cleaned 'Churn' values:", df['Churn'].unique())
print("Cleaned 'Dependents' values:", df['Dependents'].unique())

# 'PaymentMethod' uses 'E-check' as an alternative spelling of 'Electronic check'.
payment_aliases = {'E-check': 'Electronic check'}
df['PaymentMethod'] = df['PaymentMethod'].replace(payment_aliases)
print("Cleaned 'PaymentMethod' values:", df['PaymentMethod'].unique())

# Task 8: Perform feature engineering (Done *before* encoding)
print("\n--- 8. Feature Engineering ---")
# Create 'Total_Services': how many services each customer subscribes to.
service_cols = ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
                'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# Every service column except InternetService is Yes/No-style, where
# 'No internet service' and 'No phone service' count as 0.
yes_no_service_cols = [col for col in service_cols if col != 'InternetService']
yes_count = df[yes_no_service_cols].eq('Yes').sum(axis=1)

# InternetService holds the connection type ('DSL', 'Fiber optic') or 'No', never
# 'Yes', so comparing it to 'Yes' would silently drop it from the count.
# Any value other than 'No' (or missing) is counted as one subscribed service.
has_internet = df['InternetService'].notna() & df['InternetService'].ne('No')

df['Total_Services'] = yes_count + has_internet.astype(int)

# Create 'Tenure_per_Charge'
# Add 1 to monthly charges to avoid division by zero
df['Tenure_per_Charge'] = df['tenure'] / (df['MonthlyCharges'] + 1)
print(df[['Total_Services', 'Tenure_per_Charge']].head())

# Task 6: Convert columns to correct data types (and Encoding)
print("\n--- 6. Converting Data Types and Encoding ---")

# Two-valued text columns become 0/1 through one shared lookup table.
# Series.map yields NaN for any label that is not a key of the table.
binary_map = {'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0}
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService',
               'PaperlessBilling', 'Churn']
for binary_col in binary_cols:
    df[binary_col] = df[binary_col].map(binary_map)

# SeniorCitizen is already 0/1 in the data; only its dtype is pinned to int.
df['SeniorCitizen'] = df['SeniorCitizen'].astype(int)

# Columns with more than two labels are one-hot encoded. drop_first=True drops
# the first level of each column, so k labels produce k-1 indicator columns.
categorical_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                    'Contract', 'PaymentMethod']
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

print("\nData types after conversion and encoding:")
df.info()

# Task 7: Identify and handle outliers
print("\n--- 7. Handling Outliers (Capping) ---")
# Each numeric column is clamped to [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; values outside
# that interval are replaced by the nearest bound, and no rows are removed.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Total_Services', 'Tenure_per_Charge']

for col in numerical_cols:
    # Both quartiles are fetched with one quantile call.
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Clamp the column to the computed interval
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

print("Outliers have been capped to the 1.5 * IQR range.")
print("\nData description after outlier capping:\n", df[numerical_cols].describe())

# Separate features (X) and target (y)
y = df['Churn']
X = df.drop(columns='Churn')

# Task 10: Split the dataset into training and testing sets
# (Done *before* scaling: fitting the scaler on all rows would let the test rows
# influence the mean/std used to transform the training data, i.e. data leakage.)
print("\n--- 10. Splitting Data (80/20 split) ---")
# We use stratify=y to ensure the proportion of churn is the same in train and test sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling changes values, not shapes, so the unscaled partitions can be reported here.
print(f"X_train shape: {X_train_raw.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test_raw.shape}, y_test shape: {y_test.shape}")
print(f"Churn proportion in y_train:\n{y_train.value_counts(normalize=True)}")

# Task 9: Normalize or scale the data
print("\n--- 9. Scaling Data (StandardScaler) ---")
scaler = StandardScaler()
# Learn mean/std from the training rows only ...
scaler.fit(X_train_raw)

# ... then apply that same transformation to every row. The result is kept as a
# DataFrame (same columns and index as X) for easier handling.
X_scaled_df = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)
# Column means are only approximately 0 here because the statistics come from
# the training subset, not from all rows.
print(X_scaled_df.describe())

# Scaled train/test partitions, selected by the row labels chosen in the split above.
X_train = X_scaled_df.loc[X_train_raw.index]
X_test = X_scaled_df.loc[X_test_raw.index]

# Task 11: Export the cleaned dataset
print("\n--- 11. Exporting Cleaned Dataset ---")
# The exported frame is the full (pre-split) scaled feature table with the target
# appended as the last column. y.values is used so the target is attached by
# position rather than by index alignment.
cleaned_df = X_scaled_df.assign(Churn=y.values)

# Write the CSV without the index column
cleaned_df.to_csv('Telecom_Churn_Cleaned.csv', index=False)
print("Successfully exported 'Telecom_Churn_Cleaned.csv'")

--- 2. Initial Data Exploration ---
Shape: (24, 21)

Head:
    customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No     1.0           No   
1  5575-GNVDE    Male              0      No         No    34.0          Yes   
2  3668-QPYBK    Male              0      No         No     2.0          Yes   
3  7795-CFOCW    Male              0      No         No    45.0           No   
4  9237-HQITU  Female              0      No         No     2.0          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  