In [9]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Import the "telecom_customer_churn.csv" dataset
# Purpose: To load the dataset into a pandas DataFrame for analysis
df = pd.read_csv("telecom_customer_churn.csv")

# 2. Explore the dataset to understand its structure and content
# Purpose: To get a sense of the columns, data types, missing values, and basic statistics
print("Columns : ",df.columns)
# info() writes its summary (column names, dtypes, non-null counts) straight to stdout
# and returns None, so wrapping it in print() would append a stray "None" line.
df.info()
print(df.describe())  # Summary statistics for the numerical columns

# 3. Handle missing values in the dataset, deciding on an appropriate strategy
# Purpose: To fill missing values in a way that does not bias the analysis
# 'Total Charges' is coerced to numeric BEFORE imputation: errors='coerce' turns any
# non-numeric entry into NaN, so coercing afterwards would reintroduce gaps that the
# imputer never sees (and a mean imputer cannot be fitted on string data at all).
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')

# Numerical columns: replace missing values with each column's own mean.
mean_imputed_cols = ['Avg Monthly Long Distance Charges', 'Total Charges']
df[mean_imputed_cols] = SimpleImputer(strategy="mean").fit_transform(df[mean_imputed_cols])

# Categorical column: replace missing values with the most frequent value (mode).
# fit_transform returns an (n, 1) array; ravel() makes it 1-D for a single-column assignment.
df['Offer'] = SimpleImputer(strategy="most_frequent").fit_transform(df[['Offer']]).ravel()

# 4. Remove any duplicate records from the dataset
# Purpose: To remove exact duplicate rows that do not add any value
# Reassign instead of mutating in place, and rebuild a contiguous index so that
# positional and label-based row selection agree in the later steps.
df = df.drop_duplicates().reset_index(drop=True)

# 5. Check for inconsistent data, such as inconsistent formatting or spelling variations, and standardize it
# Purpose: To standardize categorical variables (e.g., 'Yes', 'No' to 'yes', 'no')
# Surrounding whitespace is stripped as well as case-folded, so that a value such as
# ' Yes' is not silently turned into NaN by the yes/no mapping below.
df['Gender'] = df['Gender'].str.strip().str.lower()
df['Married'] = df['Married'].str.strip().str.lower().map({'yes': 1, 'no': 0})  # yes/no -> 1/0; anything else -> NaN
df['Phone Service'] = df['Phone Service'].str.strip().str.lower()

# 6. Convert columns to the correct data types as needed
# Purpose: To ensure that columns have the correct data types for analysis
# ('Total Charges' was already converted to numeric at the start of step 3.)
df = df.astype({'Tenure in Months': int, 'Monthly Charge': float})

# 7. Identify and handle outliers in the data
# Purpose: To detect and handle outliers that can skew the analysis
# A row is treated as an outlier when any numerical column has |z-score| >= Z_THRESHOLD.
Z_THRESHOLD = 3
numeric_values = df.select_dtypes(include=['float64', 'int64']).to_numpy(dtype=float)
# nan_policy='omit' computes each column's mean/std from its non-missing values only.
# With the default ('propagate'), a single NaN makes every z-score in that column NaN,
# and because `NaN < threshold` is False that would discard every row of the frame.
z_scores = np.abs(stats.zscore(numeric_values, nan_policy='omit'))
# A missing value is not evidence of an outlier, so NaN z-scores do not disqualify a row.
is_outlier_free = ((z_scores < Z_THRESHOLD) | np.isnan(z_scores)).all(axis=1)

# 8. Perform feature engineering, creating new features that may be relevant to predicting customer churn
# Purpose: To create new features that may help predict churn, e.g., 'Average Monthly Spend'
# A tenure of 0 is replaced by NaN first so the ratio becomes NaN rather than +/-inf.
df['Average Monthly Spend'] = df['Total Charges'] / df['Tenure in Months'].replace(0, np.nan)

# Apply the outlier mask only now, so the filtered frame also carries the new feature
# while the z-scores above are still based on the original numerical columns.
df_no_outliers = df[is_outlier_free]

# 9. Normalize or scale the data if necessary
# Purpose: To normalize numerical data so that features have similar scales, improving the performance of algorithms
# NOTE(review): the scaler is fitted on all remaining rows, before the train/test split
# performed later in this cell; if that split is used to evaluate a model, consider
# fitting the scaler on the training rows only.
scaled_cols = ['Avg Monthly Long Distance Charges', 'Avg Monthly GB Download', 'Monthly Charge']
scaler = StandardScaler()
df_scaled = df_no_outliers.copy()  # Scale a copy so `df` and `df_no_outliers` keep their raw values
df_scaled[scaled_cols] = scaler.fit_transform(df_scaled[scaled_cols])

# Final cleaned and prepared dataset
# print(df_scaled.head())  # Show the first few rows of the cleaned and scaled dataset

# 10. Split the dataset into training and testing sets for further analysis
# Purpose: To split the data into two sets, one for training a model and another for testing its performance

# Define the features (X) and target variable (y)
# 'Churn Reason' is excluded together with the two target columns: it presumably
# describes the churn outcome itself and would leak the target into the features.
# NOTE(review): confirm against the data dictionary.
non_feature_cols = ['Churn Category', 'Customer Status', 'Churn Reason']
X = df_scaled.drop(columns=non_feature_cols)  # Features
y = df_scaled['Churn Category']  # Target variable (Churn Category)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

# Check the size of the training and testing sets
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# 11. Export the cleaned dataset for future analysis or modeling
# Purpose: To save the cleaned data into a CSV file for future use
df_scaled.to_csv("cleaned_telecom_customer_churn.csv", index=False)

# Confirm that the dataset has been exported
print("Cleaned dataset has been saved as 'cleaned_telecom_customer_churn.csv'.")



Columns :  Index(['Customer ID', 'Gender', 'Age', 'Married', 'Number of Dependents',
       'City', 'Zip Code', 'Latitude', 'Longitude', 'Number of Referrals',
       'Tenure in Months', 'Offer', 'Phone Service',
       'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Total Revenue', 'Customer Status', 'Churn Category', 'Churn Reason'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                

In [6]:
# Show the column labels currently present on `df`.
df.columns

Index(['Customer ID', 'Gender', 'Age', 'Married', 'Number of Dependents',
       'City', 'Zip Code', 'Latitude', 'Longitude', 'Number of Referrals',
       'Tenure in Months', 'Offer', 'Phone Service',
       'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Total Revenue', 'Customer Status', 'Churn Category', 'Churn Reason',
       'Average Monthly Spend'],
      dtype='object')

In [14]:
# Show the 'Phone Service' column to check the values after lower-casing.
df['Phone Service']

0       yes
1       yes
2       yes
3       yes
4       yes
       ... 
7038    yes
7039    yes
7040    yes
7041    yes
7042     no
Name: Phone Service, Length: 7043, dtype: object