## Importing essential libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Lift pandas' row-display cap so every row of a frame/series is rendered.
pd.options.display.max_rows = None


## Loading dataset

In [3]:
# Read the Telco churn CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Telco_Customer_Churn_Dataset.csv")

In [4]:

# Trim leading/trailing whitespace from every column label.
df.columns = df.columns.map(str.strip)

In [5]:
# Normalise every column label to lowercase.
df.columns = df.columns.map(str.lower)

In [6]:
# Display one randomly chosen row.
df.sample(n=1)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
6578,3898-BSJYF,Female,0,No,Yes,10,Yes,Yes,DSL,Yes,...,Yes,Yes,No,Yes,One year,No,Credit card (automatic),73.55,693.3,No


In [7]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerid        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   seniorcitizen     7043 non-null   int64  
 3   partner           7043 non-null   object 
 4   dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   phoneservice      7043 non-null   object 
 7   multiplelines     7043 non-null   object 
 8   internetservice   7043 non-null   object 
 9   onlinesecurity    7043 non-null   object 
 10  onlinebackup      7043 non-null   object 
 11  deviceprotection  7043 non-null   object 
 12  techsupport       7043 non-null   object 
 13  streamingtv       7043 non-null   object 
 14  streamingmovies   7043 non-null   object 
 15  contract          7043 non-null   object 
 16  paperlessbilling  7043 non-null   object 


- The `totalcharges` column should be numeric, but it is stored as type object (i.e., string). This happens because some values are blank strings (such as "" or " ") rather than numbers, especially when tenure = 0.

In [8]:
# Parse totalcharges as numbers; with errors='coerce' anything that cannot be
# parsed (e.g. blank strings) becomes NaN instead of raising.
df['totalcharges'] = df['totalcharges'].pipe(pd.to_numeric, errors='coerce')

In [9]:
# Count missing values per column.
df.isna().sum()

customerid           0
gender               0
seniorcitizen        0
partner              0
dependents           0
tenure               0
phoneservice         0
multiplelines        0
internetservice      0
onlinesecurity       0
onlinebackup         0
deviceprotection     0
techsupport          0
streamingtv          0
streamingmovies      0
contract             0
paperlessbilling     0
paymentmethod        0
monthlycharges       0
totalcharges        11
churn                0
dtype: int64

In [10]:
# Remove, in place, every row whose totalcharges is NaN.
# Presumably these are new customers with tenure = 0 - verify before relying on it.
df.dropna(axis=0, how="any", subset=['totalcharges'], inplace=True)

In [11]:
# Relabel the numeric SeniorCitizen codes: 0 -> 'No', 1 -> 'Yes'.
# .replace() (unlike .map()) leaves values that are not keys untouched, so
# re-running this cell on an already-relabelled column is a no-op instead of
# turning every entry into NaN.
df['seniorcitizen'] = df['seniorcitizen'].replace({0: "No", 1: 'Yes'})

In [12]:
# Remove the customerid column in place (the notebook treats it as an
# identifier with no modeling value).
del df['customerid']

In [13]:
# Re-count missing values per column after the clean-up steps.
df.isna().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

## Split the Data

In [14]:
# Target: an independent copy of the churn column.
y = df["churn"].copy()
# Features: every column except the target.
X = df.drop(columns=["churn"])

In [15]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on churn - consider stratify=y if
# the class balance of the two subsets matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Report the (rows, columns) of each feature frame next to its target's shape.
print(f"{X_train.shape} {y_train.shape}")
print(f"{X_test.shape} {y_test.shape}")

(5625, 19) (5625,)
(1407, 19) (1407,)


## Export the Subsets

In [17]:
def export_data(X, y, name, out_dir="data"):
	"""Write a feature/target split to ``<out_dir>/<name>.csv`` and preview it.

	Parameters
	----------
	X : pandas.DataFrame
		Feature columns of the split.
	y : pandas.Series
		Target column. It must be named and share ``X``'s index, because the
		two are combined with an index-aligned ``join``.
	name : str
		File stem; ``.csv`` is appended.
	out_dir : str, default "data"
		Destination directory. It is created if it does not exist yet.

	Returns
	-------
	pandas.DataFrame
		The first five rows of the file as read back from disk.
	"""
	# A fresh checkout may not have the output folder yet; to_csv does not
	# create missing directories and would raise.
	if out_dir:
		os.makedirs(out_dir, exist_ok=True)
	file_path = os.path.join(out_dir, f"{name}.csv")

	# index=False: the row labels are not written to the file.
	X.join(y).to_csv(file_path, index=False)

	# Read the file back so the preview reflects what was actually saved.
	return pd.read_csv(file_path).head()

In [18]:
# Save the training split as train.csv. export_data returns only the first
# rows of the saved file, so `train` is a preview, not the full split.
train = export_data(X=X_train, y=y_train, name="train")
train

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,Female,No,No,No,43,No,No phone service,DSL,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),49.05,2076.2,Yes
1,Male,No,No,No,3,Yes,No,DSL,No,No,No,No,Yes,No,Month-to-month,No,Credit card (automatic),53.4,188.7,Yes
2,Female,No,Yes,No,55,Yes,Yes,Fiber optic,No,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,77.75,4458.15,Yes
3,Male,No,Yes,Yes,45,Yes,No,DSL,Yes,No,No,Yes,No,No,Two year,No,Bank transfer (automatic),54.65,2553.7,No
4,Female,No,Yes,Yes,55,Yes,Yes,Fiber optic,Yes,No,No,No,Yes,Yes,One year,No,Mailed check,100.9,5448.6,No


In [19]:
# Save the test split as test.csv and display the first rows read back.
export_data(X=X_test, y=y_test, name="test")

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,Male,Yes,Yes,No,61,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),25.0,1501.75,No
1,Female,No,No,No,19,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Bank transfer (automatic),24.7,465.85,No
2,Male,No,Yes,No,13,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Credit card (automatic),102.25,1359.0,Yes
3,Male,No,Yes,No,37,Yes,Yes,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Mailed check,55.05,2030.75,No
4,Female,No,No,No,6,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Bank transfer (automatic),29.45,161.45,No


📦 Data Cleaning Steps:

1. Loaded the Telco Customer Churn dataset into a DataFrame.
2. Removed extra spaces from column names to make them clean.
3. Converted all column names to lowercase to keep them consistent.
4. Changed the "totalcharges" column to numeric because it had some non-numeric entries. 
   - If there were invalid entries, they were turned into NaN (missing values).
5. Mapped the SeniorCitizen codes 0 and 1 to 'No' and 'Yes'.
6. Checked for missing values.
7. Dropped rows where "totalcharges" was missing because those rows mostly had tenure = 0 (new customers who didn't generate charges yet).
8. Dropped the "customerid" column because it is just an ID and doesn't help in predicting churn.
9.  Separated the data into features (X) and target (y) where "churn" is the target variable.
10. Split the data into training and testing sets using an 80-20 split.
11. Saved the training and testing sets as "train.csv" and "test.csv" inside the "data" folder for further use.
