In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Global plot styling: seaborn's white grid theme first, then matplotlib's
# bundled style sheet of the same family (applied last, so its rcParams win).
# NOTE(review): the "seaborn-v0_8-*" style names need matplotlib >= 3.6 — confirm the environment.
sns.set_style(style="whitegrid")
plt.style.use("seaborn-v0_8-whitegrid")

In [None]:
# Location of the raw CSV. The default is the original hard-coded path; set the
# TELCO_CHURN_CSV environment variable to load the file from somewhere else
# without editing the notebook.
DATA_PATH = os.environ.get(
    'TELCO_CHURN_CSV',
    'C:/Customer_Churn_Prediction/Data/telco_churn_to_predict.csv',
)

try:
    df = pd.read_csv(DATA_PATH, encoding='utf-8')
    print("Data loaded successfully.")
except FileNotFoundError:
    # Report the exact path that was tried so the user can see what to fix.
    print(f"Error: The file '{os.path.basename(DATA_PATH)}' was not found.")
    print(f"Looked for it at: {DATA_PATH}")
    print("Please fix the path or set the TELCO_CHURN_CSV environment variable to the CSV location.")
    # Keep df defined so cells that check `df is not None` can skip cleanly.
    df = None

In [None]:
# Quick look at the raw data; skipped entirely when df is None.
if df is not None:
    preview = df.head()
    print("\nFirst 5 Rows of the Data Set:", preview, sep="\n")

    # df.info() writes its summary to stdout itself, so it is not wrapped in print().
    print("\nData Information:")
    df.info()

In [None]:
# Convert 'total_charges' to a numeric dtype and drop the rows that cannot be converted.
if df is None:
    # The load cell sets df to None when the CSV is missing; fail here with an
    # actionable message instead of an opaque "'NoneType' is not subscriptable".
    raise RuntimeError(
        "Dataset is not loaded (df is None). Fix the CSV path in the load cell and re-run it first."
    )

rows_before = len(df)

# errors='coerce' turns values that cannot be parsed as numbers into NaN ...
df['total_charges'] = pd.to_numeric(df['total_charges'], errors='coerce')
# ... which are then removed; the index is rebuilt so it is contiguous again.
df = df.dropna(subset=['total_charges']).reset_index(drop=True)

print("\nDataFrame information after converting 'total_charges':")
df.info()
print(f"\nNumber of rows after dropping missing values: {len(df)}")
# Make the effect of the silent coercion visible.
print(f"Rows dropped because 'total_charges' was missing or non-numeric: {rows_before - len(df)}")

In [None]:
# One-hot encode every object-dtype column except the identifier and the target.
non_feature_cols = {'customer_id', 'churn_label'}
categorical_cols = list(df.select_dtypes(include=['object']).columns)
cols_to_encode = [name for name in categorical_cols if name not in non_feature_cols]

# drop_first=True omits the first level of each encoded column;
# dtype=int yields 0/1 integers rather than booleans.
df_encoded = pd.get_dummies(
    df,
    columns=cols_to_encode,
    drop_first=True,
    dtype=int,
)

print(f"\nDataFrame shape after one-hot encoding: {df_encoded.shape}")
print("\nFirst 5 rows of the dataset after one-hot encoding:", df_encoded.head(), sep="\n")

In [None]:
# Class balance of the target: absolute counts and percentages.
churn_counts = df_encoded['churn_label'].value_counts()
churn_percentages = df_encoded['churn_label'].value_counts(normalize=True) * 100
print("\nDistribution of Churn Labels:")
print("0 = No Churn")
print("1 = Churn")
print(churn_counts)
print("\nPercentage of Churn Labels:")
print(churn_percentages.round(2))

# value_counts() orders rows by frequency, not by label, so the hard-coded tick
# labels and colours below would be swapped whenever churners are the majority.
# Sorting by label pins the bar order.
# NOTE(review): assumes the no-churn label sorts first (0/1 as in the legend
# printed above, or 'No'/'Yes') — confirm against the data.
counts_by_label = churn_counts.sort_index()

fig, ax = plt.subplots(figsize=(6, 4))
counts_by_label.plot(kind='bar', color=['skyblue', 'salmon'], ax=ax)
ax.set_title('Distribution of Churn Labels')
ax.set_xlabel('Churn Label (0 = No, 1 = Yes)')
ax.set_ylabel('Number of Customers')
ax.set_xticks([0, 1])
ax.set_xticklabels(['No Churn', 'Churn'], rotation=0)
plt.show()

In [None]:
# Features: everything except the identifier and the target.
# NOTE(review): confirm no other target-derived columns remain in X (cannot be checked from this cell).
X = df_encoded.drop(columns=['customer_id', 'churn_label'])

# Target.
y = df_encoded['churn_label']

# 80/20 split with a fixed seed. stratify=y keeps the churn / no-churn proportions
# the same in both partitions, so evaluation on the test set is not skewed by an
# unlucky draw of the minority class.
# NOTE(review): stratification requires at least two rows per class and no missing labels — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Verify the split
print("\n--- Data Split Summary ---")
print("Original DataFrame shape:", df_encoded.shape)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)