In [14]:
# Import necessary libraries
import pandas as pd

# Load the dataset
# Assuming the file "WA_Fn-UseC_-Telco-Customer-Churn.csv" is in the project directory
data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Display basic information about the dataset
print("Dataset Information:")
print(data.info())

# Display the first few rows of the dataset
print("\nFirst few rows of the dataset:")
print(data.head())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  704

In [15]:
# Drop the 'customerID' column as it is not useful for modeling
data = data.drop(columns=['customerID'])

# Display the updated dataset
print("Dataset after dropping 'customerID':")
print(data.head())

Dataset after dropping 'customerID':
   gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  Female              0     Yes         No       1           No   
1    Male              0      No         No      34          Yes   
2    Male              0      No         No       2          Yes   
3    Male              0      No         No      45           No   
4  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-mont

In [16]:
# Convert 'TotalCharges' from object to numeric
# Some values might be non-numeric (e.g., empty strings), so we handle them
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Check for missing values after conversion
print("\nMissing values in 'TotalCharges' after conversion:")
print(data['TotalCharges'].isnull().sum())

# Fill missing values with the median of 'TotalCharges'
data['TotalCharges'] = data['TotalCharges'].fillna(data['TotalCharges'].median())

# Verify the changes
print("\nData types after converting 'TotalCharges':")
print(data.dtypes)


Missing values in 'TotalCharges' after conversion:
11

Data types after converting 'TotalCharges':
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object


In [17]:
# Separate categorical and numerical columns
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("\nCategorical Columns:")
print(categorical_columns)

print("\nNumerical Columns:")
print(numerical_columns)

# Apply One-Hot Encoding to categorical columns
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Verify the changes
print("\nDataset after One-Hot Encoding:")
print(data.head())


Categorical Columns:
['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']

Numerical Columns:
['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

Dataset after One-Hot Encoding:
   SeniorCitizen  tenure  MonthlyCharges  TotalCharges  gender_Male  \
0              0       1           29.85         29.85        False   
1              0      34           56.95       1889.50         True   
2              0       2           53.85        108.15         True   
3              0      45           42.30       1840.75         True   
4              0       2           70.70        151.65        False   

   Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0         True           False             False   
1        False           False              True   
2        False           False 

In [18]:
# Convert 'Churn_Yes' column to numeric (1 for Yes, 0 for No)
data['Churn'] = data['Churn_Yes']

# Drop the original 'Churn_Yes' column as it is no longer needed
data = data.drop(columns=['Churn_Yes'])

# Verify the changes
print("\nDataset after converting 'Churn' to numeric:")
print(data['Churn'].value_counts())


Dataset after converting 'Churn' to numeric:
Churn
False    5174
True     1869
Name: count, dtype: int64


In [22]:
# Import the train_test_split function from sklearn
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = data.drop(columns=['Churn'])  # Features (all columns except 'Churn')
y = data['Churn']  # Target (the 'Churn' column)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Verify the split
print("\nNumber of samples in each set:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")


Number of samples in each set:
Training set: 5634 samples
Testing set: 1409 samples


In [23]:
# Import the KNeighborsClassifier from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN model with k=5 neighbors
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model on the training data
knn.fit(X_train, y_train)

In [24]:
# Predict on the test data
y_pred = knn.predict(X_test)

# Import evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the results
print("\nModel Evaluation Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")


Model Evaluation Metrics:
Accuracy: 0.76
Precision: 0.55
Recall: 0.44
F1-Score: 0.49


In [25]:
print(data['Churn'].value_counts())

Churn
False    5174
True     1869
Name: count, dtype: int64
