In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix



In [2]:
# Fetch the customer-churn CSV from its GitHub raw URL into a DataFrame.
url = 'https://raw.githubusercontent.com/dsrscientist/DSData/master/Telecom_customer_churn.csv'
df = pd.read_csv(filepath_or_buffer=url)


In [3]:
# Drop unnecessary columns
# Reassign rather than mutate with inplace=True, and tolerate columns that are
# already gone, so re-running this cell on the same kernel does not raise
# KeyError.
df = df.drop(columns=['customerID', 'gender'], errors='ignore')



In [4]:
# Handle missing values
# Single-space strings are treated as missing-value placeholders; turn them
# into real NaNs so pandas can detect and impute them.
df = df.replace(' ', np.nan)

# A numeric column that contained blank placeholders is read as string/object
# dtype. Left alone, it would stay string-typed after imputation and be filled
# with the most frequent *string*. Restore numeric dtype for every non-numeric
# column whose observed values all parse as numbers, and impute those columns
# with the median, which is a more sensible centre for numeric data.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    observed = df[col].dropna()
    if observed.empty:
        continue
    if pd.to_numeric(observed, errors='coerce').notna().all():
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col] = df[col].fillna(df[col].median())

# Any remaining gaps are filled with the per-column most frequent value, as
# before.
df = df.fillna(df.mode().iloc[0])



In [5]:
# One-hot encode the categorical features listed below. drop_first=True omits
# the first level of each feature, so a k-level feature yields k-1 indicator
# columns.
cat_cols = [
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]
df = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)


In [6]:
# Convert the Churn labels to integer codes. The fitted encoder stays in `le`
# so the codes can be mapped back to the original labels later.
le = LabelEncoder()
le.fit(df['Churn'])
df['Churn'] = le.transform(df['Churn'])


In [7]:
# Pull out the label column as y; every other column is a model input (X).
y = df['Churn']
X = df.drop(columns=['Churn'])


In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [9]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
# Fit a logistic regression classifier with default settings on the scaled
# training data. The fit call is left as the cell's final expression so the
# notebook still displays the estimator.
lr_model = LogisticRegression()
lr_model.fit(X=X_train_scaled, y=y_train)


LogisticRegression()

In [11]:
# Predict a churn label for every row of the scaled test split.
y_pred = lr_model.predict(X=X_test_scaled)


In [12]:
# Score the test-set predictions with four classification metrics, computed
# in the order accuracy, precision, recall, F1.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)


In [13]:
# Print a header followed by one "label: value" line per metric.
print("Model Performance Metrics:")
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)


Model Performance Metrics:
Accuracy: 0.8197303051809794
Precision: 0.683076923076923
Recall: 0.5951742627345844
F1 Score: 0.6361031518624641


In [14]:
# Recompute the test-set predictions so this cell does not depend on an
# earlier cell's y_pred.
y_pred = lr_model.predict(X_test_scaled)

# Put the true labels and the predictions side by side. The frame takes its
# row index from y_test, and the predictions are attached positionally.
churn_predictions = pd.DataFrame({'Actual Churn': y_test}).assign(
    **{'Predicted Churn': y_pred}
)

# Show the comparison table
print(churn_predictions)


      Actual Churn  Predicted Churn
185              1                1
2715             0                0
3825             0                0
1807             1                1
132              0                0
...            ...              ...
6366             0                0
315              0                0
2439             0                0
5002             0                0
1161             1                0

[1409 rows x 2 columns]
