In [1]:
import pandas as pd

# 1. Load the Training Data (The 80% file)
train_df = pd.read_csv('churn-bigml-80.csv')

# 2. Load the Testing Data (The 20% file)
test_df = pd.read_csv('churn-bigml-20.csv')

# 3. Check the shapes (Rows, Columns)
print(f"Training Data Shape: {train_df.shape}")
print(f"Testing Data Shape:  {test_df.shape}")

# 4. Peek at the data to see the column names
print("\n--- First 5 Rows of Training Data ---")
display(train_df.head())

# 5. Verify the Target Column
# We need to know which column tells us if they left (usually 'Churn' or 'Exited')
print("\n--- Columns List ---")
print(train_df.columns.tolist())

Training Data Shape: (2666, 20)
Testing Data Shape:  (667, 20)

--- First 5 Rows of Training Data ---


Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False



--- Columns List ---
['State', 'Account length', 'Area code', 'International plan', 'Voice mail plan', 'Number vmail messages', 'Total day minutes', 'Total day calls', 'Total day charge', 'Total eve minutes', 'Total eve calls', 'Total eve charge', 'Total night minutes', 'Total night calls', 'Total night charge', 'Total intl minutes', 'Total intl calls', 'Total intl charge', 'Customer service calls', 'Churn']


In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# 1. Load the Datasets
train_df = pd.read_csv('churn-bigml-80.csv')
test_df = pd.read_csv('churn-bigml-20.csv')

# 2. Preprocessing: Convert Text/Booleans to Numbers
# We loop through columns to find any text (object) or boolean (bool) columns and encode them
# (This handles 'State', 'International plan', 'Voice mail plan', and the target 'Churn')
label_encoder = LabelEncoder()

for col in train_df.columns:
    if train_df[col].dtype == 'object' or train_df[col].dtype == 'bool':
        # Fit on train, transform both train and test to keep them consistent
        train_df[col] = label_encoder.fit_transform(train_df[col])
        test_df[col] = label_encoder.transform(test_df[col])

# 3. Separate Features (X) and Target (y)
X_train = train_df.drop('Churn', axis=1)
y_train = train_df['Churn']

X_test = test_df.drop('Churn', axis=1)
y_test = test_df['Churn']

# 4. Train Logistic Regression
# (max_iter=1000 ensures it runs long enough to find the answer)
log_model = LogisticRegression(max_iter=2000) 
log_model.fit(X_train, y_train)

# 5. Evaluate
predictions = log_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\n--- Confusion Matrix (Actual vs Predicted) ---")
print(confusion_matrix(y_test, predictions))
print("\n--- Classification Report ---")
print(classification_report(y_test, predictions))

Model Accuracy: 85.91%

--- Confusion Matrix (Actual vs Predicted) ---
[[557  15]
 [ 79  16]]

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.88      0.97      0.92       572
           1       0.52      0.17      0.25        95

    accuracy                           0.86       667
   macro avg       0.70      0.57      0.59       667
weighted avg       0.82      0.86      0.83       667



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
