In [1]:
# Logistic Regression
import pandas as pd
import numpy as np


df = pd.read_csv('train.csv')
df.dtypes

df = df.drop('CustomerID', axis=1)
df

Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled,Churn
0,20,11.055215,221.104302,Premium,Mailed check,No,Both,No,Mobile,36.758104,63.531377,10,Sci-Fi,2.176498,4,Male,3,No,No,0
1,57,5.175208,294.986882,Basic,Credit card,Yes,Movies,No,Tablet,32.450568,25.725595,18,Action,3.478632,8,Male,23,No,Yes,0
2,73,12.106657,883.785952,Basic,Mailed check,Yes,Movies,No,Computer,7.395160,57.364061,23,Fantasy,4.238824,6,Male,1,Yes,Yes,0
3,32,7.263743,232.439774,Basic,Electronic check,No,TV Shows,No,Tablet,27.960389,131.537507,30,Drama,4.276013,2,Male,24,Yes,Yes,0
4,57,16.953078,966.325422,Premium,Electronic check,Yes,TV Shows,No,TV,20.083397,45.356653,20,Comedy,3.616170,4,Female,0,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243782,77,9.639902,742.272460,Basic,Mailed check,No,Movies,No,Computer,13.502729,80.367312,47,Sci-Fi,3.697451,1,Male,8,Yes,No,0
243783,117,13.049257,1526.763053,Premium,Credit card,No,TV Shows,Yes,TV,24.963291,59.818441,35,Comedy,1.449742,4,Male,20,No,No,0
243784,113,14.514569,1640.146267,Premium,Credit card,Yes,TV Shows,No,TV,10.628728,176.186095,44,Action,4.012217,6,Male,13,Yes,Yes,0
243785,7,18.140555,126.983887,Premium,Bank transfer,Yes,TV Shows,No,TV,30.466782,153.386315,36,Fantasy,2.135789,7,Female,5,No,Yes,0


In [6]:





from sklearn.linear_model import LogisticRegression



# --- adjust these ---
TARGET = "Churn"        # <-- set this to your target column name
train_df = df      # <-- ensure this exists
# ---------------------

# Separate features/target from the *train* set only
y_train = train_df[TARGET]
X_train = train_df.drop(columns=[TARGET])

# Sanity-check target balance in TRAIN
print("\n=== Target balance (TRAIN) ===")
print((y_train.value_counts(normalize=True) * 100).round(2).astype(str) + "%")

# Identify categorical columns in TRAIN features
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
print("\nCategorical columns:", cat_cols)

# Class balance per categorical column (overall)
for col in cat_cols:
    vc = X_train[col].value_counts(dropna=False)
    pct = (vc / len(X_train) * 100).round(2)
    out = pd.DataFrame({"Count": vc, "Percent": pct})
    print(f"\n=== Class balance for: {col} (TRAIN) ===")
    print(out)

# Optional: class balance *within target classes* (useful for leakage/imbalance checks)
for col in cat_cols:
    ct = pd.crosstab(X_train[col], y_train, dropna=False, normalize="columns") * 100
    print(f"\n=== {col} by {TARGET} (%) — column-normalized ===")
    print(ct.round(2))


logit = LogisticRegression(
    max_iter=500,
    class_weight="balanced",   # remove if classes are already balanced
    solver="liblinear"         # supports L1/L2; good small/medium data
)


=== Target balance (TRAIN) ===
Churn
0    81.88%
1    18.12%
Name: proportion, dtype: object

Categorical columns: ['SubscriptionType', 'PaymentMethod', 'PaperlessBilling', 'ContentType', 'MultiDeviceAccess', 'DeviceRegistered', 'GenrePreference', 'Gender', 'ParentalControl', 'SubtitlesEnabled']

=== Class balance for: SubscriptionType (TRAIN) ===
                  Count  Percent
SubscriptionType                
Standard          81920    33.60
Basic             81050    33.25
Premium           80817    33.15

=== Class balance for: PaymentMethod (TRAIN) ===
                  Count  Percent
PaymentMethod                   
Electronic check  61313    25.15
Credit card       60924    24.99
Bank transfer     60797    24.94
Mailed check      60753    24.92

=== Class balance for: PaperlessBilling (TRAIN) ===
                   Count  Percent
PaperlessBilling                 
No                121980    50.04
Yes               121807    49.96

=== Class balance for: ContentType (TRAIN) ===

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

num_cols = X_train.select_dtypes(include=['number']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()

numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(
        drop="first",                 # avoid perfect multicollinearity
        handle_unknown="ignore",
        min_frequency=10              # collapse rare levels; adjust as needed
    ))
])

preprocess = ColumnTransformer([
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols)
])


In [7]:
clf = Pipeline([
    ("prep", preprocess),
    ("model", logit)
])

clf.fit(X_train, y_train)


0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,10
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'liblinear'
,max_iter,500


In [None]:
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    RocCurveDisplay, PrecisionRecallDisplay
)
import matplotlib.pyplot as plt


test_df = pd.read_csv("test.csv")

# X_test is simply all columns in the test file
X_test = test_df.copy()

print(X_test.head())
print("Shape:", X_test.shape)

# Probabilities + labels
y_proba = clf.predict_proba(x_test)[:, 1]
y_pred = clf.predict(x_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

RocCurveDisplay.from_predictions(y_test, y_proba)
plt.show()

PrecisionRecallDisplay.from_predictions(y_test, y_proba)
plt.show()


NameError: name 'x_test' is not defined