In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
import re
import difflib
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error


In [2]:
df = pd.read_csv('train.csv')
df
df['PaymentMethod'].isnull().groupby(df['SubscriptionType']).mean()


SubscriptionType
Basic       0.0
Premium     0.0
Standard    0.0
Name: PaymentMethod, dtype: float64

In [3]:
null_counts = df.isnull().sum()
print(null_counts)




AccountAge                  0
MonthlyCharges              0
TotalCharges                0
SubscriptionType            0
PaymentMethod               0
PaperlessBilling            0
ContentType                 0
MultiDeviceAccess           0
DeviceRegistered            0
ViewingHoursPerWeek         0
AverageViewingDuration      0
ContentDownloadsPerMonth    0
GenrePreference             0
UserRating                  0
SupportTicketsPerMonth      0
Gender                      0
WatchlistSize               0
ParentalControl             0
SubtitlesEnabled            0
CustomerID                  0
Churn                       0
dtype: int64


In [4]:


# reg = linear_model.LinearRegression()
# reg.fit(df[['']])

df
print(df.dtypes)



AccountAge                    int64
MonthlyCharges              float64
TotalCharges                float64
SubscriptionType             object
PaymentMethod                object
PaperlessBilling             object
ContentType                  object
MultiDeviceAccess            object
DeviceRegistered             object
ViewingHoursPerWeek         float64
AverageViewingDuration      float64
ContentDownloadsPerMonth      int64
GenrePreference              object
UserRating                  float64
SupportTicketsPerMonth        int64
Gender                       object
WatchlistSize                 int64
ParentalControl              object
SubtitlesEnabled             object
CustomerID                   object
Churn                         int64
dtype: object


In [5]:
# Which categorical columns are blowing up?
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
cardinality = df[cat_cols].nunique(dropna=True).sort_values(ascending=False)
print(cardinality.head(20))  # top offenders

CustomerID           243787
GenrePreference           5
PaymentMethod             4
DeviceRegistered          4
SubscriptionType          3
ContentType               3
PaperlessBilling          2
MultiDeviceAccess         2
Gender                    2
ParentalControl           2
SubtitlesEnabled          2
dtype: int64


In [6]:
df_new = df.drop('CustomerID', axis=1)
df_new


Unnamed: 0,AccountAge,MonthlyCharges,TotalCharges,SubscriptionType,PaymentMethod,PaperlessBilling,ContentType,MultiDeviceAccess,DeviceRegistered,ViewingHoursPerWeek,AverageViewingDuration,ContentDownloadsPerMonth,GenrePreference,UserRating,SupportTicketsPerMonth,Gender,WatchlistSize,ParentalControl,SubtitlesEnabled,Churn
0,20,11.055215,221.104302,Premium,Mailed check,No,Both,No,Mobile,36.758104,63.531377,10,Sci-Fi,2.176498,4,Male,3,No,No,0
1,57,5.175208,294.986882,Basic,Credit card,Yes,Movies,No,Tablet,32.450568,25.725595,18,Action,3.478632,8,Male,23,No,Yes,0
2,73,12.106657,883.785952,Basic,Mailed check,Yes,Movies,No,Computer,7.395160,57.364061,23,Fantasy,4.238824,6,Male,1,Yes,Yes,0
3,32,7.263743,232.439774,Basic,Electronic check,No,TV Shows,No,Tablet,27.960389,131.537507,30,Drama,4.276013,2,Male,24,Yes,Yes,0
4,57,16.953078,966.325422,Premium,Electronic check,Yes,TV Shows,No,TV,20.083397,45.356653,20,Comedy,3.616170,4,Female,0,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243782,77,9.639902,742.272460,Basic,Mailed check,No,Movies,No,Computer,13.502729,80.367312,47,Sci-Fi,3.697451,1,Male,8,Yes,No,0
243783,117,13.049257,1526.763053,Premium,Credit card,No,TV Shows,Yes,TV,24.963291,59.818441,35,Comedy,1.449742,4,Male,20,No,No,0
243784,113,14.514569,1640.146267,Premium,Credit card,Yes,TV Shows,No,TV,10.628728,176.186095,44,Action,4.012217,6,Male,13,Yes,Yes,0
243785,7,18.140555,126.983887,Premium,Bank transfer,Yes,TV Shows,No,TV,30.466782,153.386315,36,Fantasy,2.135789,7,Female,5,No,Yes,0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Churn']), 
                                                    df['Churn'], 
                                                    test_size=0.2, 
                                                    random_state=42)

X_train
y_train

66000     0
133065    1
210564    0
156946    0
125381    0
         ..
119879    0
103694    0
131932    0
146867    1
121958    0
Name: Churn, Length: 195029, dtype: int64

In [None]:
# --- Imports ---
import re
import difflib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# df = pd.read_csv("your_file.csv")  # <-- make sure df is defined

# --- 0) Set your intended target name here ---
INTENDED_TARGET = "Churn"  # <-- change if needed

# --- 1) Diagnose columns ---
print("Raw columns:", list(df.columns))
print("n_rows, n_cols =", df.shape)

# --- 2) Clean column names (trim/normalize spaces) ---
def clean_name(c):
    if isinstance(c, str):
        c = c.replace("\xa0", " ")
        c = c.strip()
        c = re.sub(r"\s+", " ", c)
    return c

df = df.rename(columns=clean_name)

# --- 3) Resolve the target column robustly ---
if INTENDED_TARGET in df.columns:
    TARGET = INTENDED_TARGET
else:
    lower_map = {c.lower(): c for c in df.columns if isinstance(c, str)}
    if INTENDED_TARGET.lower() in lower_map:
        TARGET = lower_map[INTENDED_TARGET.lower()]
    else:
        close = difflib.get_close_matches(INTENDED_TARGET, df.columns, n=5, cutoff=0.6)
        raise KeyError(f"Target '{INTENDED_TARGET}' not found. Closest matches: {close}")

print("Using target column:", TARGET)

# --- 4) Features/target ---
X = df.drop(columns=[TARGET], errors='raise')
y = df[TARGET]

# --- 5) Identify numeric & categorical predictors from X only ---
num_cols = X.select_dtypes(include=['number']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns
print(f"{len(num_cols)} numeric, {len(cat_cols)} categorical feature columns.")

# --- 6) Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- 7) Pipelines ---
numeric_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocess = ColumnTransformer([
    ('num', numeric_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols)
], remainder='drop')

model = Pipeline([
    ('preprocess', preprocess),
    ('reg', LinearRegression())
])

# --- 8) Fit the model ---
model.fit(X_train, y_train)

# --- 9) Evaluate (AFTER fit) ---
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))   # version-agnostic RMSE

print("R²:", r2)
print("RMSE:", rmse)
#

Raw columns: ['AccountAge', 'MonthlyCharges', 'TotalCharges', 'SubscriptionType', 'PaymentMethod', 'PaperlessBilling', 'ContentType', 'MultiDeviceAccess', 'DeviceRegistered', 'ViewingHoursPerWeek', 'AverageViewingDuration', 'ContentDownloadsPerMonth', 'GenrePreference', 'UserRating', 'SupportTicketsPerMonth', 'Gender', 'WatchlistSize', 'ParentalControl', 'SubtitlesEnabled', 'CustomerID', 'Churn']
n_rows, n_cols = (243787, 21)
Using target column: Churn
9 numeric, 11 categorical feature columns.
R²: 0.11883745384679734
RMSE: 0.36085504170348504


