In [1]:
import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
# Read the processed Telco CSV into a DataFrame. The path is relative, so it
# resolves against the notebook's current working directory.
telco_csv = "../data/processed/telco_clean.csv"
df = pd.read_csv(telco_csv)

In [5]:
# Feature matrix: every column except customerID, the regression target
# (MonthlyCharges) and Churn.
# NOTE(review): TotalCharges is kept as a feature. If it is accumulated from
# MonthlyCharges over the customer's tenure it leaks the target and would
# inflate the scores -- confirm this is intended.
X = df.drop(columns=['customerID', 'MonthlyCharges', 'Churn'])

# Regression target.
y = df['MonthlyCharges']

In [8]:
# Partition the feature columns by dtype: object-typed columns form the
# categorical group, all remaining columns form the numeric group.
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

# Echo both groups (categorical first, then numeric) as a quick sanity check.
print(cat_cols)
print(num_cols)

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')
Index(['SeniorCitizen', 'tenure', 'TotalCharges'], dtype='object')


In [9]:
# Column-wise preprocessing:
#   * "num": standardize the numeric columns.
#   * "cat": one-hot encode the categorical columns; handle_unknown="ignore"
#     makes categories not seen during fit encode as all zeros instead of
#     raising at transform time.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [10]:
# End-to-end estimator: run the column preprocessing, then fit an
# L2-regularized linear regression (Ridge, alpha=1.0) on the result.
ridge_model = Ridge(alpha=1.0)
pipeline_steps = [
    ("preprocess", preprocessor),
    ("model", ridge_model),
]
pipe = Pipeline(steps=pipeline_steps)

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
# Fit the pipeline on the training split, then predict on the held-out split.
# fit() returns the fitted pipeline itself, so `pipe` is fitted in place and
# the two calls can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

In [21]:
# Score the held-out predictions: mean squared error and the coefficient of
# determination (R^2).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("R2 Score:", r2)

MSE: 1.075837078099733
R2 Score: 0.9987826768926218


In [22]:
# Persist the fitted pipeline (preprocessing + Ridge model) to disk.
# The target directory is created first so that a fresh checkout without a
# ../models folder does not fail with FileNotFoundError after training.
# joblib.dump stays the last expression so the cell still displays the list
# of written file names.
model_path = "../models/regression.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipe, model_path)

['../models/regression.pkl']