In [4]:
import pandas as pd
import os

# The path is relative to the notebook's working directory; `__file__` is not
# defined inside a notebook kernel, so it cannot be used to anchor the path.
RAW_CSV_PATH = "data/raw/telco_churn.csv"
df = pd.read_csv(RAW_CSV_PATH)


In [6]:
def _is_numeric_convertible(series):
    """Return True when ``pd.to_numeric`` accepts ``series`` without a ValueError."""
    try:
        pd.to_numeric(series)
    except ValueError:
        return False
    return True


print("Checking columns for non-numeric data...")

# Keep the name of every column that pandas cannot parse as numbers as-is.
problem_columns = [
    name for name in df.columns if not _is_numeric_convertible(df[name])
]

if not problem_columns:
    print("\nNo problematic columns found. The issue might be in the profiling tool itself.")
else:
    print("\nProblematic columns with non-numeric data:")
    for name in problem_columns:
        print(f"- {name}")


Checking columns for non-numeric data...



Problematic columns with non-numeric data:
- customerID
- gender
- Partner
- Dependents
- PhoneService
- MultipleLines
- InternetService
- OnlineSecurity
- OnlineBackup
- DeviceProtection
- TechSupport
- StreamingTV
- StreamingMovies
- Contract
- PaperlessBilling
- PaymentMethod
- TotalCharges
- Churn


In [9]:
# Peek at five randomly chosen rows (no seed is set, so the selection differs per run).
df.sample(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
5723,7824-PANSQ,Male,0,No,No,58,Yes,Yes,Fiber optic,No,...,No,Yes,No,No,Month-to-month,No,Bank transfer (automatic),80.65,4807.35,No
6161,4343-EJVQB,Male,0,No,No,7,Yes,No,Fiber optic,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,74.35,533.6,No
3838,7714-YXSMB,Female,0,No,No,26,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),100.5,2599.95,No
766,5175-WLYXL,Male,0,No,No,22,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,78.85,1600.25,No
183,5524-KHNJP,Male,0,Yes,Yes,33,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,One year,No,Credit card (automatic),74.75,2453.3,No


In [12]:
# Re-infer each column's dtype using pandas' nullable extension types
# (the recorded dtypes below show Int64 / Float64 / string[python]).
df = df.convert_dtypes()

In [13]:
# Inspect the resulting dtypes; in the recorded output TotalCharges is still a
# string column rather than a numeric one.
df.dtypes

customerID          string[python]
gender              string[python]
SeniorCitizen                Int64
Partner             string[python]
Dependents          string[python]
tenure                       Int64
PhoneService        string[python]
MultipleLines       string[python]
InternetService     string[python]
OnlineSecurity      string[python]
OnlineBackup        string[python]
DeviceProtection    string[python]
TechSupport         string[python]
StreamingTV         string[python]
StreamingMovies     string[python]
Contract            string[python]
PaperlessBilling    string[python]
PaymentMethod       string[python]
MonthlyCharges             Float64
TotalCharges        string[python]
Churn               string[python]
dtype: object

In [14]:
print("Checking columns for non-numeric data...")

# Re-run the numeric-conversion check against the current state of `df`:
# a column is recorded whenever pd.to_numeric raises ValueError for it.
problem_columns = []
for column_name in df.columns:
    try:
        pd.to_numeric(df[column_name])
    except ValueError:
        problem_columns.append(column_name)

if not problem_columns:
    print("\nNo problematic columns found. The issue might be in the profiling tool itself.")
else:
    print("\nProblematic columns with non-numeric data:")
    # One "- <name>" line per offending column.
    print("\n".join(f"- {column_name}" for column_name in problem_columns))


Checking columns for non-numeric data...

Problematic columns with non-numeric data:
- customerID
- gender
- Partner
- Dependents
- PhoneService
- MultipleLines
- InternetService
- OnlineSecurity
- OnlineBackup
- DeviceProtection
- TechSupport
- StreamingTV
- StreamingMovies
- Contract
- PaperlessBilling
- PaymentMethod
- TotalCharges
- Churn


In [26]:
# Spot-check one customer's TotalCharges with a single .loc[row_mask, column]
# lookup instead of chained indexing (.loc[...][...]).
# NOTE: the recorded output is a float, i.e. it was produced after TotalCharges
# had already been converted to a numeric dtype.
df.loc[df['customerID'] == '2775-SEFEE', 'TotalCharges'].iloc[0]

np.float64(0.0)

In [21]:
# Trim leading/trailing whitespace from every TotalCharges entry.
total_charges_stripped = df['TotalCharges'].str.strip()
df['TotalCharges'] = total_charges_stripped

In [25]:
# Convert TotalCharges to a numeric column and replace missing values with 0.
# The dtype guard keeps this cell re-runnable: once the column is numeric the
# `.str` accessor is no longer available, so an unguarded second run would
# raise instead of being a no-op.
if not pd.api.types.is_numeric_dtype(df['TotalCharges']):
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].str.strip())

df = df.fillna({'TotalCharges': 0})

In [27]:
# Count how many rows fall into each distinct Churn value.
df['Churn'].value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: Int64

In [None]:
dvc remote add -d gdrive_remote /content/drive/MyDrive/my_churn_project/dvc_storage





In [None]:
%%bash
# End-to-end data/model versioning pipeline: each stage runs a Python script,
# tracks its outputs with DVC, commits the resulting .dvc pointer files to Git
# and pushes the data to the DVC remote.
# `%%bash` must stay on the first line of the cell so Jupyter runs it with bash
# instead of trying to parse it as Python.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -euo pipefail

# --- Initial Setup (Run Once Manually) ---
# dvc init
# git add .dvc .gitignore && git commit -m "Initialize DVC"

# version_artifacts MESSAGE PATH [PATH...]
#   Track each PATH with DVC, stage the matching PATH.dvc files, commit them
#   with MESSAGE (skipped when nothing is staged) and push to the DVC remote.
version_artifacts() {
    local message="$1"
    shift

    dvc add "$@"

    local pointer_files=()
    local artifact
    for artifact in "$@"; do
        pointer_files+=("${artifact}.dvc")
    done
    git add "${pointer_files[@]}"

    # `git commit` exits non-zero when the index holds no changes; under
    # `set -e` that would abort the whole run whenever a stage reproduces
    # identical output, so only commit when something is actually staged.
    if git diff --cached --quiet; then
        echo "Nothing new to commit for: ${message}"
    else
        git commit -m "${message}"
    fi

    dvc push
}

# --- Ingest Raw Data ---
echo "--- Running Data Ingestion ---"
python scripts/ingest.py
version_artifacts "Version raw ingested data" \
    data/raw/telco_churn.csv data/raw/hf_bank_customer_support.csv
echo "--- Data Ingestion Complete ---"
echo ""

# --- Validate Data ---
echo "--- Running Data Validation ---"
python scripts/validate.py
version_artifacts "Version validation report" data/validation_report.csv
echo "--- Data Validation Complete ---"
echo ""

# --- Prepare Data ---
echo "--- Running Data Preparation ---"
python scripts/prepare.py
version_artifacts "Version prepared data" data/prepared/customer_data_cleaned.csv
echo "--- Data Preparation Complete ---"
echo ""

# --- Transform Features ---
echo "--- Running Feature Transformation ---"
python scripts/transform.py
version_artifacts "Version transformed features" data/processed/customer_features.db
echo "--- Feature Transformation Complete ---"
echo ""

# --- Train Model ---
echo "--- Running Model Training ---"
python scripts/model_training.py --db-path data/processed/customer_features.db
version_artifacts "Version trained model" models/model.pkl
echo "--- Model Training Complete ---"
echo ""
