# Table of Contents

1. [Stage 1: Imports and Database Details](#Stage-1:-Imports-and-Database-Details)
2. [Stage 2: Fetch Data](#Stage-2:-Fetch-Data)
3. [Stage 3: ETL and Data Cleaning](#Stage-3:-ETL-and-Data-Cleaning)
4. [Stage 4: Prediction](#Stage-4:-Prediction)
5. [Stage 5: Data export](#Stage-5:-Data-export)

## Stage 1: Imports and Database Details

In [27]:
# ===============================
# Environment setup
# ===============================
import json
import os
import subprocess
import sys

def install_if_missing(packages):
    """Ensure every requested package is importable, installing it with pip if not.

    Args:
        packages: Iterable whose items are either a module name (used both as the
            import name and as the pip distribution name) or an
            ``(import_name, pip_name)`` pair for distributions whose PyPI name
            differs from the module name, e.g. ``("sklearn", "scikit-learn")``.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a non-zero status.
    """
    import importlib

    for package in packages:
        if isinstance(package, str):
            import_name = pip_name = package
        else:
            import_name, pip_name = package
        try:
            importlib.import_module(import_name)
        except ImportError:
            print(f"Installing missing package: {pip_name}")
            # Run pip through the kernel's own interpreter so the package lands
            # in the environment this notebook is actually using.
            subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name])
            # Let the import system notice modules installed after start-up.
            importlib.invalidate_caches()

# Third-party packages this notebook imports; each name is both the import name
# and the pip distribution name. The standard-library ``json`` module is
# deliberately not listed: it ships with Python and is not something pip installs.
# NOTE(review): the model loaded in Stage 4 presumably needs scikit-learn to be
# installed in order to unpickle — confirm for this environment.
required_packages = [
    "pandas", "pymongo", "joblib"
]

install_if_missing(required_packages)

# MongoDB connection settings. Each value can be overridden with an environment
# variable, so real connection details never have to be edited into the notebook;
# the fallbacks are the local-development values.
mongo_db_host_port = os.environ.get("MONGO_DB_URI", "mongodb://localhost:27017/")
mongo_db_database_name = os.environ.get("MONGO_DB_DATABASE", "Tech")
mongo_db_collection_name = os.environ.get("MONGO_DB_COLLECTION", "Customers")

## Stage 2: Fetch Data

In [28]:
# ===============================
# Connect to MongoDB and Load the Data
# ===============================
from pymongo import MongoClient
import pandas as pd

# Connect using the settings from Stage 1. The context manager closes the client
# as soon as the documents have been read, instead of leaving it open for the
# rest of the session.
with MongoClient(mongo_db_host_port) as client:
    mongoDB = client[mongo_db_database_name]
    customersCollection = mongoDB[mongo_db_collection_name]

    # Fetch every document; the projection leaves out MongoDB's _id field,
    # so no _id column has to be dropped afterwards.
    cursor = customersCollection.find({}, {"_id": 0})

    # Materialise the cursor while the connection is still open.
    dataForPrediction = pd.DataFrame(list(cursor))

# Untouched snapshot of the fetched data; Stage 5 attaches the predictions to it.
dataForPredictionOriginal = dataForPrediction.copy(deep=True)
dataForPrediction.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Services
0,id7044,Male,0,Yes,No,3.88,Month-to-month,[No],Electronic check,74.82,184.82,"{'PhoneService': 'Yes', 'MultipleLines': 'No',..."
1,id7045,Male,0,No,Yes,29.779905,Month-to-month,[Yes],Credit card (automatic),40.301502,1359.7,"{'PhoneService': 'No', 'MultipleLines': 'Yes',..."
2,id7046,Female,0,Yes,Yes,34.546357,Month-to-month,[Yes],Mailed check,59.778062,1752.55,"{'PhoneService': 'Yes', 'MultipleLines': 'No',..."
3,id7047,Male,0,No,No,1.79,Month-to-month,[Yes],Electronic check,100.46,493.84,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'..."
4,id7048,Female,0,No,Yes,5.58,Month-to-month,[Yes],Electronic check,74.88,74.66,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'..."


In [29]:
# Column names of the freshly loaded frame, shown for reference.
columns = list(dataForPrediction.columns)
columns

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Services']

## Stage 3: ETL and Data Cleaning

In [30]:
# Print five reproducibly sampled Services values to see how the nested field is laid out.
sample_rows = dataForPrediction['Services'].sample(n=5, random_state=1)
for row in sample_rows:
    print("\n", row)


 {'PhoneService': 'No', 'MultipleLines': 'Yes', 'InternetService': 'No', 'OnlineSecurity': 'Yes', 'OnlineBackup': 'No', 'DeviceProtection': 'No internet service', 'TechSupport': 'Yes', 'StreamingTV': 'Yes', 'StreamingMovies': 'Yes'}

 {'PhoneService': 'Yes', 'MultipleLines': 'No', 'InternetService': 'DSL', 'OnlineSecurity': 'No', 'OnlineBackup': 'No', 'DeviceProtection': 'No', 'TechSupport': 'Yes', 'StreamingTV': 'No', 'StreamingMovies': 'No'}

 {'PhoneService': 'No', 'MultipleLines': 'No', 'InternetService': 'DSL', 'OnlineSecurity': 'No internet service', 'OnlineBackup': 'No', 'DeviceProtection': 'Yes', 'TechSupport': 'No', 'StreamingTV': 'No internet service', 'StreamingMovies': 'No internet service'}

 {'PhoneService': 'Yes', 'MultipleLines': 'No', 'InternetService': 'Fiber optic', 'OnlineSecurity': 'Yes', 'OnlineBackup': 'Yes', 'DeviceProtection': 'No internet service', 'TechSupport': 'No', 'StreamingTV': 'No', 'StreamingMovies': 'No internet service'}

 {'PhoneService': 'Yes', 'M

In [31]:
# Services holds one dict per row (see the sample above); rows that carry the
# same structure as a JSON string are decoded first. `json` is imported with the
# other standard-library modules in the environment-setup cell.
services = dataForPrediction['Services'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

# Build the service columns with a single constructor call instead of creating one
# pd.Series per row. A row without a dict yields NaN in every service column and
# is therefore removed by the dropna() in the cleaning step that follows.
services_df = pd.DataFrame(
    [entry if isinstance(entry, dict) else {} for entry in services],
    index=dataForPrediction.index,
)
dataForPrediction = pd.concat([dataForPrediction.drop(columns=['Services']), services_df], axis=1)
dataForPrediction.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,id7044,Male,0,Yes,No,3.88,Month-to-month,[No],Electronic check,74.82,184.82,Yes,No,DSL,No internet service,Yes,No internet service,Yes,No,Yes
1,id7045,Male,0,No,Yes,29.779905,Month-to-month,[Yes],Credit card (automatic),40.301502,1359.7,No,Yes,DSL,No internet service,Yes,Yes,No internet service,Yes,Yes
2,id7046,Female,0,Yes,Yes,34.546357,Month-to-month,[Yes],Mailed check,59.778062,1752.55,Yes,No,DSL,Yes,No,Yes,No,No internet service,No internet service
3,id7047,Male,0,No,No,1.79,Month-to-month,[Yes],Electronic check,100.46,493.84,Yes,Yes,DSL,Yes,No,Yes,No internet service,No,Yes
4,id7048,Female,0,No,Yes,5.58,Month-to-month,[Yes],Electronic check,74.88,74.66,Yes,Yes,DSL,No internet service,No,No,Yes,No,Yes


In [32]:
def _first_item_if_list(value):
    """Return the first element of a non-empty list; return any other value unchanged."""
    if isinstance(value, list) and len(value) > 0:
        return value[0]
    return value


# PaperlessBilling arrives wrapped in a list (e.g. ['Yes']); flatten it to the bare value.
dataForPrediction['PaperlessBilling'] = dataForPrediction['PaperlessBilling'].apply(_first_item_if_list)
dataForPrediction.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,id7044,Male,0,Yes,No,3.88,Month-to-month,No,Electronic check,74.82,184.82,Yes,No,DSL,No internet service,Yes,No internet service,Yes,No,Yes
1,id7045,Male,0,No,Yes,29.779905,Month-to-month,Yes,Credit card (automatic),40.301502,1359.7,No,Yes,DSL,No internet service,Yes,Yes,No internet service,Yes,Yes
2,id7046,Female,0,Yes,Yes,34.546357,Month-to-month,Yes,Mailed check,59.778062,1752.55,Yes,No,DSL,Yes,No,Yes,No,No internet service,No internet service
3,id7047,Male,0,No,No,1.79,Month-to-month,Yes,Electronic check,100.46,493.84,Yes,Yes,DSL,Yes,No,Yes,No internet service,No,Yes
4,id7048,Female,0,No,Yes,5.58,Month-to-month,Yes,Electronic check,74.88,74.66,Yes,Yes,DSL,No internet service,No,No,Yes,No,Yes


In [33]:
# Treat empty / whitespace-only cells as missing, then discard every row that has a gap.
dataForPrediction = dataForPrediction.replace(r'^\s*$', pd.NA, regex=True).dropna()

# Load churn.csv; the prediction data is compared against its columns below.
customerInfo = pd.read_csv("churn.csv")

# Report each shared column whose dtype differs between the two frames.
commonColumns = dataForPrediction.columns.intersection(customerInfo.columns)
for column in commonColumns:
    dtype_pred, dtype_info = dataForPrediction[column].dtype, customerInfo[column].dtype
    if dtype_pred == dtype_info:
        continue
    print(f"Column '{column}' type mismatch: dataForPrediction={dtype_pred}, customerInfo={dtype_info}")

Column 'tenure' type mismatch: dataForPrediction=float64, customerInfo=int64


In [34]:
# Cast tenure to int64 so it matches customerInfo; the cast discards the fractional part.
dataForPrediction = dataForPrediction.astype({'tenure': 'int64'})
dataForPrediction.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,id7044,Male,0,Yes,No,3,Month-to-month,No,Electronic check,74.82,184.82,Yes,No,DSL,No internet service,Yes,No internet service,Yes,No,Yes
1,id7045,Male,0,No,Yes,29,Month-to-month,Yes,Credit card (automatic),40.301502,1359.7,No,Yes,DSL,No internet service,Yes,Yes,No internet service,Yes,Yes
2,id7046,Female,0,Yes,Yes,34,Month-to-month,Yes,Mailed check,59.778062,1752.55,Yes,No,DSL,Yes,No,Yes,No,No internet service,No internet service
3,id7047,Male,0,No,No,1,Month-to-month,Yes,Electronic check,100.46,493.84,Yes,Yes,DSL,Yes,No,Yes,No internet service,No,Yes
4,id7048,Female,0,No,Yes,5,Month-to-month,Yes,Electronic check,74.88,74.66,Yes,Yes,DSL,No internet service,No,No,Yes,No,Yes


In [35]:
# Compare the value sets of low-cardinality columns (at most 10 distinct values in
# a reproducible sample of up to 500 rows) with the same columns in customerInfo.
sample_size = min(500, len(dataForPrediction))
sampleOfDataForPrediction = dataForPrediction.sample(n=sample_size, random_state=42)
for column in sampleOfDataForPrediction.columns:
    sampled_values = sampleOfDataForPrediction[column]
    if sampled_values.nunique() > 10:
        continue

    print(f"\nColumn: {column}")
    values_pred = set(sampled_values.unique())
    if column not in customerInfo.columns:
        continue

    values_info = set(customerInfo[column].unique())
    print("Values in dataForPrediction:", values_pred)
    print("Values in customerInfo:", values_info)

    only_in_pred = values_pred - values_info
    only_in_info = values_info - values_pred
    if not (only_in_pred or only_in_info):
        continue

    print("Differences:")
    if only_in_pred:
        print(" - Values only in dataForPrediction:", only_in_pred)
    if only_in_info:
        print(" - Values only in customerInfo:", only_in_info)


Column: gender
Values in dataForPrediction: {'Male', 'Female'}
Values in customerInfo: {'Male', 'Female'}

Column: SeniorCitizen
Values in dataForPrediction: {np.int64(0), np.int64(1)}
Values in customerInfo: {np.int64(0), np.int64(1)}

Column: Partner
Values in dataForPrediction: {'No', 'Yes'}
Values in customerInfo: {'No', 'Yes'}

Column: Dependents
Values in dataForPrediction: {'No', 'Yes'}
Values in customerInfo: {'No', 'Yes'}

Column: Contract
Values in dataForPrediction: {'Month-to-month', 'One year', 'Two year'}
Values in customerInfo: {'Month-to-month', 'One year', 'Two year'}

Column: PaperlessBilling
Values in dataForPrediction: {'No', 'Yes'}
Values in customerInfo: {'No', 'Yes'}

Column: PaymentMethod
Values in dataForPrediction: {'Bank transfer (automatic)', 'Electronic check', 'Mailed check', 'Credit card (automatic)'}
Values in customerInfo: {'Bank transfer (automatic)', 'Electronic check', 'Mailed check', 'Credit card (automatic)'}

Column: PhoneService
Values in dataFo

In [36]:
# Preview the first five rows of the frame in its current state.
dataForPrediction.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,id7044,Male,0,Yes,No,3,Month-to-month,No,Electronic check,74.82,184.82,Yes,No,DSL,No internet service,Yes,No internet service,Yes,No,Yes
1,id7045,Male,0,No,Yes,29,Month-to-month,Yes,Credit card (automatic),40.301502,1359.7,No,Yes,DSL,No internet service,Yes,Yes,No internet service,Yes,Yes
2,id7046,Female,0,Yes,Yes,34,Month-to-month,Yes,Mailed check,59.778062,1752.55,Yes,No,DSL,Yes,No,Yes,No,No internet service,No internet service
3,id7047,Male,0,No,No,1,Month-to-month,Yes,Electronic check,100.46,493.84,Yes,Yes,DSL,Yes,No,Yes,No internet service,No,Yes
4,id7048,Female,0,No,Yes,5,Month-to-month,Yes,Electronic check,74.88,74.66,Yes,Yes,DSL,No internet service,No,No,Yes,No,Yes


In [37]:
# Count the missing values in each column of the prediction data.
dataForPrediction.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
dtype: int64

In [38]:
# Count the missing values in each column of the churn.csv data.
customerInfo.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [39]:
# Source column name -> snake_case name used from here on.
column_mapping = {
    'customerID': 'customer_id',
    'gender': 'gender',
    'SeniorCitizen': 'senior_citizen',
    'Partner': 'has_partner',
    'Dependents': 'has_dependents',
    'tenure': 'tenure_in_months',
    'PhoneService': 'has_phone_srv',
    'MultipleLines': 'has_multiple_lines',
    'InternetService': 'has_internet_srv',
    'OnlineSecurity': 'has_online_security_srv',
    'OnlineBackup': 'has_online__backup_srv',
    'DeviceProtection': 'has_device_protection_srv',
    'TechSupport': 'has_tech_support_srv',
    'StreamingTV': 'has_streaming_tv_srv',
    'StreamingMovies': 'has_streaming_movies_srv',
    'Contract': 'contract_term',
    'PaperlessBilling': 'has_paperless_bill',
    'PaymentMethod': 'payment_method',
    'MonthlyCharges': 'monthly_charges',
    'TotalCharges': 'total_charges'
}
dataForPrediction = dataForPrediction.rename(columns=column_mapping)

# Add-on services that each become a single "<name>_yes" indicator.
additional_srv_columns = [
    'has_online_security_srv', 'has_online__backup_srv',
    'has_device_protection_srv', 'has_tech_support_srv',
    'has_streaming_tv_srv', 'has_streaming_movies_srv'
]

# (target column, source column, value encoded as 1). Every other value becomes 0.
# Targets that do not exist yet are appended in this order; targets equal to
# their source are overwritten in place.
indicator_specs = [
    # Binary conversion
    ('has_partner', 'has_partner', 'Yes'),
    ('has_dependents', 'has_dependents', 'Yes'),
    ('has_phone_srv', 'has_phone_srv', 'Yes'),
    ('has_multiple_lines_yes', 'has_multiple_lines', 'Yes'),
    # Internet service types
    ('has_dsl', 'has_internet_srv', 'DSL'),
    ('has_fiber', 'has_internet_srv', 'Fiber optic'),
    ('no_internet_srv', 'has_internet_srv', 'No'),
    # Additional services
    *[(col + '_yes', col, 'Yes') for col in additional_srv_columns],
    # Contract term
    ('month_to_month_contract', 'contract_term', 'Month-to-month'),
    ('one_year_contract', 'contract_term', 'One year'),
    ('two_years_contract', 'contract_term', 'Two year'),
    ('has_paperless_bill', 'has_paperless_bill', 'Yes'),
    # Payment methods
    ('electronic_check', 'payment_method', 'Electronic check'),
    ('mailed_check', 'payment_method', 'Mailed check'),
    ('automatic_bank_transfer', 'payment_method', 'Bank transfer (automatic)'),
    ('automatic_credit_card', 'payment_method', 'Credit card (automatic)'),
]
for target, source, positive_value in indicator_specs:
    dataForPrediction[target] = (dataForPrediction[source] == positive_value).astype(int)

# Remove gender together with the categorical columns that were encoded above.
consumed_columns = [
    'has_multiple_lines', 'gender', 'has_internet_srv',
    *additional_srv_columns,
    'contract_term', 'payment_method',
]
dataForPrediction = dataForPrediction.drop(columns=consumed_columns)

# Total charges fix: values that cannot be parsed as numbers become 0.
dataForPrediction['total_charges'] = pd.to_numeric(dataForPrediction['total_charges'], errors='coerce').fillna(0)

dataForPrediction.head()

Unnamed: 0,customer_id,senior_citizen,has_partner,has_dependents,tenure_in_months,has_paperless_bill,monthly_charges,total_charges,has_phone_srv,has_multiple_lines_yes,...,has_tech_support_srv_yes,has_streaming_tv_srv_yes,has_streaming_movies_srv_yes,month_to_month_contract,one_year_contract,two_years_contract,electronic_check,mailed_check,automatic_bank_transfer,automatic_credit_card
0,id7044,0,1,0,3,0,74.82,184.82,1,0,...,1,0,1,1,0,0,1,0,0,0
1,id7045,0,0,1,29,1,40.301502,1359.7,0,1,...,0,1,1,1,0,0,0,0,0,1
2,id7046,0,1,1,34,1,59.778062,1752.55,1,0,...,0,0,0,1,0,0,0,1,0,0
3,id7047,0,0,0,1,1,100.46,493.84,1,1,...,0,0,1,1,0,0,1,0,0,0
4,id7048,0,0,1,5,1,74.88,74.66,1,1,...,1,0,1,1,0,0,1,0,0,0


In [40]:
# Show the dtype of every column after the encoding step.
column_dtypes = dataForPrediction.dtypes
print(column_dtypes)

customer_id                       object
senior_citizen                     int64
has_partner                        int64
has_dependents                     int64
tenure_in_months                   int64
has_paperless_bill                 int64
monthly_charges                  float64
total_charges                    float64
has_phone_srv                      int64
has_multiple_lines_yes             int64
has_dsl                            int64
has_fiber                          int64
no_internet_srv                    int64
has_online_security_srv_yes        int64
has_online__backup_srv_yes         int64
has_device_protection_srv_yes      int64
has_tech_support_srv_yes           int64
has_streaming_tv_srv_yes           int64
has_streaming_movies_srv_yes       int64
month_to_month_contract            int64
one_year_contract                  int64
two_years_contract                 int64
electronic_check                   int64
mailed_check                       int64
automatic_bank_t

## Stage 4: Prediction

In [41]:
import joblib

# joblib is already ensured by install_if_missing() in Stage 1, so no shell-level
# `pip install` is needed in this cell.
# NOTE: joblib.load unpickles the file and can therefore execute arbitrary code —
# only load model files that come from a trusted source.
model = joblib.load("random_forest_best_model.joblib")

customerIdsColumn = dataForPrediction['customer_id']
onlyNeededData = dataForPrediction.drop(columns=['customer_id'])

# Fail with a readable message if the engineered frame lacks a feature the model expects.
missing_features = [name for name in model.feature_names_in_ if name not in onlyNeededData.columns]
if missing_features:
    raise KeyError(f"Features expected by the model are missing: {missing_features}")

# Select and order the columns by the model's own feature list.
onlyNeededData = onlyNeededData[model.feature_names_in_]

# customer_id was never removed from dataForPrediction itself, so the prediction
# can be attached directly without restoring the id column.
dataForPrediction['churn'] = model.predict(onlyNeededData)




[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [42]:
# Display the engineered frame, which now includes the predicted churn column.
dataForPrediction

Unnamed: 0,customer_id,senior_citizen,has_partner,has_dependents,tenure_in_months,has_paperless_bill,monthly_charges,total_charges,has_phone_srv,has_multiple_lines_yes,...,has_streaming_tv_srv_yes,has_streaming_movies_srv_yes,month_to_month_contract,one_year_contract,two_years_contract,electronic_check,mailed_check,automatic_bank_transfer,automatic_credit_card,churn
0,id7044,0,1,0,3,0,74.820000,184.82,1,0,...,0,1,1,0,0,1,0,0,0,0
1,id7045,0,0,1,29,1,40.301502,1359.70,0,1,...,1,1,1,0,0,0,0,0,1,0
2,id7046,0,1,1,34,1,59.778062,1752.55,1,0,...,0,0,1,0,0,0,1,0,0,0
3,id7047,0,0,0,1,1,100.460000,493.84,1,1,...,0,1,1,0,0,1,0,0,0,0
4,id7048,0,0,1,5,1,74.880000,74.66,1,1,...,0,1,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24994,id32038,0,0,0,10,1,91.817322,867.30,1,1,...,1,0,1,0,0,0,0,0,1,1
24995,id32039,0,0,1,4,0,84.500000,476.82,1,1,...,0,0,1,0,0,1,0,0,0,0
24996,id32040,0,0,1,3,1,90.710000,329.33,1,1,...,1,0,1,0,0,1,0,0,0,1
24997,id32041,0,1,1,33,1,76.120814,238.15,1,1,...,0,0,1,0,0,0,1,0,0,0


## Stage 5: Data export

In [43]:
# Lookup from customer id to the predicted label, written as "Yes"/"No".
churn_by_customer = dataForPrediction.set_index("customer_id")["churn"]
churnPredicted = churn_by_customer.map({1: "Yes", 0: "No"})

# Attach the label to the untouched snapshot; ids without a prediction
# (e.g. rows removed by dropna during cleaning) receive NaN.
dataForPredictionOriginal["Churn"] = dataForPredictionOriginal["customerID"].map(churnPredicted)
dataForPredictionOriginal

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Services,Churn
0,id7044,Male,0,Yes,No,3.880000,Month-to-month,[No],Electronic check,74.820000,184.82,"{'PhoneService': 'Yes', 'MultipleLines': 'No',...",No
1,id7045,Male,0,No,Yes,29.779905,Month-to-month,[Yes],Credit card (automatic),40.301502,1359.7,"{'PhoneService': 'No', 'MultipleLines': 'Yes',...",No
2,id7046,Female,0,Yes,Yes,34.546357,Month-to-month,[Yes],Mailed check,59.778062,1752.55,"{'PhoneService': 'Yes', 'MultipleLines': 'No',...",No
3,id7047,Male,0,No,No,1.790000,Month-to-month,[Yes],Electronic check,100.460000,493.84,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'...",No
4,id7048,Female,0,No,Yes,5.580000,Month-to-month,[Yes],Electronic check,74.880000,74.66,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'...",No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,id32039,Male,0,No,Yes,4.400000,Month-to-month,[No],Electronic check,84.500000,476.82,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'...",No
24996,id32040,Male,0,No,Yes,3.130000,Month-to-month,[Yes],Electronic check,90.710000,329.33,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'...",Yes
24997,id32041,Male,0,Yes,Yes,33.154407,Month-to-month,[Yes],Mailed check,76.120814,238.15,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'...",No
24998,id32042,Female,0,No,No,2.480000,Month-to-month,[Yes],Electronic check,100.320000,14.13,"{'PhoneService': 'Yes', 'MultipleLines': 'No',...",Yes


In [44]:
# Flatten the list-wrapped PaperlessBilling values (e.g. ['Yes'] -> 'Yes') in the export frame.
dataForPredictionOriginal['PaperlessBilling'] = dataForPredictionOriginal['PaperlessBilling'].map(
    lambda billing: billing if not (isinstance(billing, list) and billing) else billing[0]
)
dataForPredictionOriginal

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Services,Churn
0,id7044,Male,0,Yes,No,3.880000,Month-to-month,No,Electronic check,74.820000,184.82,"{'PhoneService': 'Yes', 'MultipleLines': 'No',...",No
1,id7045,Male,0,No,Yes,29.779905,Month-to-month,Yes,Credit card (automatic),40.301502,1359.7,"{'PhoneService': 'No', 'MultipleLines': 'Yes',...",No
2,id7046,Female,0,Yes,Yes,34.546357,Month-to-month,Yes,Mailed check,59.778062,1752.55,"{'PhoneService': 'Yes', 'MultipleLines': 'No',...",No
3,id7047,Male,0,No,No,1.790000,Month-to-month,Yes,Electronic check,100.460000,493.84,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'...",No
4,id7048,Female,0,No,Yes,5.580000,Month-to-month,Yes,Electronic check,74.880000,74.66,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'...",No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,id32039,Male,0,No,Yes,4.400000,Month-to-month,No,Electronic check,84.500000,476.82,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'...",No
24996,id32040,Male,0,No,Yes,3.130000,Month-to-month,Yes,Electronic check,90.710000,329.33,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'...",Yes
24997,id32041,Male,0,Yes,Yes,33.154407,Month-to-month,Yes,Mailed check,76.120814,238.15,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'...",No
24998,id32042,Female,0,No,No,2.480000,Month-to-month,Yes,Electronic check,100.320000,14.13,"{'PhoneService': 'Yes', 'MultipleLines': 'No',...",Yes


In [45]:
# Expand the nested Services field of the export frame into one column per service.
# Rows that carry the structure as a JSON string are decoded first; `json` is
# imported with the other standard-library modules in the environment-setup cell.
services = dataForPredictionOriginal['Services'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

# One constructor call instead of one pd.Series per row; a row without a dict
# gets NaN in every service column.
services_df = pd.DataFrame(
    [entry if isinstance(entry, dict) else {} for entry in services],
    index=dataForPredictionOriginal.index,
)
dataForPredictionOriginal = pd.concat([dataForPredictionOriginal.drop(columns=['Services']), services_df], axis=1)
dataForPredictionOriginal

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,...,Churn,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,id7044,Male,0,Yes,No,3.880000,Month-to-month,No,Electronic check,74.820000,...,No,Yes,No,DSL,No internet service,Yes,No internet service,Yes,No,Yes
1,id7045,Male,0,No,Yes,29.779905,Month-to-month,Yes,Credit card (automatic),40.301502,...,No,No,Yes,DSL,No internet service,Yes,Yes,No internet service,Yes,Yes
2,id7046,Female,0,Yes,Yes,34.546357,Month-to-month,Yes,Mailed check,59.778062,...,No,Yes,No,DSL,Yes,No,Yes,No,No internet service,No internet service
3,id7047,Male,0,No,No,1.790000,Month-to-month,Yes,Electronic check,100.460000,...,No,Yes,Yes,DSL,Yes,No,Yes,No internet service,No,Yes
4,id7048,Female,0,No,Yes,5.580000,Month-to-month,Yes,Electronic check,74.880000,...,No,Yes,Yes,DSL,No internet service,No,No,Yes,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,id32039,Male,0,No,Yes,4.400000,Month-to-month,No,Electronic check,84.500000,...,No,Yes,Yes,DSL,Yes,No,No,No,No,No internet service
24996,id32040,Male,0,No,Yes,3.130000,Month-to-month,Yes,Electronic check,90.710000,...,Yes,Yes,Yes,Fiber optic,No internet service,No internet service,No,Yes,Yes,No
24997,id32041,Male,0,Yes,Yes,33.154407,Month-to-month,Yes,Mailed check,76.120814,...,No,Yes,Yes,Fiber optic,No,Yes,No,No,No,No internet service
24998,id32042,Female,0,No,No,2.480000,Month-to-month,Yes,Electronic check,100.320000,...,Yes,Yes,No,Fiber optic,No,No,No,Yes,Yes,No


In [46]:
# Keep exactly the columns of customerInfo (churn.csv), in the same order.
export_columns = list(customerInfo.columns)
dataForPredictionOriginal = dataForPredictionOriginal[export_columns]
dataForPredictionOriginal

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,id7044,Male,0,Yes,No,3.880000,Yes,No,DSL,No internet service,...,No internet service,Yes,No,Yes,Month-to-month,No,Electronic check,74.820000,184.82,No
1,id7045,Male,0,No,Yes,29.779905,No,Yes,DSL,No internet service,...,Yes,No internet service,Yes,Yes,Month-to-month,Yes,Credit card (automatic),40.301502,1359.7,No
2,id7046,Female,0,Yes,Yes,34.546357,Yes,No,DSL,Yes,...,Yes,No,No internet service,No internet service,Month-to-month,Yes,Mailed check,59.778062,1752.55,No
3,id7047,Male,0,No,No,1.790000,Yes,Yes,DSL,Yes,...,Yes,No internet service,No,Yes,Month-to-month,Yes,Electronic check,100.460000,493.84,No
4,id7048,Female,0,No,Yes,5.580000,Yes,Yes,DSL,No internet service,...,No,Yes,No,Yes,Month-to-month,Yes,Electronic check,74.880000,74.66,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,id32039,Male,0,No,Yes,4.400000,Yes,Yes,DSL,Yes,...,No,No,No,No internet service,Month-to-month,No,Electronic check,84.500000,476.82,No
24996,id32040,Male,0,No,Yes,3.130000,Yes,Yes,Fiber optic,No internet service,...,No,Yes,Yes,No,Month-to-month,Yes,Electronic check,90.710000,329.33,Yes
24997,id32041,Male,0,Yes,Yes,33.154407,Yes,Yes,Fiber optic,No,...,No,No,No,No internet service,Month-to-month,Yes,Mailed check,76.120814,238.15,No
24998,id32042,Female,0,No,No,2.480000,Yes,No,Fiber optic,No,...,No,Yes,Yes,No,Month-to-month,Yes,Electronic check,100.320000,14.13,Yes


In [47]:
# Write the labelled customers to disk without the DataFrame index column.
dataForPredictionOriginal.to_csv(path_or_buf="predicted_customers.csv", index=False)