### Project 3: Big Data
Part 1: MongoDB

Import all libraries needed for this project, including the custom functions defined in the `Functions` Python script.

In [1]:
import pymongo
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
import Functions as func
from pymongo import MongoClient

Return only the documents that contain all fields, and drop the "_id" field.

In [2]:
# Connect to the local MongoDB instance and select the Customers collection of the BDA database
client = MongoClient('mongodb://localhost:27017/')
db = client['BDA']
collection = db['Customers']

# Start from an empty frame so that `df` is always (re)defined by this cell.
# Without this, an empty collection would leave `df` undefined on a fresh kernel
# (or silently stale from a previous run), and the following cells would either
# raise a NameError or work on old data.
df = pd.DataFrame()

# A sample document is fetched using find_one() to determine the field names.
# This relies on the first document containing every column, which was checked
# manually in MongoDB Compass.
sample_document = collection.find_one()
if sample_document is None:
    print("The collection is empty.")
else:
    # Get all field names except '_id'
    field_names = [field for field in sample_document.keys() if field != '_id']

    # Construct the query to check the existence of all fields
    query = {field: {"$exists": True} for field in field_names}

    # Construct the projection to include all fields except '_id'
    projection = {field: 1 for field in field_names}
    projection["_id"] = 0

    # Fetch the documents that match the query and apply the projection
    documents = list(collection.find(query, projection))

    # Convert the data to a pandas DataFrame
    df = pd.DataFrame(documents)



In [3]:
# Alternative approach: collect the union of field names across all documents instead of relying on find_one(), so there is no need to verify in MongoDB Compass that the first document has every column

# collection = db['Customers']
# all_fields = set()
# for document in collection.find():
#     all_fields.update(document.keys())
# print(all_fields)
#
# query1 = \
#     {'MonthlyCharges':  { '$exists': True },
#     'SeniorCitizen':    { '$exists': True },
#     'PaymentMethod':    { '$exists': True },
#     'PaperlessBilling': { '$exists': True },
#     'Dependents':       { '$exists': True },
#     'Services':         { '$exists': True },
#     'Dependents':       { '$exists': True },
#     'Services':         { '$exists': True },
#     'customerID':       { '$exists': True },
#     'tenure':           { '$exists': True },
#     'Partner':          { '$exists': True },
#     'gender':           { '$exists': True },
#     'TotalCharges':     { '$exists': True },
#     'Contract':         { '$exists': True }}
# #print(query1)
# cursor = db.Customers.find(query1,{'_id' : 0 })
# df = pd.DataFrame(list(cursor))
# df


Check it

In [4]:
# Display the DataFrame built from the MongoDB query to verify the load
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Services
0,id7044,Male,0,Yes,No,15.208851,Month-to-month,[No],Credit card (automatic),45.682782,651.55,"{'PhoneService': 'Yes', 'MultipleLines': 'No',..."
1,id7045,Male,0,No,Yes,29.779905,Month-to-month,[Yes],Credit card (automatic),40.301502,1359.7,"{'PhoneService': 'No', 'MultipleLines': 'Yes',..."
2,id7046,Female,0,Yes,Yes,34.546357,Month-to-month,[Yes],Mailed check,59.778062,1752.55,"{'PhoneService': 'Yes', 'MultipleLines': 'No',..."
3,id7047,Male,0,No,No,37.264434,Month-to-month,[Yes],Electronic check,58.392744,541.9,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'..."
4,id7049,Female,1,No,No,31.188870,Two year,[No],Bank transfer (automatic),87.322896,845.6,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'..."
...,...,...,...,...,...,...,...,...,...,...,...,...
23418,id32038,Male,0,No,No,10.304650,Month-to-month,[Yes],Credit card (automatic),91.817322,867.3,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'..."
23419,id32039,Male,0,No,Yes,15.826004,Month-to-month,[No],Mailed check,71.692238,7962.2,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'..."
23420,id32040,Male,0,No,Yes,33.852572,One year,[Yes],Mailed check,86.958880,374.8,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'..."
23421,id32041,Male,0,Yes,Yes,33.154407,Month-to-month,[Yes],Mailed check,76.120814,238.15,"{'PhoneService': 'Yes', 'MultipleLines': 'Yes'..."


Removing the Array and "Flattening" the SubDocuments

In [5]:
# Expand the nested Services sub-document: every key of the per-row dict becomes its own column.
# The check on the first row guards against re-running the cell after Services is already flattened.
services_is_nested = 'Services' in df.columns and isinstance(df['Services'].iloc[0], dict)
if services_is_nested:
    services_df = df['Services'].apply(pd.Series)

    # Replace the nested column with the expanded ones (joined on the row index)
    df = df.drop(columns=['Services']).join(services_df)

# PaperlessBilling is stored as an array (e.g. ['Yes']); collapse it to a plain 'Yes' / 'No' value
if 'PaperlessBilling' in df.columns:
    df['PaperlessBilling'] = [
        'Yes' if 'Yes' in billing else 'No'
        for billing in df['PaperlessBilling']
    ]

# Show the cleaned DataFrame
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,id7044,Male,0,Yes,No,15.208851,Month-to-month,No,Credit card (automatic),45.682782,651.55,Yes,No,DSL,No internet service,Yes,No internet service,Yes,No,Yes
1,id7045,Male,0,No,Yes,29.779905,Month-to-month,Yes,Credit card (automatic),40.301502,1359.7,No,Yes,DSL,No internet service,Yes,Yes,No internet service,Yes,Yes
2,id7046,Female,0,Yes,Yes,34.546357,Month-to-month,Yes,Mailed check,59.778062,1752.55,Yes,No,DSL,Yes,No,Yes,No,No internet service,No internet service
3,id7047,Male,0,No,No,37.264434,Month-to-month,Yes,Electronic check,58.392744,541.9,Yes,Yes,DSL,Yes,No,Yes,No internet service,No,Yes
4,id7049,Female,1,No,No,31.188870,Two year,No,Bank transfer (automatic),87.322896,845.6,Yes,Yes,Fiber optic,No,Yes,Yes,No internet service,No internet service,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23418,id32038,Male,0,No,No,10.304650,Month-to-month,Yes,Credit card (automatic),91.817322,867.3,Yes,Yes,Fiber optic,No internet service,Yes,Yes,Yes,Yes,No internet service
23419,id32039,Male,0,No,Yes,15.826004,Month-to-month,No,Mailed check,71.692238,7962.2,Yes,Yes,DSL,Yes,No,No,No,No,No internet service
23420,id32040,Male,0,No,Yes,33.852572,One year,Yes,Mailed check,86.958880,374.8,Yes,Yes,Fiber optic,No internet service,No internet service,No,Yes,Yes,No
23421,id32041,Male,0,Yes,Yes,33.154407,Month-to-month,Yes,Mailed check,76.120814,238.15,Yes,Yes,Fiber optic,No,Yes,No,No,No,No internet service


## Checking NaNs and Blanks

In [6]:
# Count NaN / None values per column
nan_check = df.isna().sum()

# Count blank strings per column (object/string columns only; numeric columns count as 0).
# Stripping and comparing against '' catches both whitespace-only values and truly
# empty strings; `str.isspace()` alone returns False for '' and would miss them.
blank_check = df.apply(lambda x: x.str.strip().eq('').sum() if x.dtype == "object" else 0)

# Combine the results: total of NaNs and blanks per column
missing_values_check = nan_check + blank_check

# Display the results
print(missing_values_check)

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        5
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
dtype: int64


As in the dataset from the previous ML project, only TotalCharges contains blanks, so we can apply the cleaning function built for that project without any adjustments.

In [7]:
# Run the shared cleaning routine from the Functions module on the flattened frame.
# NOTE(review): preprocess_data is defined outside this notebook; judging by the output
# below it lower-cases the column names, makes customerid numeric and one-hot encodes
# the categorical columns — confirm against Functions.py.
# NOTE(review): this cell rebinds `df`, so re-running it feeds already-processed data
# back into preprocess_data — re-run from the load cell instead.
df = func.preprocess_data(df)
df



Unnamed: 0,customerid,seniorcitizen,tenure,monthlycharges,totalcharges,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,...,deviceprotection_Yes,techsupport_No,techsupport_No internet service,techsupport_Yes,streamingtv_No,streamingtv_No internet service,streamingtv_Yes,streamingmovies_No,streamingmovies_No internet service,streamingmovies_Yes
0,7044.0,0.0,15.208851,45.682782,651.55,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,7045.0,0.0,29.779905,40.301502,1359.70,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,7046.0,0.0,34.546357,59.778062,1752.55,1.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,7047.0,0.0,37.264434,58.392744,541.90,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,7049.0,1.0,31.188870,87.322896,845.60,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23418,32038.0,0.0,10.304650,91.817322,867.30,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
23419,32039.0,0.0,15.826004,71.692238,7962.20,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
23420,32040.0,0.0,33.852572,86.958880,374.80,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
23421,32041.0,0.0,33.154407,76.120814,238.15,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


Slight adjustments to the column names to ensure uniformity for the ML model

In [8]:
# Normalise the one-hot column names so they match the names the model expects.
#
# Step 1: strip the 'internet service' suffix, then turn every whitespace run into '_'.
# The regex is written as a raw string: '\s' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
renamed_columns = (
    df.columns
    .str.replace('internet service', '', regex=False)
    .str.replace(r'\s+', '_', regex=True)
)

# Step 2: literal (non-regex) replacements, applied strictly in this order.
# regex=False is passed explicitly because several patterns contain '(' and ')':
# treated as a regular expression they would be read as groups and never match
# the literal column names (older pandas versions defaulted to regex=True).
literal_renames = [
    ('_', ' '),
    (' ', '_'),
    ('No_internet_service', 'No internet service'),
    ('Fiber_optic', 'Fiber optic'),
    ('No_phone_service', 'No phone service'),
    ('No internet service', 'No'),
    ('No phone service', 'No'),
    ('paymentmethod_Bank_transfer_(automatic)', 'paymentmethod_Bank transfer (automatic)'),
    ('paymentmethod_Credit_card_(automatic)', 'paymentmethod_Credit card (automatic)'),
    ('paymentmethod_Electronic_check', 'paymentmethod_Electronic check'),
    ('paymentmethod_Mailed_check', 'paymentmethod_Mailed check'),
    ('contract_One_year', 'contract_One year'),
    ('contract_Two_year', 'contract_Two year'),
    ('No_', 'No'),
]
for old_text, new_text in literal_renames:
    renamed_columns = renamed_columns.str.replace(old_text, new_text, regex=False)

# NOTE(review): the '..._No internet service' / '..._No phone service' columns end up
# with the same label as the plain '..._No' columns, so the frame carries duplicate
# column labels after this cell. Presumably the model was trained on the same layout —
# confirm before changing this.
df.columns = renamed_columns
df

Unnamed: 0,customerid,seniorcitizen,tenure,monthlycharges,totalcharges,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,...,deviceprotection_Yes,techsupport_No,techsupport_No.1,techsupport_Yes,streamingtv_No,streamingtv_No.1,streamingtv_Yes,streamingmovies_No,streamingmovies_No.1,streamingmovies_Yes
0,7044.0,0.0,15.208851,45.682782,651.55,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,7045.0,0.0,29.779905,40.301502,1359.70,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,7046.0,0.0,34.546357,59.778062,1752.55,1.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,7047.0,0.0,37.264434,58.392744,541.90,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,7049.0,1.0,31.188870,87.322896,845.60,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23418,32038.0,0.0,10.304650,91.817322,867.30,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
23419,32039.0,0.0,15.826004,71.692238,7962.20,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
23420,32040.0,0.0,33.852572,86.958880,374.80,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
23421,32041.0,0.0,33.154407,76.120814,238.15,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


Reorganize column order for model

In [9]:
# Load the trained churn classifier from disk.
# NOTE(review): load_model / predict_churn live in the Functions module, which is not shown here.
clf = func.load_model('churn_model.pkl')

# Feature order used when the model was trained (one line per encoded source column)
expected_order = [
    'customerid', 'seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges',
    'gender_Female', 'gender_Male',
    'partner_No', 'partner_Yes',
    'dependents_No', 'dependents_Yes',
    'phoneservice_No', 'phoneservice_Yes',
    'multiplelines_No', 'multiplelines_Yes',
    'internetservice_DSL', 'internetservice_Fiber optic', 'internetservice_No',
    'onlinesecurity_No', 'onlinesecurity_Yes',
    'onlinebackup_No', 'onlinebackup_Yes',
    'deviceprotection_No', 'deviceprotection_Yes',
    'techsupport_No', 'techsupport_Yes',
    'streamingtv_No', 'streamingtv_Yes',
    'streamingmovies_No', 'streamingmovies_Yes',
    'contract_Month-to-month', 'contract_One year', 'contract_Two year',
    'paperlessbilling_No', 'paperlessbilling_Yes',
    'paymentmethod_Bank transfer (automatic)', 'paymentmethod_Credit card (automatic)',
    'paymentmethod_Electronic check', 'paymentmethod_Mailed check',
]

# Keep the expected columns that are present in df, in training order;
# 'customerid' is an identifier, not a feature, so it is left out.
feature_columns = [
    name for name in expected_order
    if name != 'customerid' and name in df.columns
]
features = df[feature_columns]

# Run the model on the selected features
predictions = func.predict_churn(clf, features)

# Attach the predictions to the original DataFrame as a new column
df['predicted_churn'] = predictions

# Print the DataFrame with predictions
print(df)

       customerid  seniorcitizen     tenure  monthlycharges  totalcharges  \
0          7044.0            0.0  15.208851       45.682782        651.55   
1          7045.0            0.0  29.779905       40.301502       1359.70   
2          7046.0            0.0  34.546357       59.778062       1752.55   
3          7047.0            0.0  37.264434       58.392744        541.90   
4          7049.0            1.0  31.188870       87.322896        845.60   
...           ...            ...        ...             ...           ...   
23418     32038.0            0.0  10.304650       91.817322        867.30   
23419     32039.0            0.0  15.826004       71.692238       7962.20   
23420     32040.0            0.0  33.852572       86.958880        374.80   
23421     32041.0            0.0  33.154407       76.120814        238.15   
23422     32042.0            0.0  32.327198       98.706364       2021.20   

       gender_Female  gender_Male  partner_No  partner_Yes  dependents_No  

In [10]:
# Display the DataFrame again, now including the predicted_churn column
df

Unnamed: 0,customerid,seniorcitizen,tenure,monthlycharges,totalcharges,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,...,techsupport_No,techsupport_No.1,techsupport_Yes,streamingtv_No,streamingtv_No.1,streamingtv_Yes,streamingmovies_No,streamingmovies_No.1,streamingmovies_Yes,predicted_churn
0,7044.0,0.0,15.208851,45.682782,651.55,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,7045.0,0.0,29.779905,40.301502,1359.70,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,7046.0,0.0,34.546357,59.778062,1752.55,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,7047.0,0.0,37.264434,58.392744,541.90,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,7049.0,1.0,31.188870,87.322896,845.60,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23418,32038.0,0.0,10.304650,91.817322,867.30,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
23419,32039.0,0.0,15.826004,71.692238,7962.20,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
23420,32040.0,0.0,33.852572,86.958880,374.80,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
23421,32041.0,0.0,33.154407,76.120814,238.15,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


Transforming the dataframe back into standard format

In [11]:
# Binary attributes whose '<name>_Yes' indicator column decodes to a plain Yes/No answer
items = ["streamingtv", "streamingmovies", "techsupport", "onlinebackup", "deviceprotection",
        "onlinesecurity", "multiplelines", "phoneservice", "paperlessbilling", "partner", "dependents"]

# Lookup used to turn an "equals 1" mask into the Yes/No labels
yes_no_labels = {True: 'Yes', False: 'No'}

for item in items:
    # Anything other than 1 in the '_Yes' indicator is reported as 'No'
    df[item] = df[f'{item}_Yes'].eq(1).map(yes_no_labels)

# `predicted_churn` and `seniorcitizen` are 0/1 flags: relabel them the same way
df['predicted_churn'] = df['predicted_churn'].eq(1).map(yes_no_labels)
df['seniorcitizen'] = df['seniorcitizen'].eq(1).map(yes_no_labels)

# Payment method: pick the indicator column holding the row maximum, keep the text after the last '_'
payment_columns = ['paymentmethod_Credit card (automatic)', 'paymentmethod_Electronic check',
                   'paymentmethod_Mailed check', 'paymentmethod_Bank transfer (automatic)']
df['paymentmethod'] = df[payment_columns].idxmax(axis=1)
df['paymentmethod'] = df['paymentmethod'].str.split('_').str[-1]

# Internet service: same decoding
internet_columns = ['internetservice_DSL', 'internetservice_Fiber optic', 'internetservice_No']
df['internetservice'] = df[internet_columns].idxmax(axis=1)
df['internetservice'] = df['internetservice'].str.split('_').str[-1]

# Gender: 'Female' when the gender_Female indicator is 1, otherwise 'Male'
df['gender'] = df['gender_Female'].eq(1).map({True: 'Female', False: 'Male'})

# Contract: same decoding as payment method / internet service
contract_columns = ['contract_Month-to-month', 'contract_One year', 'contract_Two year']
df['contract'] = df[contract_columns].idxmax(axis=1)
df['contract'] = df['contract'].str.split('_').str[-1]

# Drop the encoded columns: any column whose name contains one of these markers
encoded_markers = ['_Yes', '_No', 'paymentmethod_', 'internetservice_', 'gender_', 'contract_']
encoded_columns = [
    col for col in df.columns
    if any(marker in col for marker in encoded_markers)
]
df.drop(columns=encoded_columns, inplace=True)


df

Unnamed: 0,customerid,seniorcitizen,tenure,monthlycharges,totalcharges,predicted_churn,streamingtv,streamingmovies,techsupport,onlinebackup,...,onlinesecurity,multiplelines,phoneservice,paperlessbilling,partner,dependents,paymentmethod,internetservice,gender,contract
0,7044.0,No,15.208851,45.682782,651.55,No,No,Yes,Yes,Yes,...,No,No,Yes,No,Yes,No,Credit card (automatic),DSL,Male,Month-to-month
1,7045.0,No,29.779905,40.301502,1359.70,No,Yes,Yes,No,Yes,...,No,Yes,No,Yes,No,Yes,Credit card (automatic),DSL,Male,Month-to-month
2,7046.0,No,34.546357,59.778062,1752.55,No,No,No,No,No,...,Yes,No,Yes,Yes,Yes,Yes,Mailed check,DSL,Female,Month-to-month
3,7047.0,No,37.264434,58.392744,541.90,No,No,Yes,No,No,...,Yes,Yes,Yes,Yes,No,No,Electronic check,DSL,Male,Month-to-month
4,7049.0,Yes,31.188870,87.322896,845.60,No,No,No,No,Yes,...,No,Yes,Yes,No,No,No,Bank transfer (automatic),Fiber optic,Female,Two year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23418,32038.0,No,10.304650,91.817322,867.30,Yes,Yes,No,Yes,Yes,...,No,Yes,Yes,Yes,No,No,Credit card (automatic),Fiber optic,Male,Month-to-month
23419,32039.0,No,15.826004,71.692238,7962.20,No,No,No,No,No,...,Yes,Yes,Yes,No,No,Yes,Mailed check,DSL,Male,Month-to-month
23420,32040.0,No,33.852572,86.958880,374.80,No,Yes,No,Yes,No,...,No,Yes,Yes,Yes,No,Yes,Mailed check,Fiber optic,Male,One year
23421,32041.0,No,33.154407,76.120814,238.15,No,No,No,No,Yes,...,No,Yes,Yes,Yes,Yes,Yes,Mailed check,Fiber optic,Male,Month-to-month


Let's move the predicted churn to the right of customerid to highlight our predicted churn

In [12]:
# Put the identifier and the prediction first; every other column keeps its relative order
leading_columns = ['customerid', 'predicted_churn']
remaining_columns = [col for col in df.columns if col not in leading_columns]
df = df[leading_columns + remaining_columns]

df

Unnamed: 0,customerid,predicted_churn,seniorcitizen,tenure,monthlycharges,totalcharges,streamingtv,streamingmovies,techsupport,onlinebackup,...,onlinesecurity,multiplelines,phoneservice,paperlessbilling,partner,dependents,paymentmethod,internetservice,gender,contract
0,7044.0,No,No,15.208851,45.682782,651.55,No,Yes,Yes,Yes,...,No,No,Yes,No,Yes,No,Credit card (automatic),DSL,Male,Month-to-month
1,7045.0,No,No,29.779905,40.301502,1359.70,Yes,Yes,No,Yes,...,No,Yes,No,Yes,No,Yes,Credit card (automatic),DSL,Male,Month-to-month
2,7046.0,No,No,34.546357,59.778062,1752.55,No,No,No,No,...,Yes,No,Yes,Yes,Yes,Yes,Mailed check,DSL,Female,Month-to-month
3,7047.0,No,No,37.264434,58.392744,541.90,No,Yes,No,No,...,Yes,Yes,Yes,Yes,No,No,Electronic check,DSL,Male,Month-to-month
4,7049.0,No,Yes,31.188870,87.322896,845.60,No,No,No,Yes,...,No,Yes,Yes,No,No,No,Bank transfer (automatic),Fiber optic,Female,Two year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23418,32038.0,Yes,No,10.304650,91.817322,867.30,Yes,No,Yes,Yes,...,No,Yes,Yes,Yes,No,No,Credit card (automatic),Fiber optic,Male,Month-to-month
23419,32039.0,No,No,15.826004,71.692238,7962.20,No,No,No,No,...,Yes,Yes,Yes,No,No,Yes,Mailed check,DSL,Male,Month-to-month
23420,32040.0,No,No,33.852572,86.958880,374.80,Yes,No,Yes,No,...,No,Yes,Yes,Yes,No,Yes,Mailed check,Fiber optic,Male,One year
23421,32041.0,No,No,33.154407,76.120814,238.15,No,No,No,Yes,...,No,Yes,Yes,Yes,Yes,Yes,Mailed check,Fiber optic,Male,Month-to-month


I noticed that in Spark we keep the "id" prefix in front of the customer ID, so I am putting it back to match the original raw CSV.

In [13]:
# customerid is a float at this point (e.g. 7044.0): cast to int to drop the decimal part,
# then rebuild the original 'id<number>' string form
customer_numbers = df['customerid'].astype(int)
df['customerid'] = 'id' + customer_numbers.astype(str)

df

Unnamed: 0,customerid,predicted_churn,seniorcitizen,tenure,monthlycharges,totalcharges,streamingtv,streamingmovies,techsupport,onlinebackup,...,onlinesecurity,multiplelines,phoneservice,paperlessbilling,partner,dependents,paymentmethod,internetservice,gender,contract
0,id7044,No,No,15.208851,45.682782,651.55,No,Yes,Yes,Yes,...,No,No,Yes,No,Yes,No,Credit card (automatic),DSL,Male,Month-to-month
1,id7045,No,No,29.779905,40.301502,1359.70,Yes,Yes,No,Yes,...,No,Yes,No,Yes,No,Yes,Credit card (automatic),DSL,Male,Month-to-month
2,id7046,No,No,34.546357,59.778062,1752.55,No,No,No,No,...,Yes,No,Yes,Yes,Yes,Yes,Mailed check,DSL,Female,Month-to-month
3,id7047,No,No,37.264434,58.392744,541.90,No,Yes,No,No,...,Yes,Yes,Yes,Yes,No,No,Electronic check,DSL,Male,Month-to-month
4,id7049,No,Yes,31.188870,87.322896,845.60,No,No,No,Yes,...,No,Yes,Yes,No,No,No,Bank transfer (automatic),Fiber optic,Female,Two year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23418,id32038,Yes,No,10.304650,91.817322,867.30,Yes,No,Yes,Yes,...,No,Yes,Yes,Yes,No,No,Credit card (automatic),Fiber optic,Male,Month-to-month
23419,id32039,No,No,15.826004,71.692238,7962.20,No,No,No,No,...,Yes,Yes,Yes,No,No,Yes,Mailed check,DSL,Male,Month-to-month
23420,id32040,No,No,33.852572,86.958880,374.80,Yes,No,Yes,No,...,No,Yes,Yes,Yes,No,Yes,Mailed check,Fiber optic,Male,One year
23421,id32041,No,No,33.154407,76.120814,238.15,No,No,No,Yes,...,No,Yes,Yes,Yes,Yes,Yes,Mailed check,Fiber optic,Male,Month-to-month


Export the DataFrame to a CSV file

In [14]:
# Write the final frame to 'predicted_churn.csv' (relative to the working directory);
# index=False leaves the pandas row index out of the file
df.to_csv('predicted_churn.csv', index=False)