In [8]:
import pandas as pd,numpy as np, seaborn as sns, matplotlib.pyplot as plt, joblib
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, confusion_matrix

In [15]:
df = pd.read_csv('../0. Dataset/Holdout set.csv')

In [16]:
df.isna().sum()

CustomerID                       0
Churn                        20000
MonthlyRevenue                  60
MonthlyMinutes                  60
TotalRecurringCharge            60
DirectorAssistedCalls           60
OverageMinutes                  60
RoamingCalls                    60
PercChangeMinutes              135
PercChangeRevenues             135
DroppedCalls                     0
BlockedCalls                     0
UnansweredCalls                  0
CustomerCareCalls                0
ThreewayCalls                    0
ReceivedCalls                    0
OutboundCalls                    0
InboundCalls                     0
PeakCallsInOut                   0
OffPeakCallsInOut                0
DroppedBlockedCalls              0
CallForwardingCalls              0
CallWaitingCalls                 0
MonthsInService                  0
UniqueSubs                       0
ActiveSubs                       0
ServiceArea                      4
Handsets                         0
HandsetModels       

In [17]:
def get_state(code):
    if not isinstance(code, str) or len(code) < 3:
        return np.nan
    metro_code = code[:3] 
    # Disambiguate 'SAN' based on area code
    if metro_code == 'SAN':
        area_code = code[-3:] if len(code) >= 9 else ''
        if area_code in ['512', '210', '830', '254', '361', '956']:  # Texas area codes
            return 'Texas'
        return 'California'  # Default for 'SAN' in other contexts (e.g., Santa Ana)
    return state_map.get(metro_code, np.nan)


state_map = {
        'NYC': 'New York', 'LAX': 'California', 'CHI': 'Illinois', 'HAR': 'Connecticut',
        'DAL': 'Texas', 'SAN': 'Texas', 'MIA': 'Florida', 'MIL': 'Wisconsin', 'NCR': 'North Carolina',
        'PHI': 'Pennsylvania', 'OKC': 'Oklahoma', 'MIN': 'Minnesota', 'OHI': 'Ohio', 'IND': 'Indiana',
        'SFR': 'California', 'LOU': 'Kentucky', 'APC': 'Maryland', 'BOS': 'Massachusetts', 'NOL': 'Louisiana',
        'SEA': 'Washington', 'DET': 'Michigan', 'PHX': 'Arizona', 'NEV': 'Nevada', 'OMA': 'Nebraska',
        'NNY': 'New York', 'STL': 'Missouri', 'FLN': 'Florida', 'HOU': 'Texas', 'PIT': 'Pennsylvania',
        'KCY': 'Kansas', 'ATL': 'Georgia', 'AIR': 'South Carolina', 'NMX': 'New Mexico', 'SHE': 'Virginia',
        'LAU': 'Mississippi', 'SLC': 'Utah', 'HWI': 'Hawaii', 'NSH': 'Tennessee', 'BIR': 'Alabama',
        'AWI': 'Wisconsin', 'DEN': 'Colorado', 'NOR': 'Minnesota', 'GCW': 'Mississippi', 'NVU': 'Nevada',
        'LAW': 'Texas', 'VAH': 'Virginia', 'SDA': 'South Dakota', 'SEW': 'Oregon', 'HOP': 'Montana',
        'SFU': 'California'
    }

df.drop(columns=['CustomerID','Churn'],axis=1,inplace=True)
df['HandsetPrice'] = df['HandsetPrice'].replace('Unknown', np.nan)
df['MaritalStatus'] = df['MaritalStatus'].replace('Unknown', np.nan)

yes_no_features = ['ChildrenInHH','HandsetRefurbished','HandsetWebCapable','TruckOwner','RVOwner','BuysViaMailOrder',
     'RespondsToMailOffers','OptOutMailings','NonUSTravel','OwnsComputer','HasCreditCard','NewCellphoneUser','NotNewCellphoneUser',
     'OwnsMotorcycle','MadeCallToRetentionTeam']

keep_values = {'CreditRating': ['2-High','1-Highest','3-Good'], 'Occupation': ['Professional','Crafts'], 
               'PrizmCode': ['Suburban','Town'],'ServiceArea': ['Texas', 'New York', 'California', 'Florida',
               'Maryland', 'Illinois','Georgia', 'Ohio', 'North Carolina', 'Nevada', 'Pennsylvania','Michigan', 
               'Massachusetts' ,'Washington', 'Kansas'] }

for feature in yes_no_features:
    df[feature] = [1 if value == 'Yes' else 0 for value in df[feature]]
df['Homeownership'] = [1 if value == 'Known' else 0 for value in df['Homeownership']]
df['ServiceArea'] = df['ServiceArea'].apply(lambda code: get_state(code))
for feature, values in keep_values.items():
    df[feature] = [value if value in values else 'Other' for value in df[feature]]
df = pd.get_dummies(df,columns=['CreditRating','PrizmCode','Occupation','ServiceArea'])

df['MaritalStatus'] = [1 if status == 'Yes' else 0 if status =='No' else np.nan for status in df['MaritalStatus']]


In [None]:
imputer = joblib.load('../2. Preprocessing/NullImputer.h5')

In [19]:
imputed = imputer.transform(df)

In [20]:
f = pd.DataFrame(imputed,columns= df.columns)

In [25]:
selected_features = ['MonthlyRevenue',
 'MonthlyMinutes',
 'TotalRecurringCharge',
 'DirectorAssistedCalls',
 'OverageMinutes',
 'RoamingCalls',
 'PercChangeMinutes',
 'PercChangeRevenues',
 'DroppedCalls',
 'UnansweredCalls',
 'CustomerCareCalls',
 'ThreewayCalls',
 'ReceivedCalls',
 'OutboundCalls',
 'InboundCalls',
 'PeakCallsInOut',
 'OffPeakCallsInOut',
 'CallWaitingCalls',
 'MonthsInService',
 'UniqueSubs',
 'ActiveSubs',
 'Handsets',
 'HandsetModels',
 'CurrentEquipmentDays',
 'AgeHH1',
 'AgeHH2',
 'HandsetRefurbished',
 'HandsetWebCapable',
 'Homeownership',
 'BuysViaMailOrder',
 'RespondsToMailOffers',
 'RetentionCalls',
 'RetentionOffersAccepted',
 'NotNewCellphoneUser',
 'IncomeGroup',
 'AdjustmentsToCreditRating',
 'HandsetPrice',
 'MadeCallToRetentionTeam',
 'CreditRating_1-Highest',
 'CreditRating_2-High',
 'CreditRating_3-Good',
 'CreditRating_Other',
 'PrizmCode_Suburban',
 'PrizmCode_Town',
 'ServiceArea_Florida',
 'ServiceArea_Kansas',
 'ServiceArea_Maryland',
 'ServiceArea_Michigan',
 'ServiceArea_Ohio',
 'ServiceArea_Texas',
 'ServiceArea_Washington']

In [None]:
f = f[selected_features]
model = joblib.load('../3. Modeling/Saved Models/DecisionTree.h5')

y_pred = model.predict(f)

f['Churn'] = pd.Series(y_pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f['Churn'] = pd.Series(y_pred)


In [28]:
f.to_csv('Holdout data predicted.csv',index=False)