# 1. Library Import

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

# 2. Data Load

In [25]:
file_path = './database/'

data = pd.read_csv(file_path+'cell2celltrain.csv')

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51047 entries, 0 to 51046
Data columns (total 58 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CustomerID                 51047 non-null  int64  
 1   Churn                      51047 non-null  object 
 2   MonthlyRevenue             50891 non-null  float64
 3   MonthlyMinutes             50891 non-null  float64
 4   TotalRecurringCharge       50891 non-null  float64
 5   DirectorAssistedCalls      50891 non-null  float64
 6   OverageMinutes             50891 non-null  float64
 7   RoamingCalls               50891 non-null  float64
 8   PercChangeMinutes          50680 non-null  float64
 9   PercChangeRevenues         50680 non-null  float64
 10  DroppedCalls               51047 non-null  float64
 11  BlockedCalls               51047 non-null  float64
 12  UnansweredCalls            51047 non-null  float64
 13  CustomerCareCalls          51047 non-null  flo

# 3. Data PreProcessing

## 3-1. Nan Value

In [27]:
data.isna().sum()

CustomerID                     0
Churn                          0
MonthlyRevenue               156
MonthlyMinutes               156
TotalRecurringCharge         156
DirectorAssistedCalls        156
OverageMinutes               156
RoamingCalls                 156
PercChangeMinutes            367
PercChangeRevenues           367
DroppedCalls                   0
BlockedCalls                   0
UnansweredCalls                0
CustomerCareCalls              0
ThreewayCalls                  0
ReceivedCalls                  0
OutboundCalls                  0
InboundCalls                   0
PeakCallsInOut                 0
OffPeakCallsInOut              0
DroppedBlockedCalls            0
CallForwardingCalls            0
CallWaitingCalls               0
MonthsInService                0
UniqueSubs                     0
ActiveSubs                     0
ServiceArea                   24
Handsets                       1
HandsetModels                  1
CurrentEquipmentDays           1
AgeHH1    

In [29]:
data = data.dropna()

## 3-2. Object Feature Encoding

In [30]:
object_features = data.select_dtypes(['object']).columns

object_features

Index(['Churn', 'ServiceArea', 'ChildrenInHH', 'HandsetRefurbished',
       'HandsetWebCapable', 'TruckOwner', 'RVOwner', 'Homeownership',
       'BuysViaMailOrder', 'RespondsToMailOffers', 'OptOutMailings',
       'NonUSTravel', 'OwnsComputer', 'HasCreditCard', 'NewCellphoneUser',
       'NotNewCellphoneUser', 'OwnsMotorcycle', 'HandsetPrice',
       'MadeCallToRetentionTeam', 'CreditRating', 'PrizmCode', 'Occupation',
       'MaritalStatus'],
      dtype='object')

In [31]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

for feature in object_features:
    data[feature] = encoder.fit_transform(data[feature])
    print(f"feature : {feature}")
    print(f"encoded : {encoder.classes_}")

feature : Churn
encoded : ['No' 'Yes']
feature : ServiceArea
encoded : ['AIRAIK803' 'AIRAND864' 'AIRASH828' 'AIRAUG706' 'AIRBEA843' 'AIRCAM803'
 'AIRCHA843' 'AIRCOL803' 'AIRELI252' 'AIRFLO843' 'AIRGAF864' 'AIRGEO843'
 'AIRGOL919' 'AIRGRE864' 'AIRGRN252' 'AIRGWD864' 'AIRHHI843' 'AIRHIC828'
 'AIRJAC910' 'AIRKIN252' 'AIRMAR828' 'AIRMOR828' 'AIRMYR843' 'AIRNEW803'
 'AIRNWB252' 'AIRORA803' 'AIRROA252' 'AIRROC252' 'AIRSAV912' 'AIRSPA864'
 'AIRSUM803' 'AIRWIL910' 'AIRWIN252' 'AIRWYV828' 'APCANN443' 'APCBAL410'
 'APCBEL443' 'APCBET240' 'APCEAS443' 'APCFCH703' 'APCFRD301' 'APCFRE540'
 'APCLEE703' 'APCLXT240' 'APCSAL443' 'APCSIL301' 'APCSVP443' 'APCWAL240'
 'APCWAR540' 'APCWAS202' 'APCWES443' 'ATHHAM423' 'ATHJHC423' 'ATHKIN423'
 'ATHLIM423' 'ATLALB912' 'ATLANE678' 'ATLATH706' 'ATLATL678' 'ATLATN423'
 'ATLBRU912' 'ATLCHA423' 'ATLCHN706' 'ATLCOL706' 'ATLDAL334' 'ATLDBL478'
 'ATLDOT334' 'ATLDTN706' 'ATLJCK901' 'ATLKNO423' 'ATLLAG706' 'ATLMAC912'
 'ATLMDV478' 'ATLMEM901' 'ATLNOR678' 'ATLOPE334' 'ATL

## 3-3. Data Split

In [32]:
from sklearn.model_selection import train_test_split

X = data.drop(columns=['ServiceArea', 'Churn'])
y = data['Churn']

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    stratify=y,
    random_state=42
)

print(f"X_train shape : {X_train.shape}")
print(f"y_train shape : {y_train.shape}")
print(f"X_val shape : {X_val.shape}")
print(f"y_val : {y_val.shape}")

X_train shape : (37314, 56)
y_train shape : (37314,)
X_val shape : (12438, 56)
y_val : (12438,)


# 4. Define Model, Train

In [33]:
from sklearn.ensemble import RandomForestClassifier

baseline = RandomForestClassifier()

baseline.fit(X_train,y_train)

# 5. Validation Model

In [34]:
from sklearn.metrics import classification_report

y_pred = baseline.predict(X_val)

print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.73      0.98      0.83      8877
           1       0.59      0.08      0.14      3561

    accuracy                           0.72     12438
   macro avg       0.66      0.53      0.49     12438
weighted avg       0.69      0.72      0.63     12438

