In [1]:
import numpy as np
import pandas as pd
import math
import scipy

# Collect data

### demographics

In [2]:
#user= dempgraphics?
users=pd.read_csv('demographics.csv')
users.columns=['Customer ID','Count','Gender','Age','Under_30','Senior_Citizen','Married','Dependents','Number_of_Dependents']
# do we still need Under 30/ Senior Citizen, or we need to group the age ?
users.drop(['Count'],axis=1,inplace=True)
users.drop(['Under_30','Senior_Citizen','Dependents'],axis=1,inplace=True)
users

Unnamed: 0,Customer ID,Gender,Age,Married,Number_of_Dependents
0,1746-TGTWV,Male,78.0,No,0.0
1,8499-BRXTD,Female,74.0,Yes,1.0
2,2041-JIJCI,Male,71.0,No,3.0
3,5832-EXGTT,Female,78.0,Yes,1.0
4,8350-NYMVI,,80.0,,1.0
...,...,...,...,...,...
6158,7609-NRNCA,Female,,No,0.0
6159,4835-YSJMR,Male,38.0,,2.0
6160,5067-DGXLL,,30.0,Yes,2.0
6161,3129-AAQOU,Female,32.0,Yes,2.0


### location

In [3]:
location=pd.read_csv('location.csv')
location.columns=['Customer ID','Count','Country','State','City','Zip Code','Lat Long','Latitude','Longtitude']
# Country, State are the same, latitude/longtitude's information is in Zip code
location.drop(['Count','Country','State','Lat Long','Latitude','Longtitude'],axis=1,inplace=True)
location.drop(['City'],axis=1,inplace=True)

### population

In [4]:
population=pd.read_csv('population.csv')
population.columns=['ID','Zip Code','Population']
population.drop(['ID'],axis=1,inplace=True)
population

Unnamed: 0,Zip Code,Population
0,90001,54492
1,90002,44586
2,90003,58198
3,90004,67852
4,90005,43019
...,...,...
1666,96145,4002
1667,96146,942
1668,96148,678
1669,96150,33038


### merge location and population ,and join users

In [5]:
location=pd.merge(location,population,on='Zip Code')
users=pd.merge(users,location,on='Customer ID',how='outer')
users

Unnamed: 0,Customer ID,Gender,Age,Married,Number_of_Dependents,Zip Code,Population
0,1746-TGTWV,Male,78.0,No,0.0,90022.0,68701.0
1,8499-BRXTD,Female,74.0,Yes,1.0,,
2,2041-JIJCI,Male,71.0,No,3.0,,
3,5832-EXGTT,Female,78.0,Yes,1.0,90303.0,27778.0
4,8350-NYMVI,,80.0,,1.0,90602.0,26265.0
...,...,...,...,...,...,...,...
6853,6839-ITVZJ,,,,,91934.0,699.0
6854,6821-JPCDC,,,,,93226.0,296.0
6855,5799-JRCZO,,,,,93311.0,20440.0
6856,8741-LQOBK,,,,,96057.0,1586.0


satisfaction

In [6]:
satisfaction=pd.read_csv('satisfaction.csv')
satisfaction.columns=['Customer ID','score']
users=pd.merge(users,satisfaction,on='Customer ID',how='outer')

services

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
services=pd.read_csv('services.csv')
services.columns=['Customer ID','Count','Quarter','Referred_a_friend','Number of Referrals','Tenure in Months','Offer','Phone_Service','Avg Monthly Long Distance Charges','Multiple_Lines','Internet_Service','Internet_Type','Avg Monthly GB Download','Online_Security','Online_Backup','Device_Protection_Plan','Premium_Tech_Support','Streaming_TV','Streaming_Movies','Streaming_Music','Unlimited_Data','Contract','Paperless_Billing','Payment_Method','Monthly Charge','Total Charges','Total Refunds','Total Extra Data Charges','Total Long Distance Charges','Total Revenue']
# Quarter are always Q3
services.drop(['Count','Quarter'],axis=1,inplace=True)
users=pd.merge(users,services,on='Customer ID',how='outer')
#columns_to_encode = [1,3,4,5,6,8,12,13,14,15,16,17,18,20,21,22,23,34,35,36,27,28,29]
#ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), columns_to_encode)], remainder='passthrough')

# give null a category?
#users.loc[users.Married.isnull(),'Married']='None'
users.loc[users.Married.isnull(),'Married']='nan'
users = pd.concat((users,pd.get_dummies(users.Married,prefix='Married')),1)
users.loc[users.Gender.isnull(),'Gender']='nan'
users = pd.concat((users,pd.get_dummies(users.Gender,prefix='Gender')),1)
#users = pd.concat((users,pd.get_dummies(users.Under_30,prefix='Under_30')),1)
#users = pd.concat((users,pd.get_dummies(users.Senior_Citizen,prefix='Senior_Citizen')),1)
#users = pd.concat((users,pd.get_dummies(users.Dependents,prefix='Dependents')),1)
#users = pd.concat((users,pd.get_dummies(users.City,prefix='City')),1)
users.loc[users.Referred_a_friend.isnull(),'Referred_a_friend']='nan'
users = pd.concat((users,pd.get_dummies(users.Referred_a_friend,prefix='Referred_a_friend')),1)
users.loc[users.Offer.isnull(),'Offer']='nan'
users = pd.concat((users,pd.get_dummies(users.Offer,prefix='Offer')),1)
users.loc[users.Phone_Service.isnull(),'Phone_Service']='nan'
users = pd.concat((users,pd.get_dummies(users.Phone_Service,prefix='Phone_Service')),1)
users.loc[users.Multiple_Lines.isnull(),'Multiple_Lines']='nan'
users = pd.concat((users,pd.get_dummies(users.Multiple_Lines,prefix='Multiple_Lines')),1)
users.loc[users.Internet_Service.isnull(),'Internet_Service']='nan'
users = pd.concat((users,pd.get_dummies(users.Internet_Service,prefix='Internet_Service')),1)
users.loc[users.Internet_Type.isnull(),'Internet_Type']='nan'
users = pd.concat((users,pd.get_dummies(users.Internet_Type,prefix='Internet_Type')),1)
users.loc[users.Online_Security.isnull(),'Online_Security']='nan'
users = pd.concat((users,pd.get_dummies(users.Online_Security,prefix='Online_Security')),1)
users.loc[users.Online_Backup.isnull(),'Online_Backup']='nan'
users = pd.concat((users,pd.get_dummies(users.Online_Backup,prefix='Online_Backup')),1)
users.loc[users.Device_Protection_Plan.isnull(),'Device_Protection_Plan']='nan'
users = pd.concat((users,pd.get_dummies(users.Device_Protection_Plan,prefix='Device_Protection_Plan')),1)
users.loc[users.Premium_Tech_Support.isnull(),'Premium_Tech_Support']='nan'
users = pd.concat((users,pd.get_dummies(users.Premium_Tech_Support,prefix='Premium_Tech_Support')),1)
users.loc[users.Streaming_TV.isnull(),'Streaming_TV']='nan'
users = pd.concat((users,pd.get_dummies(users.Streaming_TV,prefix='Streaming_TV')),1)
users.loc[users.Streaming_Movies.isnull(),'Streaming_Movies']='nan'
users = pd.concat((users,pd.get_dummies(users.Streaming_Movies,prefix='Streaming_Movies')),1)
users.loc[users.Streaming_Music.isnull(),'Streaming_Music']='nan'
users = pd.concat((users,pd.get_dummies(users.Streaming_Music,prefix='Streaming_Music')),1)
users.loc[users.Unlimited_Data.isnull(),'Unlimited_Data']='nan'
users = pd.concat((users,pd.get_dummies(users.Unlimited_Data,prefix='Unlimited_Data')),1)
users.loc[users.Contract.isnull(),'Contract']='nan'
users = pd.concat((users,pd.get_dummies(users.Contract,prefix='Contract')),1)
users.loc[users.Paperless_Billing.isnull(),'Paperless_Billing']='nan'
users = pd.concat((users,pd.get_dummies(users.Paperless_Billing,prefix='Paperless_Billing')),1)
users.loc[users.Payment_Method.isnull(),'Payment_Method']='nan'
users = pd.concat((users,pd.get_dummies(users.Payment_Method,prefix='Payment_Method')),1)

users.drop(['Married','Gender','Phone_Service','Multiple_Lines','Internet_Service','Referred_a_friend','Offer','Internet_Type','Online_Security','Online_Backup','Device_Protection_Plan','Premium_Tech_Support','Streaming_TV','Streaming_Movies','Streaming_Music','Unlimited_Data','Contract','Paperless_Billing','Payment_Method'],axis=1,inplace=True)
'''
df["Married"]=pd.util.hash_array(df["Married"].to_numpy())
df["Gender"]=pd.util.hash_array(df["Gender"].to_numpy())
df["Under 30"]=pd.util.hash_array(df["Under 30"].to_numpy())
df["Senior Citizen"]=pd.util.hash_array(df["Senior Citizen"].to_numpy())
df["Dependents"]=pd.util.hash_array(df["Dependents"].to_numpy())
df["City"]=pd.util.hash_array(df["City"].to_numpy())
#df["Referred a friend"]=pd.util.hash_array(df["Referred a friend"].to_numpy())
df["Offer"]=pd.util.hash_array(df["Offer"].to_numpy())
df["Phone Service"]=pd.util.hash_array(df["Phone Service"].to_numpy())
df["Multiple Lines"]=pd.util.hash_array(df["Multiple Lines"].to_numpy())
df["Internet Service"]=pd.util.hash_array(df["Internet Service"].to_numpy())
df["Internet Type"]=pd.util.hash_array(df["Internet Type"].to_numpy())
df["Online Security"]=pd.util.hash_array(df["Online Security"].to_numpy())
df["Online Backup"]=pd.util.hash_array(df["Online Backup"].to_numpy())
df["Device Protection Plan"]=pd.util.hash_array(df["Device Protection Plan"].to_numpy())
df["Premium Tech Support"]=pd.util.hash_array(df["Premium Tech Support"].to_numpy())
df["Streaming TV"]=pd.util.hash_array(df["Streaming TV"].to_numpy())
df["Streaming Movies"]=pd.util.hash_array(df["Streaming Movies"].to_numpy())
df["Streaming Music"]=pd.util.hash_array(df["Streaming Music"].to_numpy())
df["Unlimited Data"]=pd.util.hash_array(df["Unlimited Data"].to_numpy())
df["Contract"]=pd.util.hash_array(df["Contract"].to_numpy())
df["Paperless Billing"]=pd.util.hash_array(df["Paperless Billing"].to_numpy())
df["Payment Method"]=pd.util.hash_array(df["Payment Method"].to_numpy())
'''
users

  users = pd.concat((users,pd.get_dummies(users.Married,prefix='Married')),1)
  users = pd.concat((users,pd.get_dummies(users.Gender,prefix='Gender')),1)
  users = pd.concat((users,pd.get_dummies(users.Referred_a_friend,prefix='Referred_a_friend')),1)
  users = pd.concat((users,pd.get_dummies(users.Offer,prefix='Offer')),1)
  users = pd.concat((users,pd.get_dummies(users.Phone_Service,prefix='Phone_Service')),1)
  users = pd.concat((users,pd.get_dummies(users.Multiple_Lines,prefix='Multiple_Lines')),1)
  users = pd.concat((users,pd.get_dummies(users.Internet_Service,prefix='Internet_Service')),1)
  users = pd.concat((users,pd.get_dummies(users.Internet_Type,prefix='Internet_Type')),1)
  users = pd.concat((users,pd.get_dummies(users.Online_Security,prefix='Online_Security')),1)
  users = pd.concat((users,pd.get_dummies(users.Online_Backup,prefix='Online_Backup')),1)
  users = pd.concat((users,pd.get_dummies(users.Device_Protection_Plan,prefix='Device_Protection_Plan')),1)
  users = pd.c

Unnamed: 0,Customer ID,Age,Number_of_Dependents,Zip Code,Population,score,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Avg Monthly GB Download,...,Contract_One Year,Contract_Two Year,Contract_nan,Paperless_Billing_No,Paperless_Billing_Yes,Paperless_Billing_nan,Payment_Method_Bank Withdrawal,Payment_Method_Credit Card,Payment_Method_Mailed Check,Payment_Method_nan
0,1746-TGTWV,78.0,0.0,90022.0,68701.0,,0.0,1.0,0.00,8.0,...,0,0,0,0,1,0,1,0,0,0
1,8499-BRXTD,74.0,1.0,,,3.0,1.0,8.0,48.85,17.0,...,0,0,0,0,1,0,0,0,0,1
2,2041-JIJCI,71.0,3.0,,,,,,,,...,0,0,1,0,0,1,0,0,0,1
3,5832-EXGTT,78.0,1.0,90303.0,27778.0,2.0,1.0,25.0,19.76,12.0,...,0,0,0,0,1,0,1,0,0,0
4,8350-NYMVI,80.0,1.0,90602.0,26265.0,2.0,1.0,37.0,6.33,,...,0,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7037,0516-WJVXC,,,,,,2.0,1.0,34.28,0.0,...,0,0,0,1,0,0,0,1,0,0
7038,0388-EOPEX,,,,,,0.0,35.0,35.50,0.0,...,1,0,0,1,0,0,0,1,0,0
7039,7055-JCGNI,,,,,,0.0,,27.75,30.0,...,0,1,0,1,0,0,0,1,0,0
7040,6891-JPYFF,,,,,,0.0,1.0,38.05,,...,0,0,0,0,1,0,0,1,0,0


status

In [8]:
status=pd.read_csv('status.csv')
status.columns=['Customer ID','Churn Category']
status['ans'] = status['Churn Category']
status.loc[status.ans=='No Churn','ans']='0'
status.loc[status.ans=='Competitor','ans']='1'
status.loc[status.ans=='Dissatisfaction','ans']='2'
status.loc[status.ans=='Attitude','ans']='3'
status.loc[status.ans=='Price','ans']='4'
status.loc[status.ans=='Other','ans']='5'
status.drop(['Churn Category'],axis=1,inplace=True)
status.rename(columns={'ans':'Churn Category'}, inplace=True)
#final train data
train=pd.merge(status,users,on='Customer ID',how='left')

# Train Model

In [10]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC 
from sklearn.preprocessing import StandardScaler
from libsvm.svmutil import *

In [11]:
features=list(train)
#print(features)
features.remove('Customer ID')
features.remove('Churn Category')
#features.remove('Referred_a_Friend')
#features.remove('Offer')
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer = imputer.fit(train.loc[:, features])
train_imputed = imputer.fit_transform(train.loc[:, features])
df=pd.DataFrame(train_imputed)
poly = PolynomialFeatures(degree=2, interaction_only=False)
train_imputed = pd.DataFrame(poly.fit_transform(df))
scalar=StandardScaler()
train_imputed=scalar.fit_transform(train_imputed)
train_imputed = pd.DataFrame(train_imputed)
train_imputed


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3311,3312,3313,3314,3315,3316,3317,3318,3319,3320
0,0.0,-0.651541,-0.411472,0.025953,-0.191061,-2.105970,-0.526417,0.156651,-0.022530,-0.184340,...,1.177952,0.0,0.0,0.0,-0.657680,0.0,0.0,-0.208463,0.0,-0.557854
1,0.0,0.502076,-0.411472,0.053072,-0.350914,-2.105970,-0.526417,-1.418566,-1.705056,0.546027,...,-0.848931,0.0,0.0,0.0,-0.657680,0.0,0.0,-0.208463,0.0,1.792585
2,0.0,-0.040803,-0.411472,-0.508422,0.570800,-0.190587,0.961885,0.156651,1.347453,-0.184340,...,1.177952,0.0,0.0,0.0,-0.657680,0.0,0.0,-0.208463,0.0,-0.557854
3,0.0,-1.601578,-0.411472,1.020123,0.850185,-0.190587,-0.526417,-1.140587,-0.099177,2.905674,...,-0.848931,0.0,0.0,0.0,-0.657680,0.0,0.0,-0.208463,0.0,1.792585
4,0.0,-0.380102,-0.411472,1.633390,-1.125985,-0.190587,2.450186,1.685539,-0.431069,-1.083253,...,-0.848931,0.0,0.0,0.0,1.520497,0.0,0.0,-0.208463,0.0,-0.557854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4221,0.0,-1.601578,-0.411472,0.025953,-0.191061,0.767105,-0.526417,-1.140587,-1.705056,1.613487,...,-0.848931,0.0,0.0,0.0,1.520497,0.0,0.0,-0.208463,0.0,-0.557854
4222,0.0,-0.040803,-0.411472,0.278656,-0.664359,-0.190587,-0.526417,-0.260318,1.695716,-0.184340,...,-0.848931,0.0,0.0,0.0,1.520497,0.0,0.0,-0.208463,0.0,-0.557854
4223,0.0,0.909235,1.941546,0.025953,-0.191061,0.767105,-0.526417,-0.491968,-0.248752,0.546027,...,-0.848931,0.0,0.0,0.0,1.520497,0.0,0.0,-0.208463,0.0,-0.557854
4224,0.0,-0.040803,3.118056,1.017658,1.307248,-0.190587,-0.526417,-0.723617,0.321268,-0.184340,...,-0.848931,0.0,0.0,0.0,1.520497,0.0,0.0,-0.208463,0.0,-0.557854


In [12]:
# 10% vaildation 
x_test,x_train,y_test,y_train = train_test_split(train_imputed,train.loc[:, 'Churn Category'],test_size=0.90, random_state=0)

### SVM_linear 

In [13]:
for g in range(1):
    for c in range(1,2):
        svm_model_linear = SVC(kernel = 'rbf',gamma=math.pow(10,g), C = math.pow(10,c)).fit(x_train, y_train)
        svm_predictions = svm_model_linear.predict(x_test)
        accuracy = svm_model_linear.score(x_test, y_test)
        print("sklearn_svm",g,c,accuracy)

In [128]:
svm_model_linear = SVC(kernel = 'rbf',gamma=math.pow(10,0), C = math.pow(10,1)).fit(train_imputed, train.loc[:,'Churn Category'])

In [13]:

y_train_num=y_train.to_numpy()
y_train_num=y_train_num.astype(np.int)
x_train_num=x_train.to_numpy()
prob=svm_problem(y_train_num,x_train_num)
param = svm_parameter('-t 2 -c 10 -q')
libsvm_train=svm_train(prob,param)
y_test_num=y_test.to_numpy()
y_test_num=y_test_num.astype(np.int)
x_test_num=x_test.to_numpy()
p_label, p_acc, p_val=svm_predict(y_train_num,x_train_num,libsvm_train)

Accuracy = 97.6604% (3715/3804) (classification)


In [14]:
y_test_num=y_test_num.astype(np.int)
p_label, p_acc, p_val=svm_predict(y_test_num,x_test_num,libsvm_train)

y_train_all=train.loc[:,'Churn Category'].to_numpy()
y_train_all=y_train_all.astype(np.int)
x_train_all=train_imputed.to_numpy()
prob_all=svm_problem(y_train_all,x_train_all)
libsvm_train=svm_train(prob_all,param)

Accuracy = 77.4882% (327/422) (classification)


### decision tree

In [15]:
from sklearn.tree import DecisionTreeClassifier
dtree_model = DecisionTreeClassifier(max_depth = 1).fit(x_train, y_train)
dtree_predictions = dtree_model.predict(x_test)
accuracy = dtree_model.score(x_test, y_test)
print("dtree",accuracy)
dtree_model = DecisionTreeClassifier(max_depth = 1).fit(train_imputed,train.loc[:,'Churn Category'])

dtree 0.8033175355450237


### knn 

In [132]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 10).fit(x_train, y_train)
accuracy = knn.score(x_test, y_test)
print("knn",accuracy)


knn 0.7440758293838863


### RandomForest + ada boost

In [29]:
from sklearn.ensemble import RandomForestClassifier
for tree in range(10,11):
    for d in range(10,11):
        rf = RandomForestClassifier(n_estimators = tree*1000, max_depth=d,oob_score=True)
        rf.fit(train_imputed,train.loc[:, 'Churn Category'])
        print(f"tree:{tree*1000},d:{d},acc:{rf.oob_score_}")


In [27]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),n_estimators = 20,learning_rate=0.2)
ada.fit(x_train, y_train)
print("ada")
ada.score(x_test, y_test)
ada.fit(train_imputed,train.loc[:, 'Churn Category'])

ada


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                   learning_rate=0.2, n_estimators=20)

### Prediction

### handle test

In [18]:
testID=pd.read_csv('Test_IDs.csv')
testID.columns=['Customer ID']
dftest=pd.DataFrame(testID)
test=pd.merge(dftest,users,on='Customer ID',how='left')
#print(test)
test_imputed = imputer.fit_transform(test.loc[:, features])
df=pd.DataFrame(test_imputed)
poly = PolynomialFeatures(degree=2, interaction_only=False)
test_imputed = pd.DataFrame(poly.fit_transform(df))
scalar=StandardScaler()
test_imputed=scalar.fit_transform(test_imputed)
test_imputed = pd.DataFrame(test_imputed)
#print(train_imputed.shape)

### SVM prediction

In [19]:
#dftest['Churn Category']=svm_model_linear.predict(test_imputed)
total_rows=test_imputed.shape[0]
test_imputed=test_imputed.to_numpy()
fake_y=np.zeros(total_rows)
p_label, p_acc, p_val=svm_predict(fake_y,test_imputed,libsvm_train)
p_label = list(map(int, p_label))
dftest['Churn Category']=p_label


Accuracy = 81.2633% (1145/1409) (classification)


### not svm

In [None]:
#dftest['Churn Category']=dtree_model.predict(test_imputed)

#dftest['Churn Category']=knn.predict(test_imputed)

dftest['Churn Category']=rf.predict(test_imputed)

#dftest['Churn Category']=ada.predict(test_imputed)

### Output result

In [None]:
dftest.columns=['Customer ID','Churn Category']
submiss=pd.DataFrame(dftest)
submiss.to_csv('submission.csv',index=False)