# Predict the churn risk rate (HackerEarth)

# Task
Is to predict the churn score for a website based on the features provided in the dataset.

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Importing dataset and reading dataset
df = pd.read_csv('train.csv')
df.head(10)

Unnamed: 0,customer_id,Name,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,...,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
0,fffe4300490044003600300030003800,Pattie Morrisey,18,F,XW0DQ7H,Village,Platinum Membership,2017-08-17,No,xxxxxxxx,...,300.63,53005.25,17.0,781.75,Yes,Yes,No,Not Applicable,Products always in Stock,2
1,fffe43004900440032003100300035003700,Traci Peery,32,F,5K0N3X1,City,Premium Membership,2017-08-28,?,CID21329,...,306.34,12838.38,10.0,,Yes,No,Yes,Solved,Quality Customer Care,1
2,fffe4300490044003100390032003600,Merideth Mcmeen,44,F,1F2TCL3,Town,No Membership,2016-11-11,Yes,CID12313,...,516.16,21027.0,22.0,500.69,No,Yes,Yes,Solved in Follow-up,Poor Website,5
3,fffe43004900440036003000330031003600,Eufemia Cardwell,37,M,VJGJ33N,City,No Membership,2016-10-29,Yes,CID3793,...,53.27,25239.56,6.0,567.66,No,Yes,Yes,Unsolved,Poor Website,5
4,fffe43004900440031003900350030003600,Meghan Kosak,31,F,SVZXCWB,City,No Membership,2017-09-12,No,xxxxxxxx,...,113.13,24483.66,16.0,663.06,No,Yes,Yes,Solved,Poor Website,5
5,fffe43004900440036003300320035003300,Leslie Browder,13,M,PSG1LGF,City,Gold Membership,2016-01-08,No,xxxxxxxx,...,433.62,13884.77,24.0,722.27,Yes,No,Yes,Unsolved,No reason specified,3
6,fffe43004900440031003100360037003900,Bridget Balog,21,M,R3CX1EA,Town,Gold Membership,2015-03-19,Yes,CID24708,...,55.38,8982.5,28.0,756.21,Yes,No,Yes,Solved in Follow-up,No reason specified,3
7,fffe4300490044003800300035003800,Herma Torgeson,42,M,4UJ1551,,No Membership,2016-07-12,?,CID56614,...,429.11,44554.82,24.0,568.08,No,Yes,Yes,Unsolved,Poor Product Quality,5
8,fffe43004900440033003300330032003200,Pattie Helmers,44,M,0481QNQ,Village,Silver Membership,2016-12-14,No,xxxxxxxx,...,191.07,18362.31,20.0,,Yes,No,Yes,Solved in Follow-up,Poor Customer Service,3
9,fffe43004900440032003000340038003300,Shaquana Leech,45,F,ZHP4MCR,Town,No Membership,2016-11-30,No,xxxxxxxx,...,97.31,19244.16,28.0,706.23,No,Yes,Yes,No Information Available,Poor Customer Service,4


In [354]:
# Shape of the dataset
df.shape

(36992, 25)

In [50]:
# Columns / Features names
df.columns

Index(['customer_id', 'Name', 'age', 'gender', 'security_no',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'referral_id', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score'],
      dtype='object')

# Data Preprocessing / EDA

In [355]:
# Removing In-significant features
df1 = df.drop(['security_no','Name','referral_id','joining_date','last_visit_time','customer_id'], axis=1)
df1.head(2)

Unnamed: 0,age,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
0,18,F,Village,Platinum Membership,No,Gift Vouchers/Coupons,?,Wi-Fi,17,300.63,53005.25,17.0,781.75,Yes,Yes,No,Not Applicable,Products always in Stock,2
1,32,F,City,Premium Membership,?,Gift Vouchers/Coupons,Desktop,Mobile_Data,16,306.34,12838.38,10.0,,Yes,No,Yes,Solved,Quality Customer Care,1


In [350]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36992 entries, 0 to 36991
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           36992 non-null  int64  
 1   gender                        36933 non-null  object 
 2   region_category               31564 non-null  object 
 3   membership_category           36992 non-null  object 
 4   joined_through_referral       31554 non-null  object 
 5   preferred_offer_types         36704 non-null  object 
 6   medium_of_operation           31599 non-null  object 
 7   internet_option               36992 non-null  object 
 8   days_since_last_login         36992 non-null  int64  
 9   avg_time_spent                36992 non-null  float64
 10  avg_transaction_value         36992 non-null  float64
 11  avg_frequency_login_days      33470 non-null  float64
 12  points_in_wallet              33549 non-null  float64
 13  u

In [356]:
# Check any unknown values or special symbols for categorical features in dataset
# Replace 'Unknown' with NaN values
df1['gender'].value_counts()
df1['gender'] = df1['gender'].replace('Unknown', np.nan)

F          18490
M          18443
Unknown       59
Name: gender, dtype: int64

In [357]:
# In this column no unknown or special symbols are found
df1['region_category'].value_counts()

Town       14128
City       12737
Village     4699
Name: region_category, dtype: int64

In [361]:
# Check any unknown values or special symbols for categorical features in dataset
df1['membership_category'].value_counts()

Basic Membership       7724
No Membership          7692
Gold Membership        6795
Silver Membership      5988
Premium Membership     4455
Platinum Membership    4338
Name: membership_category, dtype: int64

In [362]:
# Check any unknown values or special symbols for categorical features in dataset
df1['joined_through_referral'].value_counts()

No     15839
Yes    15715
?       5438
Name: joined_through_referral, dtype: int64

In [359]:
# Check any unknown values or special symbols for categorical features in dataset
df1['preferred_offer_types'].value_counts()

Gift Vouchers/Coupons       12349
Credit/Debit Card Offers    12274
Without Offers              12081
Name: preferred_offer_types, dtype: int64

In [360]:
# Check any unknown values or special symbols for categorical features in dataset
# Replace 'Unknown' with NaN values
df1['medium_of_operation'].value_counts()
df1['medium_of_operation'] = df1['medium_of_operation'].replace('?', np.nan)

Desktop       13913
Smartphone    13876
?              5393
Both           3810
Name: medium_of_operation, dtype: int64

In [363]:
# Check any unknown values or special symbols for categorical features in dataset
df1['internet_option'].value_counts()

Wi-Fi          12413
Mobile_Data    12343
Fiber_Optic    12236
Name: internet_option, dtype: int64

In [364]:
# Check any unknown values or special symbols for categorical features in dataset
df1['used_special_discount'].value_counts()

Yes    20342
No     16650
Name: used_special_discount, dtype: int64

In [365]:
# Check any unknown values or special symbols for categorical features in dataset
df1['past_complaint'].value_counts()

No     18602
Yes    18390
Name: past_complaint, dtype: int64

In [366]:
# Check any unknown values or special symbols for categorical features in dataset
df1['complaint_status'].value_counts()

Not Applicable              18602
Unsolved                     4644
Solved                       4619
Solved in Follow-up          4577
No Information Available     4550
Name: complaint_status, dtype: int64

In [367]:
# Check any unknown values or special symbols for categorical features in dataset
df1['feedback'].value_counts()

Poor Product Quality        6350
No reason specified         6290
Too many ads                6279
Poor Website                6271
Poor Customer Service       6252
Reasonable Price            1417
User Friendly Website       1391
Products always in Stock    1382
Quality Customer Care       1360
Name: feedback, dtype: int64

In [368]:
df1.columns

Index(['age', 'gender', 'region_category', 'membership_category',
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'days_since_last_login',
       'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days',
       'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score'],
      dtype='object')

In [370]:
# Check any unknown values or special symbols for continuous features in dataset
# Replace 'Unknown' with NaN values
df1['avg_frequency_login_days'].value_counts()
df1['avg_frequency_login_days'] = df1['avg_frequency_login_days'].replace('Error', np.nan)

Error                  3522
13.0                   1394
19.0                   1365
8.0                    1361
14.0                   1355
                       ... 
49.62913718430662         1
-8.388346730342558        1
-5.326618309040633        1
50.71743119927807         1
-0.7861176834842816       1
Name: avg_frequency_login_days, Length: 1654, dtype: int64

In [371]:
# Changing the datatype to 'float64'
df1['avg_frequency_login_days'] = df1['avg_frequency_login_days'].astype('float64')

In [373]:
df1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,36992.0,37.118161,15.867412,10.0,23.0,37.0,51.0,64.0
days_since_last_login,36992.0,-41.915576,228.8199,-999.0,8.0,12.0,16.0,26.0
avg_time_spent,36992.0,243.472334,398.289149,-2814.10911,60.1025,161.765,356.515,3235.578521
avg_transaction_value,36992.0,29271.194003,19444.806226,800.46,14177.54,27554.485,40855.11,99914.05
points_in_wallet,33549.0,686.882199,194.063624,-760.661236,616.15,697.62,763.95,2069.069761
churn_risk_score,36992.0,3.463397,1.409661,-1.0,3.0,4.0,5.0,5.0


In [374]:
# checking for NaN or Null values in the dataest
df1.isnull().sum() / df.shape[0] * 100

age                              0.000000
gender                           0.000000
region_category                 14.673443
membership_category              0.000000
joined_through_referral          0.000000
preferred_offer_types            0.778547
medium_of_operation              0.000000
internet_option                  0.000000
days_since_last_login            0.000000
avg_time_spent                   0.000000
avg_transaction_value            0.000000
avg_frequency_login_days         0.000000
points_in_wallet                 9.307418
used_special_discount            0.000000
offer_application_preference     0.000000
past_complaint                   0.000000
complaint_status                 0.000000
feedback                         0.000000
churn_risk_score                 0.000000
dtype: float64

In [94]:
# Shape of dataset
df1.shape

(36992, 19)

In [101]:
# Converting Categorical columns into Nominal using get_dummies()
df2 = pd.get_dummies(df1)
df2.shape

(36992, 49)

In [375]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36992 entries, 0 to 36991
Data columns (total 49 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   age                                             36992 non-null  int64  
 1   days_since_last_login                           36992 non-null  int64  
 2   avg_time_spent                                  36992 non-null  float64
 3   avg_transaction_value                           36992 non-null  float64
 4   avg_frequency_login_days                        33470 non-null  float64
 5   points_in_wallet                                33549 non-null  float64
 6   churn_risk_score                                36992 non-null  int64  
 7   gender_F                                        36992 non-null  uint8  
 8   gender_M                                        36992 non-null  uint8  
 9   region_category_City                   

In [233]:
# Check classes in the Dependent variable
df2['churn_risk_score'].value_counts()

 3    10424
 4    10185
 5     9827
 2     2741
 1     2652
-1     1163
Name: churn_risk_score, dtype: int64

In [376]:
# All NaN or Null values are Imputed using Iterative Imputer by taking LinearRegression as base estimator
from sklearn.experimental import enable_iterative_imputer
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

it = IterativeImputer(estimator=LinearRegression())
newdata = pd.DataFrame(it.fit_transform(df2))
newdata.columns = df2.columns
newdata.head()

Unnamed: 0,age,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,churn_risk_score,gender_F,gender_M,region_category_City,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,18.0,17.0,300.63,53005.25,17.0,781.75,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,32.0,16.0,306.34,12838.38,10.0,792.958616,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,44.0,14.0,516.16,21027.0,22.0,500.69,5.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,37.0,11.0,53.27,25239.56,6.0,567.66,5.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,31.0,20.0,113.13,24483.66,16.0,663.06,5.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [182]:
# Check null values count
newdata.isnull().sum().sum()

0

In [108]:
# Camparing statistical summary for dataset before and after Iterative Imputation
newdata.describe()

Unnamed: 0,age,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,churn_risk_score,gender_F,gender_M,region_category_City,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
count,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,...,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0
mean,37.118161,-41.915576,243.472334,29271.194003,15.97525,686.817835,3.463397,0.499838,0.498567,0.344318,...,0.125541,0.170037,0.16901,0.171659,0.169523,0.037359,0.036765,0.038306,0.169739,0.037603
std,15.867412,228.8199,398.289149,19444.806226,8.800295,185.754718,1.409661,0.500007,0.500005,0.475152,...,0.331335,0.37567,0.374765,0.377089,0.375218,0.189644,0.188186,0.191936,0.375409,0.190236
min,10.0,-999.0,-2814.10911,800.46,-43.652702,-760.661236,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,8.0,60.1025,14177.54,10.0,617.385,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37.0,12.0,161.765,27554.485,16.581608,698.855,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,51.0,16.0,356.515,40855.11,22.0,760.4725,5.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,64.0,26.0,3235.578521,99914.05,73.061995,2069.069761,5.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [109]:
df2.describe()

Unnamed: 0,age,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,churn_risk_score,gender_F,gender_M,region_category_City,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
count,36992.0,36992.0,36992.0,36992.0,33470.0,33549.0,36992.0,36992.0,36992.0,36992.0,...,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0,36992.0
mean,37.118161,-41.915576,243.472334,29271.194003,15.976715,686.882199,3.463397,0.499838,0.498567,0.344318,...,0.125541,0.170037,0.16901,0.171659,0.169523,0.037359,0.036765,0.038306,0.169739,0.037603
std,15.867412,228.8199,398.289149,19444.806226,9.215858,194.063624,1.409661,0.500007,0.500005,0.475152,...,0.331335,0.37567,0.374765,0.377089,0.375218,0.189644,0.188186,0.191936,0.375409,0.190236
min,10.0,-999.0,-2814.10911,800.46,-43.652702,-760.661236,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,8.0,60.1025,14177.54,9.0,616.15,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,37.0,12.0,161.765,27554.485,16.0,697.62,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,51.0,16.0,356.515,40855.11,23.0,763.95,5.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,64.0,26.0,3235.578521,99914.05,73.061995,2069.069761,5.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [110]:
newdata.columns

Index(['age', 'days_since_last_login', 'avg_time_spent',
       'avg_transaction_value', 'avg_frequency_login_days', 'points_in_wallet',
       'churn_risk_score', 'gender_F', 'gender_M', 'region_category_City',
       'region_category_Town', 'region_category_Village',
       'membership_category_Basic Membership',
       'membership_category_Gold Membership',
       'membership_category_No Membership',
       'membership_category_Platinum Membership',
       'membership_category_Premium Membership',
       'membership_category_Silver Membership', 'joined_through_referral_No',
       'joined_through_referral_Yes',
       'preferred_offer_types_Credit/Debit Card Offers',
       'preferred_offer_types_Gift Vouchers/Coupons',
       'preferred_offer_types_Without Offers', 'medium_of_operation_Both',
       'medium_of_operation_Desktop', 'medium_of_operation_Smartphone',
       'internet_option_Fiber_Optic', 'internet_option_Mobile_Data',
       'internet_option_Wi-Fi', 'used_special_disco

In [377]:
# Modified dataset shape
newdata.shape

(36992, 49)

# Using Lightgbm algorithm and Pipeline

In [378]:
# By ingoring '-1' class in dependent variable
newdata = newdata[newdata['churn_risk_score']!= -1]
newdata.shape

(35829, 49)

In [379]:
# With Pipeline with power train
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
import lightgbm as lgb

y = newdata['churn_risk_score']
X = newdata.drop('churn_risk_score', axis=1)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size= .30,random_state=1)
pipe = Pipeline((
("pt",PowerTransformer()),
("lgb", lgb.LGBMClassifier()),
))
pipe.fit(Xtrain,ytrain)
print("Training R2")
print(pipe.score(Xtrain,ytrain))
print("Testing R2")
print(pipe.score(Xtest,ytest))

Training R2
0.9440988835725678
Testing R2
0.8037026700158154


# Using Gradient Boosting Classifier algorithm and Pipeline

In [381]:
# By ingoring '-1' class in dependent variable
df3 = df2[df2['churn_risk_score']!= -1]
df3.shape

(35829, 49)

In [382]:
# With Pipeline with power train
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import GradientBoostingClassifier

y = df3['churn_risk_score']
X = df3.drop('churn_risk_score', axis=1)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size= .30,random_state=1)
pipe = Pipeline((
("it",IterativeImputer(estimator=LinearRegression())),
("pt",PowerTransformer()),
("gb", GradientBoostingClassifier()),
))
pipe.fit(Xtrain,ytrain)
print("Training R2")
print(pipe.score(Xtrain,ytrain))
print("Testing R2")
print(pipe.score(Xtest,ytest))

Training R2
0.8163078149920255
Testing R2
0.7871429900455856


In [383]:
# Copied df2 dataframe as df3
df3 = df2.copy()

In [384]:
# '-1' class is considered as '1' class and model is eveluating
df3['churn_risk_score'].replace(-1,1, inplace=True)
df3['churn_risk_score'].value_counts()

3    10424
4    10185
5     9827
1     3815
2     2741
Name: churn_risk_score, dtype: int64

In [385]:
# With Pipeline with power train
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import GradientBoostingClassifier

y = df3['churn_risk_score']
X = df3.drop('churn_risk_score', axis=1)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size= .30,random_state=1)
pipe = Pipeline((
("it",IterativeImputer(estimator=LinearRegression())),
("pt",PowerTransformer()),
("gb", GradientBoostingClassifier()),
))
pipe.fit(Xtrain,ytrain)
print("Training R2")
print(pipe.score(Xtrain,ytrain))
print("Testing R2")
print(pipe.score(Xtest,ytest))

Training R2
0.790105816019155
Testing R2
0.7704991890430708


In [386]:
# Copied df2 dataframe as df3
df4 = df2.copy()

In [387]:
# '-1' class is considered as '1' class and model is eveluating
df4['churn_risk_score'].replace(-1,5, inplace=True)
df4['churn_risk_score'].value_counts()

5    10990
3    10424
4    10185
2     2741
1     2652
Name: churn_risk_score, dtype: int64

In [388]:
# With Pipeline with power train
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier

y = df4['churn_risk_score']
X = df4.drop('churn_risk_score', axis=1)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size= .30,random_state=1)
pipe = Pipeline((
("it",IterativeImputer(estimator=LinearRegression())),
("pt",PowerTransformer()),
("gb", GradientBoostingClassifier()),
))
pipe.fit(Xtrain,ytrain)
print("Training R2")
print(pipe.score(Xtrain,ytrain))
print("Testing R2")
print(pipe.score(Xtest,ytest))

Training R2
0.7946628562601374
Testing R2
0.780140565867724


In [389]:
# Tuning the model by makung '1' class
# With Pipeline with power train
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier

y = df3['churn_risk_score']
X = df3.drop('churn_risk_score', axis=1)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size= .30,random_state=1)
pipe = Pipeline((
("it",IterativeImputer(estimator=LinearRegression())),
("pt",PowerTransformer()),
("gb", GradientBoostingClassifier(max_features='auto', n_estimators=150)),
))
pipe.fit(Xtrain,ytrain)
print("Training R2")
print(pipe.score(Xtrain,ytrain))
print("Testing R2")
print(pipe.score(Xtest,ytest))

Training R2
0.8044334594886846
Testing R2
0.7668949360245089


# Test set EDA and Model Evaluation

In [127]:
test_df = pd.read_csv('test.csv')
test_df.head(2)

Unnamed: 0,customer_id,Name,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,...,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,fffe43004900440031003700300030003400,Alethia Meints,50,F,OQJ1XAY,Village,Premium Membership,2015-11-02,No,xxxxxxxx,...,12,386.26,40721.44,7.0,733.83,Yes,No,No,Not Applicable,Poor Product Quality
1,fffe43004900440031003900370037003300,Ming Lopez,41,M,OUQRPKO,Village,Gold Membership,2016-03-01,No,xxxxxxxx,...,11,37.8,9644.4,9.0,726.0,Yes,No,No,Not Applicable,Poor Website


In [128]:
test_df.shape

(19919, 24)

In [129]:
test_df = test_df.drop(['security_no','Name','referral_id','joining_date','last_visit_time','customer_id'], axis=1)
test_df.head(2)

Unnamed: 0,age,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,50,F,Village,Premium Membership,No,Without Offers,Smartphone,Wi-Fi,12,386.26,40721.44,7.0,733.83,Yes,No,No,Not Applicable,Poor Product Quality
1,41,M,Village,Gold Membership,No,Without Offers,Desktop,Fiber_Optic,11,37.8,9644.4,9.0,726.0,Yes,No,No,Not Applicable,Poor Website


In [130]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19919 entries, 0 to 19918
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           19919 non-null  int64  
 1   gender                        19919 non-null  object 
 2   region_category               16971 non-null  object 
 3   membership_category           19919 non-null  object 
 4   joined_through_referral       19919 non-null  object 
 5   preferred_offer_types         19760 non-null  object 
 6   medium_of_operation           19919 non-null  object 
 7   internet_option               19919 non-null  object 
 8   days_since_last_login         19919 non-null  int64  
 9   avg_time_spent                19919 non-null  float64
 10  avg_transaction_value         19919 non-null  float64
 11  avg_frequency_login_days      19919 non-null  object 
 12  points_in_wallet              17956 non-null  float64
 13  u

In [390]:
test_df['feedback'].value_counts()

No reason specified         3423
Poor Customer Service       3410
Too many ads                3401
Poor Product Quality        3374
Poor Website                3301
Reasonable Price             788
Products always in Stock     778
Quality Customer Care        755
User Friendly Website        689
Name: feedback, dtype: int64

Similar EDA operations are performed on Testset as performed for Trainset and not shown here for all the features.

In [391]:
# Check any unknown values or special symbols for categorical features in dataset
# Replace 'Unknown' with NaN values
test_df['gender'].value_counts()

F    9972
M    9911
Name: gender, dtype: int64

In [392]:
# In this column no unknown or special symbols are found
test_df['region_category'].value_counts()

Town       7635
City       6792
Village    2544
Name: region_category, dtype: int64

In [393]:
# Check any unknown values or special symbols for categorical features in dataset
test_df['membership_category'].value_counts()

No Membership          4123
Basic Membership       4084
Gold Membership        3750
Silver Membership      3199
Platinum Membership    2398
Premium Membership     2365
Name: membership_category, dtype: int64

In [153]:
# Checking Null values in terms of percentage
test_df.isnull().sum() / test_df.shape[0] * 100

age                              0.000000
gender                           0.180732
region_category                 14.799940
membership_category              0.000000
joined_through_referral         15.050956
preferred_offer_types            0.798233
medium_of_operation             14.880265
internet_option                  0.000000
days_since_last_login            0.000000
avg_time_spent                   0.000000
avg_transaction_value            0.000000
avg_frequency_login_days         9.613936
points_in_wallet                 9.854912
used_special_discount            0.000000
offer_application_preference     0.000000
past_complaint                   0.000000
complaint_status                 0.000000
feedback                         0.000000
dtype: float64

In [154]:
test_df.shape

(19919, 18)

In [155]:
test_df2 = pd.get_dummies(test_df)
test_df2.shape

(19919, 48)

In [156]:
test_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19919 entries, 0 to 19918
Data columns (total 48 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   age                                             19919 non-null  int64  
 1   days_since_last_login                           19919 non-null  int64  
 2   avg_time_spent                                  19919 non-null  float64
 3   avg_transaction_value                           19919 non-null  float64
 4   avg_frequency_login_days                        18004 non-null  float64
 5   points_in_wallet                                17956 non-null  float64
 6   gender_F                                        19919 non-null  uint8  
 7   gender_M                                        19919 non-null  uint8  
 8   region_category_City                            19919 non-null  uint8  
 9   region_category_Town                   

In [158]:
#Missing Values Imputation using Iterative Imputer
from sklearn.experimental import enable_iterative_imputer
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

it = IterativeImputer(estimator=LinearRegression())
test_df3 = pd.DataFrame(it.fit_transform(test_df2))
test_df3.columns = test_df2.columns
test_df3.head()

Unnamed: 0,age,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,gender_F,gender_M,region_category_City,region_category_Town,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,50.0,12.0,386.26,40721.44,7.0,733.83,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,41.0,11.0,37.8,9644.4,9.0,726.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,31.0,18.0,215.36,3693.25,21.0,713.78,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,64.0,-999.0,44.57,36809.56,11.0,744.97,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,16.0,6.0,349.88,40675.86,8.0,299.048351,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Evaluation for best model obtained from above Trainset

In [347]:
label = pipe.predict(test_df3)
label.shape

(19919,)

In [348]:
# Converting file into .csv for uploading
df_test = pd.read_csv('test.csv')
sample_submission = df_test[['customer_id']]
sample_submission['churn_risk_score'] = label
sample_submission.to_csv('sample_submission.csv')