In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix




In [2]:

# Load the data
df = pd.read_csv('cleaned_data.csv')

# Drop rows with missing churn_risk_score or handle missing data
df.dropna(subset=['churn_risk_score'], inplace=True)


In [3]:

if df['customer_id'].is_unique:

    df.set_index('customer_id', inplace=True)

    print("Successfully set 'customer_id' as index.")

else:

    print("Cannot set 'customer_id' as index, duplicates still present.")
    


Successfully set 'customer_id' as index.


In [4]:

# Feature encoding for categorical columns
categorical_cols = ['gender', 'region_category',  
                    'joined_through_referral', 'preferred_offer_types', 
                    'medium_of_operation', 'internet_option', 
                    'used_special_discount', 'offer_application_preference', 
                    'past_complaint', 'complaint_status', 'feedback']

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [5]:

# Convert 'joining_date' to datetime and create a 'days_since_joining' feature
df['joining_date'] = pd.to_datetime(df['joining_date'])
df['days_since_joining'] = (pd.to_datetime('today') - df['joining_date']).dt.days
df.drop(columns=['joining_date'], inplace=True)


In [6]:
# ordinal label encoding
membership_mapping = {
    'No Membership': 0,
    'Basic Membership': 1,
    'Silver Membership': 2,
    'Gold Membership': 3,
    'Platinum Membership': 4,
    'Premium Membership':5
}

df['membership_category'] = df['membership_category'].map(membership_mapping)



In [7]:

df.columns


Index(['age', 'membership_category', 'days_since_last_login', 'avg_time_spent',
       'avg_transaction_value', 'avg_frequency_login_days', 'points_in_wallet',
       'churn_risk_score', 'dataset', 'gender_M', 'gender_Unknown',
       'region_category_Town', 'region_category_Unknown',
       'region_category_Village', 'joined_through_referral_Yes',
       'preferred_offer_types_Gift Vouchers/Coupons',
       'preferred_offer_types_Unknown', 'preferred_offer_types_Without Offers',
       'medium_of_operation_Desktop', 'medium_of_operation_Smartphone',
       'medium_of_operation_Unknown', 'internet_option_Mobile_Data',
       'internet_option_Wi-Fi', 'used_special_discount_Yes',
       'offer_application_preference_Yes', 'past_complaint_Yes',
       'complaint_status_Not Applicable', 'complaint_status_Solved',
       'complaint_status_Solved in Follow-up', 'complaint_status_Unsolved',
       'feedback_Poor Customer Service', 'feedback_Poor Product Quality',
       'feedback_Poor Website

In [8]:
df = df[df['churn_risk_score'] != 0]




In [9]:

# select only points_in_wallet, membership_category_encoded, feedback , avg_transaction_value , avg_time_spent , avg_frequency_login_days , age , days_since_last_login 
selected_columns = ['points_in_wallet', 'membership_category', 'feedback_Poor Customer Service', 'feedback_Poor Product Quality',
       'feedback_Poor Website', 'feedback_Products always in Stock',
       'feedback_Quality Customer Care', 'feedback_Reasonable Price',
       'feedback_Too many ads', 'feedback_User Friendly Website', 'avg_transaction_value', 'avg_time_spent', 'avg_frequency_login_days', 'age', 'days_since_last_login']
df_selected = df[selected_columns]
df_selected.columns
print(df_selected.head()) 

                                      points_in_wallet  membership_category  \
customer_id                                                                   
fffe4300490044003600300030003800                781.75                    4   
fffe4300490044003100390032003600                500.69                    0   
fffe43004900440036003000330031003600            567.66                    0   
fffe43004900440031003900350030003600            663.06                    0   
fffe43004900440036003300320035003300            722.27                    3   

                                      feedback_Poor Customer Service  \
customer_id                                                            
fffe4300490044003600300030003800                               False   
fffe4300490044003100390032003600                               False   
fffe43004900440036003000330031003600                           False   
fffe43004900440031003900350030003600                           False   
fffe4300490044

In [10]:

# Separate features and target variable
X = df.drop(columns=['churn_risk_score', 'dataset'])
y = df['churn_risk_score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   #random_state is like setting a seed

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression model (multinomial)
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_pred,y_test))


Accuracy: 0.6280497402801826
              precision    recall  f1-score   support

         1.0       0.60      0.56      0.58       463
         2.0       0.59      0.62      0.60       462
         3.0       0.77      0.77      0.77      1845
         4.0       0.45      0.48      0.47      1812
         5.0       0.69      0.65      0.67      1771

    accuracy                           0.63      6353
   macro avg       0.62      0.62      0.62      6353
weighted avg       0.63      0.63      0.63      6353

[[ 261  175    0    0    0]
 [ 202  287    0    0    0]
 [   0    0 1419  433    0]
 [   0    0  426  864  612]
 [   0    0    0  515 1159]]


In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression model (multinomial)
model = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=1000)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6280497402801826
              precision    recall  f1-score   support

         1.0       0.60      0.56      0.58       463
         2.0       0.59      0.62      0.61       462
         3.0       0.77      0.77      0.77      1845
         4.0       0.45      0.48      0.47      1812
         5.0       0.69      0.65      0.67      1771

    accuracy                           0.63      6353
   macro avg       0.62      0.62      0.62      6353
weighted avg       0.63      0.63      0.63      6353

[[ 261  202    0    0    0]
 [ 174  288    0    0    0]
 [   0    0 1419  426    0]
 [   0    0  433  864  515]
 [   0    0    0  613 1158]]


In [12]:
from imblearn.over_sampling import RandomOverSampler

# Oversample the training set only
oversample = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = oversample.fit_resample(X_train, y_train)

# Check the distribution after oversampling
print(y_train_resampled.value_counts())


churn_risk_score
3.0    7408
5.0    7408
2.0    7408
4.0    7408
1.0    7408
Name: count, dtype: int64


In [13]:
# Logistic Regression with class_weight set to 'balanced'
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

from sklearn.metrics import classification_report, confusion_matrix

# Predictions
y_pred_resampled = model.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred_resampled))
print(confusion_matrix(y_test, y_pred_resampled))


              precision    recall  f1-score   support

         1.0       0.60      0.56      0.58       463
         2.0       0.59      0.62      0.61       462
         3.0       0.77      0.77      0.77      1845
         4.0       0.45      0.48      0.47      1812
         5.0       0.69      0.65      0.67      1771

    accuracy                           0.63      6353
   macro avg       0.62      0.62      0.62      6353
weighted avg       0.63      0.63      0.63      6353

[[ 261  202    0    0    0]
 [ 174  288    0    0    0]
 [   0    0 1419  426    0]
 [   0    0  433  864  515]
 [   0    0    0  613 1158]]


In [14]:
# Logistic Regression with class_weight set to 'balanced'
model = LogisticRegression(C= 0.1,multi_class='multinomial', solver='saga', max_iter=1000)
model.fit(X_train, y_train)


In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions
y_pred_resampled = model.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred_resampled))
print(confusion_matrix(y_test, y_pred_resampled))


              precision    recall  f1-score   support

         1.0       0.60      0.56      0.58       463
         2.0       0.59      0.62      0.60       462
         3.0       0.77      0.77      0.77      1845
         4.0       0.46      0.48      0.47      1812
         5.0       0.69      0.66      0.68      1771

    accuracy                           0.63      6353
   macro avg       0.62      0.62      0.62      6353
weighted avg       0.63      0.63      0.63      6353

[[ 260  203    0    0    0]
 [ 175  287    0    0    0]
 [   0    0 1422  423    0]
 [   0    0  433  863  516]
 [   0    0    0  604 1167]]


In [15]:
# from sklearn.model_selection import GridSearchCV

# # Set the parameters for tuning
# param_grid = {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'saga'], 'class_weight': [None, 'balanced']}

# grid_search = GridSearchCV(LogisticRegression(multi_class='multinomial', max_iter=1000), param_grid, cv=5)
# grid_search.fit(X_train, y_train)

# print(f"Best parameters: {grid_search.best_params_}")


                                      points_in_wallet  membership_category  \
customer_id                                                                   
fffe4300490044003600300030003800                781.75                    4   
fffe4300490044003100390032003600                500.69                    0   
fffe43004900440036003000330031003600            567.66                    0   
fffe43004900440031003900350030003600            663.06                    0   
fffe43004900440036003300320035003300            722.27                    3   

                                      feedback_Poor Customer Service  \
customer_id                                                            
fffe4300490044003600300030003800                               False   
fffe4300490044003100390032003600                               False   
fffe43004900440036003000330031003600                           False   
fffe43004900440031003900350030003600                           False   
fffe4300490044